This is SNPfiltR v.1.0.7
Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
If you use SNPfiltR in your published work, please cite the following papers:
DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 22, 2443-2453. http://doi.org/10.1111/1755-0998.13618
Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
Code
library(vcfR)
***** *** vcfR *** *****
This is vcfR 1.15.0
browseVignettes('vcfR') # Documentation
citation('vcfR') # Citation
***** ***** ***** *****
Code
library(ggplot2)
Warning: package 'ggplot2' was built under R version 4.5.2
Code
library(stringr)library(StAMPP)
Loading required package: pegas
Warning: package 'pegas' was built under R version 4.5.2
Loading required package: ape
Attaching package: 'pegas'
The following object is masked from 'package:ape':
mst
The following objects are masked from 'package:vcfR':
getINFO, write.vcf
Registered S3 method overwritten by 'ade4':
method from
print.amova pegas
Code
library(adegenet)
Loading required package: ade4
Attaching package: 'ade4'
The following object is masked from 'package:pegas':
amova
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::where() masks ape::where()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#reorder sampling file to match order of samples in vcfsample.info<-sample.info[match(colnames(vcfR@gt)[-1], sample.info$Desired_Sequence_Name),]sample.info$Desired_Sequence_Name ==colnames(vcfR@gt)[-1]
#add in locality to sample name for PCA labelpopmap$new_id <-paste(popmap$pop, popmap$id, sep ="_")popmap$new_id <-str_replace_all( popmap$new_id,c("_ao"="","Yoro_Hon"="Honduras","Tabasco_MX"="Tabasco","Oaxaca_MX"="Oaxaca","Chiapas_MX"="Chiapas","Colima_MX"="Colima","Socal_Parrot"="SCP","Tamaulipas_MX"="Tamaulipas","Veracruz_MX"="Veracruz","San Luis Potosi_MX"="San_Luis_Potosi","Nayarit_MX"="Nayarit","Michoacan_MX"="Michoacan","Guerrero_MX"="Guerrero" ))popmap
#outliers: Tamaulipas_MLZ_32244 appears to be contaminated above - removedvcfR <- vcfR[, !colnames(vcfR@gt) %in%c("ao_MLZ_32244")]#subset popmap to only include retained individualspopmap<-popmap[popmap$id %in%colnames(vcfR@gt),]#check missing datamiss<-missing_by_sample(vcfR)
No popmap provided
Code
vcfR.trim<-missing_by_sample(vcfR, cutoff =0.80)
7 samples are above a 0.8 missing data cutoff, and were removed from VCF
1.10 Check how much data is missing by SNP, select a threshold, and save filtered VCF
Code
#check missing by SNPmissing_by_snp(vcfR.trim)
cutoff is not specified, exploratory visualizations will be generated
Warning in ggridges::geom_density_ridges(jittered_points = TRUE, position =
"raincloud", : Ignoring unknown parameters: `size`
#load vcf and popmap# vcfR.trim.80<-read.vcfR("~/Dropbox/08_yhpa_072025/amazona.oratrix.final.filtered.snps.vcf.gz")# popmap<-read.csv("~/Dropbox/08_yhpa_072025/yhpa_popmap.csv")#subset popmap to only include retained individualspopmap<-popmap[popmap$id %in%colnames(vcfR.trim.80@gt),]#read in sample data for subspecies columnyhpa_sample_data<-read.csv("~/Dropbox/08_yhpa_072025/table01_updated_yhpa_sample_sheet.csv")dim(yhpa_sample_data)
[1] 54 13
Code
yhpa_occs<-yhpa_sample_data[c(1:36),]#remove hondurasyhpa_occs<-yhpa_occs[yhpa_occs$Subspecies !="hondurensis",]#changing popmap column names to matchsubspecies_popmap<-popmapcolnames(subspecies_popmap)<-c("sample_ID", "Country_State")subspecies<-left_join(subspecies_popmap, yhpa_occs)
Joining with `by = join_by(sample_ID, Country_State)`
#labeled pca#convert each to genlightgena<-vcfR2genlight(vcfR.trim.80)#perform PCAdi.pca<-glPca(gena, nf=4)#isolate PCA scores as a dataframedi.pca.scores<-as.data.frame(di.pca$scores)#reorder sampling file to match alphabetical orderpopmap<-popmap %>%slice(order(factor(id, levels =rownames(di.pca.scores))))#check that they all matchrownames(di.pca.scores) == popmap$id
#load vcf and popmap# vcfR.trim.80.unlinked<-read.vcfR("~/Dropbox/08_yhpa_072025/2026/snp_filtering/amazona.oratrix.80.unlinked.vcf.gz")# popmap<-read.csv("~/Dropbox/08_yhpa_072025/yhpa_popmap.csv")#subset popmap to only include retained individualspopmap.unlinked<-popmap[popmap$id %in%colnames(vcfR.trim.80.unlinked@gt),]
1.14 Linkage Thinned PCA
Code
#labeled pca#convert each to genlightgena.unlinked<-vcfR2genlight(vcfR.trim.80.unlinked)#perform PCAdi.pca.unlinked<-glPca(gena.unlinked, nf=4)#isolate PCA scores as a dataframedi.pca.unlinked.scores<-as.data.frame(di.pca.unlinked$scores)#reorder sampling file to match alphabetical orderpopmap<-popmap %>%slice(order(factor(id, levels =rownames(di.pca.unlinked.scores))))#check that they all matchrownames(di.pca.unlinked.scores) == popmap.unlinked$id
pop(gena)<-gena@ind.names#Pairwise Nei’s genetic distance between individualssample.div.80<-stamppNeisD(gena, pop =FALSE)#pairwise comparison scatterplotplot(sample.div.80)
Code
#need to rename samples for splitstreerownames(sample.div.80)
#load in sample data for filtered samples with numbered localitieslocality_groups<-read.csv("~/Dropbox/08_yhpa_072025/locality_groups.csv")popmap_locality<-left_join(subspecies_popmap, locality_groups)names(popmap_locality) <-make.names(names(popmap_locality), unique =TRUE)popmap_locality <- popmap_locality %>%mutate(Subspecies =ifelse( Subspecies ==""|is.na(Subspecies),"Unknown", Subspecies ) )popmap_locality
Code
#convert each to genlightgena.east<-vcfR2genlight(east.vcf)#perform PCAdi.pca.east<-glPca(gena.east, nf=4, parallel =TRUE, n.cores =7)#isolate PCA scores as a dataframedi.pca.east.scores<-as.data.frame(di.pca.east$scores)#reorder sampling file to match alphabetical orderpopmap<-popmap %>%slice(order(factor(id, levels =rownames(di.pca.east.scores))))#check that they all matchrownames(di.pca.east.scores) == popmap$id