This is SNPfiltR v.1.0.2
Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
If you use SNPfiltR in your published work, please cite the following papers:
DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 22, 2443-2453. http://doi.org/10.1111/1755-0998.13618
Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
Code
library(vcfR)
***** *** vcfR *** *****
This is vcfR 1.15.0
browseVignettes('vcfR') # Documentation
citation('vcfR') # Citation
***** ***** ***** *****
Code
library(ggplot2)library(stringr)library(StAMPP)
Loading required package: pegas
Loading required package: ape
Attaching package: 'pegas'
The following object is masked from 'package:ape':
mst
The following objects are masked from 'package:vcfR':
getINFO, write.vcf
Registered S3 method overwritten by 'ade4':
method from
print.amova pegas
Code
library(adegenet)
Loading required package: ade4
Attaching package: 'ade4'
The following object is masked from 'package:pegas':
amova
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::where() masks ape::where()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(sf)
Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
2 Testing locator - trying to predict locality of ao_MLZ_48333 (Tamaulipas)
2.0.1 Creating vcf for locator
Using filtered vcf with 80% threshold and thinned created using oratrix_snp_filtering.qmd. This vcf corresponds to the samples in the sample data document locator uses to predict localities sample_data_tamaulipas_test.txt.
Code
#removing 7 Belize samples with the most missing data (only keeping 2 samples with the most data for balance)miss<-missing_by_sample(vcfR)
No popmap provided
Code
#we need to remove socal samples (ao_MLZ_1105, ao_MLZ_70063, and ao_MLZ_70074) since we do not know their origin#removing confiscated scp samples because we don't have their localitiestest.vcf<-(vcfR)[, !colnames((vcfR)@gt) %in%c("ao_MLZ_1105", #socal"ao_MLZ_70063", #socal"ao_MLZ_70074", #socal"ao_BC_108", #belize"ao_BC_109", #belize"ao_BC_A113", #belize"ao_BC_A114", #belize"ao_BC_A116", #belize"ao_BC_A117", #belize"ao_BC_A118", #belize"ao_SP_1", #all SP samples - confiscated"ao_SP_2","ao_SP_3","ao_SP_4","ao_SP_5","ao_SP_6","ao_SP_7","ao_SP_8","ao_SP_831","ao_SP_832","ao_SP_833","ao_SP_834","ao_SP_835","ao_SP_836","ao_SP_837","ao_SP_838","ao_SP_839","ao_SP_840")]test.vcf
***** Object of Class vcfR *****
18 samples
26 CHROMs
2,274 variants
Object size: 2.4 Mb
14.34 percent missing data
***** ***** *****
2.0.2 Create popmap and save vcf
Code
#subset popmap to only include retained individualspopmap<-popmap[popmap$id %in%colnames(test.vcf@gt),]#make sure sampling file matches vcfpopmap$id %in%colnames(test.vcf@gt)[-1]
#reorder sampling file to match order of samples in vcfsample.info<-popmap[match(colnames(test.vcf@gt)[-1], popmap$id),]popmap$id ==colnames(test.vcf@gt)[-1]
***** Object of Class vcfR *****
18 samples
26 CHROMs
1,312 variants
Object size: 1.6 Mb
14.32 percent missing data
***** ***** *****
Code
#confirm sample sheet matches the samples in the vcftest_sample_data<-read_delim("~/Dropbox/08_yhpa_072025/final_locator/tamaulipas/sample_data_tamaulipas_final.txt")
Rows: 18 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): sampleID
dbl (2): x, y
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 54 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): sampleID
dbl (2): x, y
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#we need to remove socal samples (ao_MLZ_1105, ao_MLZ_70063, and ao_MLZ_70074) since we do not know their origin#need to downsample Belize again#not trying to predict confiscated scp samples because we don't have their localitiesyhpa.scp.vcf<-(vcfR)[, !colnames((vcfR)@gt) %in%c("ao_BC_108", #belize"ao_BC_109", #belize"ao_BC_A113", #belize"ao_BC_A114", #belize"ao_BC_A116", #belize"ao_BC_A117", #belize"ao_BC_A118", #belize"ao_MLZ_1105", #socal"ao_MLZ_70063", #socal"ao_MLZ_70074")] #socalyhpa.scp.vcf
***** Object of Class vcfR *****
36 samples
26 CHROMs
2,274 variants
Object size: 3.6 Mb
7.636 percent missing data
***** ***** *****
#subset popmap to only include retained individualspopmap<-popmap[popmap$id %in%colnames(yhpa.scp.vcf@gt),]#make sure sampling file matches vcfpopmap$id %in%colnames(yhpa.scp.vcf@gt)[-1]
#reorder sampling file to match order of samples in vcfsample.info<-popmap[match(colnames(yhpa.scp.vcf@gt)[-1], popmap$id),]popmap$id ==colnames(yhpa.scp.vcf@gt)[-1]
***** Object of Class vcfR *****
36 samples
26 CHROMs
1,972 variants
Object size: 3.2 Mb
7.726 percent missing data
***** ***** *****
Code
#confirm sample sheet matches the samples in the vcfyhpa_sample_data<-read.delim("~/Dropbox/08_yhpa_072025/final_locator/yhpa/sample_data_yhpa_final.txt")colnames(yhpa.scp.vcf.mac2@gt)[-1]
#!/bin/bash
#
#SBATCH --job-name=locator_array
#SBATCH --partition=mlz-medium
#SBATCH --nodelist=n028
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --time=24:00:00
#SBATCH --array=1-5
#SBATCH --output=locator_seed_%A_%a.out
# Define seeds
SEEDS=(835 163 321 958 841)
# Pick seed based on SLURM_ARRAY_TASK_ID
SEED=${SEEDS[$SLURM_ARRAY_TASK_ID-1]}
# Run Locator with all your parameters + current seed
locator \
--vcf /home/ramirezb/oratrix_locator/final_data/yhpa.final.vcf.gz \
--sample_data /home/ramirezb/oratrix_locator/final_data/sample_data_yhpa_final.txt \
--out /home/ramirezb/oratrix_locator/final_run_oct2025/out_yhpa/oratrix_seed${SEED} \
--bootstrap \
--nboots 200 \
--train_split 0.8 \
--nlayers 3 \
--seed $SEED
Code
setwd("~/Dropbox/08_yhpa_072025/final_locator/yhpa/out_yhpa")# Get list of all predloc filesyhpa_files <-list.files(pattern ="predlocs.*\\.txt")yhpa_files#remove full runyhpa_files <- yhpa_files[-c(201)]#This is the prediction from the full model trained on all SNPs (no resampling)yhpa_full<-read_delim("oratrix_seed321_bootFULL_predlocs.txt")# Read and bind them - all bootstrapped predictionsyhpa_all_preds <- yhpa_files %>%map_df(~read_delim(.x, delim =",", col_names =TRUE), .id ="file")yhpa_individual_means <- yhpa_all_preds %>%group_by(sampleID) %>%summarize(mean_long =mean(x, na.rm =TRUE),mean_lat =mean(y, na.rm =TRUE))yhpa_summary_stats<-yhpa_all_preds %>%group_by(sampleID) %>%summarize(mean_long =mean(x, na.rm =TRUE),mean_lat =mean(y, na.rm =TRUE),sd_x =sd(x, na.rm =TRUE),sd_y =sd(y, na.rm =TRUE))
5 Plotting Confiscated YHPA Predictions
Code
#showing 95% boostrap ellipse# confiscated_ellipse<-ggplot() +geom_sf(data = cropped_yhpa_map, aes(fill ="map"), color ="black") +geom_sf(data = cropped_mx_states, fill="gray95") +geom_sf(data = cropped_yhpa_shp, aes(fill ="YHPA"), color ="black") +geom_sf(data = cropped_ynpa_shp, fill ="cornsilk2", color ="black") +coord_sf(xlim =c(-109, -86), ylim =c(15.3, 26), expand =FALSE) +scale_fill_manual(values =c("map"="gray95","YHPA"="cornsilk2") ,breaks =c("map","YHPA"),labels =c("map"="outside observed range", "YHPA"="observed range") ) +guides(fill =guide_legend(override.aes =list(shape =c(21, 21), # NA for ellipse (no point)linetype =c(0, 0), # 2 = dashed for ellipsecolour =c("black", "black"), size =8.5 ) )) + ggnewscale::new_scale_fill() +# Ellipse around bootstrapped pointsstat_ellipse(data = yhpa_all_preds, aes(x = y, y = x, group = sampleID, fill ="bootstrapped 95% confidence ellipse"), level =0.95, linewidth =1, linetype =2, alpha =0.5,color ="#008B8B", inherit.aes =FALSE, show.legend =TRUE) +geom_point(data = yhpa_individual_means, aes(x = mean_lat, y = mean_long, fill ="mean of 200 bootstraps"), colour="black", size =10) +geom_point(data = full_test_sample_data, aes(x = y, y = x, fill = Subspecies), colour="black",pch=21, size =10) +scale_color_manual(values =c("bootstrapped 95% confidence ellipse"="#008B8B"),) +scale_fill_manual(values =c("auropalliata"="#fff700","belizensis"="#ff99fd","oratrix (east)"="#ffab00","oratrix (west)"="#1E90FF","tresmariae"="#F75394","mean of 200 bootstraps"="black","bootstrapped 95% confidence ellipse"="#008B8B" ),breaks =c("auropalliata","tresmariae","oratrix (west)","oratrix (east)","belizensis","mean of 200 bootstraps","bootstrapped 95% confidence ellipse" ),labels =c("auropalliata"="*A. auropalliata*","tresmariae"="*A. o. tresmariae*","oratrix (west)"="*A. o. oratrix* (west)","oratrix (east)"="*A. o. oratrix* (east)","belizensis"="*A. o. belizensis*","mean of 200 bootstraps"="mean of bootstrap predictions","bootstrapped 95% confidence ellipse"="bootstrapped 95% confidence ellipse" ) , guide =guide_legend(override.aes =list(size =7.5))) +guides(fill =guide_legend(override.aes =list(shape =c(21, 21, 21, 21, 21, 21, 21), # NA for ellipse (no point)linetype =c(0, 0, 0, 0, 0, 0, 2), # 2 = dashed for ellipsecolour =c("black", "black", "black","black", "black", "black", "#008B8B"), size =9 ) )) +geom_text(data = full_test_sample_data, aes(x = y, y = x, label = group),color ="black",size =6.5, family ="Arial" ) +geom_text(data = locator_test, aes(x = y, y = x, label = group),color ="black",size =6.5, family ="Arial" ) +theme_classic() +theme(legend.text = ggtext::element_markdown(family ="Arial",size =20,color ="black",margin =margin(l =0.5)),legend.title =element_blank(),# legend.position = c(0.95, 0.98),# legend.box = "horizontal",legend.justification =c("right", "top"),axis.title =element_blank() )
#we want to find localities of southern california samples (ao_MLZ_1105, ao_MLZ_70063, and ao_MLZ_70074)#need to downsample Belize again#removing confiscated scp samples because we don't have their localitiessocal.vcf<-(vcfR)[, !colnames((vcfR)@gt) %in%c("ao_BC_108", #belize"ao_BC_109", #belize"ao_BC_A113", #belize"ao_BC_A114", #belize"ao_BC_A116", #belize"ao_BC_A117", #belize"ao_BC_A118", #belize"ao_SP_1", #all SP samples confiscated"ao_SP_2","ao_SP_3","ao_SP_4","ao_SP_5","ao_SP_6","ao_SP_7","ao_SP_8","ao_SP_831","ao_SP_832","ao_SP_833","ao_SP_834","ao_SP_835","ao_SP_836","ao_SP_837","ao_SP_838","ao_SP_839","ao_SP_840")]socal.vcf
***** Object of Class vcfR *****
21 samples
26 CHROMs
2,274 variants
Object size: 2.9 Mb
12.3 percent missing data
***** ***** *****
#subset popmap to only include retained individualspopmap<-popmap[popmap$id %in%colnames(socal.vcf@gt),]#make sure sampling file matches vcfpopmap$id %in%colnames(socal.vcf@gt)[-1]
#reorder sampling file to match order of samples in vcfsample.info<-popmap[match(colnames(socal.vcf@gt)[-1], popmap$id),]popmap$id ==colnames(socal.vcf@gt)[-1]
***** Object of Class vcfR *****
21 samples
26 CHROMs
1,464 variants
Object size: 2.1 Mb
12.33 percent missing data
***** ***** *****
Code
#confirm sample sheet matches the samples in the vcfsocal_sample_data<-read_delim("~/Dropbox/08_yhpa_072025/final_locator/socal/sample_data_socal_final.txt")
Rows: 21 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): sampleID
dbl (2): x, y
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#!/bin/bash
#
#SBATCH --job-name=locator_array
#SBATCH --partition=mlz-unlimited
#SBATCH --nodelist=n029
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --time=24:00:00
#SBATCH --array=1-5
#SBATCH --output=locator_seed_%A_%a.out
# Define seeds
SEEDS=(835 163 321 958 841)
# Pick seed based on SLURM_ARRAY_TASK_ID
SEED=${SEEDS[$SLURM_ARRAY_TASK_ID-1]}
# Run Locator with all your parameters + current seed
locator \
--vcf /home/ramirezb/oratrix_locator/final_data/socal.final.vcf.gz \
--sample_data /home/ramirezb/oratrix_locator/final_data/sample_data_socal_final.txt \
--out /home/ramirezb/oratrix_locator/final_run_oct2025/out_socal/oratrix_seed${SEED} \
--bootstrap \
--nboots 200 \
--train_split 0.7 \
--nlayers 6 \
--seed $SEED
Code
setwd("~/Dropbox/08_yhpa_072025/final_locator/socal/out_socal")# Get list of all predloc filessocal_files <-list.files(pattern ="predlocs.*\\.txt")socal_files#remove full runsocal_files <- socal_files[-c(201)]#This is the prediction from the full model trained on all SNPs (no resampling)socal_full<-read_delim("oratrix_seed321_bootFULL_predlocs.txt")# Read and bind them - all bootstrapped predictionssocal_all_preds <- socal_files %>%map_df(~read_delim(.x, delim =",", col_names =TRUE), .id ="file")socal_individual_means <- socal_all_preds %>%group_by(sampleID) %>%summarize(mean_long =mean(x, na.rm =TRUE),mean_lat =mean(y, na.rm =TRUE))socal_summary_stats<-socal_all_preds %>%group_by(sampleID) %>%summarize(mean_long =mean(x, na.rm =TRUE),mean_lat =mean(y, na.rm =TRUE),sd_x =sd(x, na.rm =TRUE),sd_y =sd(y, na.rm =TRUE))
7 Plotting Rescue Predictions
Code
#showing 95% boostrap ellipse#socal_ellipse<-ggplot() +geom_sf(data = cropped_yhpa_map, aes(fill ="map"), color ="black") +geom_sf(data = cropped_mx_states, fill="gray95") +geom_sf(data = cropped_yhpa_shp, aes(fill ="YHPA"), color ="black") +geom_sf(data = cropped_ynpa_shp, fill ="cornsilk2", color ="black") +coord_sf(xlim =c(-109, -86), ylim =c(15.3, 26), expand =FALSE) +scale_fill_manual(values =c("map"="gray95","YHPA"="cornsilk2") ,breaks =c("map","YHPA"),labels =c("map"="outside observed range", "YHPA"="observed range") ) +guides(fill =guide_legend(override.aes =list(shape =c(21, 21), # NA for ellipse (no point)linetype =c(0, 0), # 2 = dashed for ellipsecolour =c("black", "black"), size =8.5 ) )) + ggnewscale::new_scale_fill() +# Ellipse around bootstrapped pointsstat_ellipse(data = socal_all_preds, aes(x = y, y = x, group = sampleID, fill ="bootstrapped 95% confidence ellipse"),level =0.95, linewidth =1, linetype =2, alpha =0.5, color ="#008B8B",inherit.aes =FALSE, show.legend =TRUE) +geom_point(data = socal_individual_means, aes(x = mean_lat, y = mean_long, fill ="mean of 200 bootstraps"), colour="black", size =10) +geom_point(data = full_test_sample_data, aes(x = y, y = x, fill = Subspecies), colour="black",pch=21, size =10) +scale_color_manual(values =c("bootstrapped 95% confidence ellipse"="#008B8B"), ) +scale_fill_manual(values =c("auropalliata"="#fff700","belizensis"="#ff99fd","oratrix (east)"="#ffab00","oratrix (west)"="#1E90FF","tresmariae"="#F75394","mean of 200 bootstraps"="black","bootstrapped 95% confidence ellipse"="#008B8B" ),breaks =c("auropalliata","tresmariae","oratrix (west)","oratrix (east)","belizensis","mean of 200 bootstraps","bootstrapped 95% confidence ellipse" ),labels =c("auropalliata"="*A. auropalliata*","tresmariae"="*A. o. tresmariae*","oratrix (west)"="*A. o. oratrix* (west)","oratrix (east)"="*A. o. oratrix* (east)","belizensis"="*A. o. belizensis*","mean of 200 bootstraps"="mean of bootstrap predictions","bootstrapped 95% confidence ellipse"="bootstrapped 95% confidence ellipse" ) , guide =guide_legend(override.aes =list(size =7.5))) +guides(fill =guide_legend(override.aes =list(shape =c(21, 21, 21, 21, 21, 21, 21), # NA for ellipse (no point)linetype =c(0, 0, 0, 0, 0, 0, 2), # 2 = dashed for ellipsecolour =c("black", "black", "black", "black", "black", "black", "#008B8B"), size =9 ) )) +geom_text(data = full_test_sample_data, aes(x = y, y = x, label = group),color ="black",size =6.5, family ="Arial" ) +geom_text(data = locator_test, aes(x = y, y = x, label = group),color ="black",size =6.5, family ="Arial" ) +theme_classic() +theme(legend.text = ggtext::element_markdown(family ="Arial",size =20,color ="black",margin =margin(l =0.5)),legend.title =element_blank(),# legend.position = c(0.95, 0.98),# legend.box = "horizontal",legend.justification =c("right", "top"),axis.title =element_blank() )