Code
library(vcfR)
library(ggplot2)
library(SNPfiltR)
library(StAMPP)
library(adegenet)
library(dplyr)
v<-read.vcfR("~/Downloads/yhpa_final_filtered.vcf.gz")
samps<-read.csv("~/Downloads/Table1.for.manuscript.csv")
#make new column
samps$pop<-samps$Subspecies
#subset sampling file and fix pop IDs
samps<-samps[samps$sample_ID %in% colnames(v@gt),]
popmap<-samps
Code
#labeled pca
#convert each to genlight
gena<-vcfR2genlight(v)
#perform PCA
di.pca<-glPca(gena, nf=4)
#isolate PCA scores as a dataframe
di.pca.scores<-as.data.frame(di.pca$scores)
#check if sample order matches in vcf versus sampling table
rownames(di.pca.scores) == popmap$sample_ID 
 [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Code
#reorder sampling file to match alphabetical order
popmap<-popmap %>% slice(order(factor(sample_ID, levels = rownames(di.pca.scores))))
#check that they all match
rownames(di.pca.scores) == popmap$sample_ID 
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[46] TRUE
Code
#add in subspecies labels
di.pca.scores$Subspecies<-popmap$Subspecies
#add in percent variance explained
var_frac <- di.pca$eig/sum(di.pca$eig)
# pca<-
ggplot(di.pca.scores, aes(x=PC1, y=PC2)) +
  geom_point(aes(fill=Subspecies), pch=21, size=4) +
   scale_fill_manual(
    values = c(
      "auropalliata" = "#fff700", 
      "belizensis"  = "#ff99fd",
      # "hondurensis" = "#a76d00",
      "oratrix (east)" = "#ffab00",
      "oratrix (west)" = "#1E90FF",
      "tresmariae" = "#F75394",
      "unknown" = "#008B8B"
  )) +
  xlab(paste0("PC1, ", round(var_frac[1] * 100, 2), "% variance explained")) + 
  ylab(paste0("PC2, ", round(var_frac[2] * 100, 2), "% variance explained")) + 
  theme_classic() + 
  labs(fill = "Subspecies")

0.1 repeat with the thinned SNP dataset

Code
v.thin<-distance_thin(v,1000)

  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |======================================================================| 100%
1598 out of 2274 input SNPs were not located within 1000 base-pairs of another SNP and were retained despite filtering
Code
v.thin
***** Object of Class vcfR *****
46 samples
26 CHROMs
1,598 variants
Object size: 3.2 Mb
8.007 percent missing data
*****        *****         *****
Code
#labeled pca
#convert each to genlight
gena<-vcfR2genlight(v.thin)
#perform PCA
di.pca<-glPca(gena, nf=4)
#isolate PCA scores as a dataframe
di.pca.scores<-as.data.frame(di.pca$scores)
#add in subspecies labels
di.pca.scores$Subspecies<-popmap$Subspecies
#add in percent variance explained
var_frac <- di.pca$eig/sum(di.pca$eig)
# pca<-
ggplot(di.pca.scores, aes(x=PC1, y=PC2)) +
  geom_point(aes(fill=Subspecies), pch=21, size=4) +
   scale_fill_manual(
    values = c(
      "auropalliata" = "#fff700", 
      "belizensis"  = "#ff99fd",
      # "hondurensis" = "#a76d00",
      "oratrix (east)" = "#ffab00",
      "oratrix (west)" = "#1E90FF",
      "tresmariae" = "#F75394",
      "unknown" = "#008B8B"
  )) +
  xlab(paste0("PC1, ", round(var_frac[1] * 100, 2), "% variance explained")) + 
  ylab(paste0("PC2, ", round(var_frac[2] * 100, 2), "% variance explained")) + 
  theme_classic() + 
  labs(fill = "Subspecies")