library(vcfR)library(ggplot2)library(SNPfiltR)library(StAMPP)library(adegenet)v<-read.vcfR("~/Downloads/yhpa_final_filtered.vcf.gz")samps<-read.csv("~/Downloads/Table1.for.manuscript.csv")#make new columnsamps[16,5]<-"SoCal"samps[17,5]<-"SoCal"samps[18,5]<-"SoCal"samps$pop<-samps$Subspecies
0.2 remove single auropalliata sample and then calculate FST between groups
***** Object of Class vcfR *****
45 samples
26 CHROMs
2,274 variants
Object size: 4.2 Mb
7.873 percent missing data
***** ***** *****
Code
#remove invariant SNPsvsub<-min_mac(vsub, 1)
1.14% of SNPs fell below a minor allele count of 1 and were removed from the VCF
Code
vsub
***** Object of Class vcfR *****
45 samples
26 CHROMs
2,248 variants
Object size: 4.2 Mb
7.858 percent missing data
***** ***** *****
Code
#subset sampling file and fix pop IDssamps<-samps[samps$sample_ID %in%colnames(vsub@gt),]samps[16,5]<-"SoCal"samps[17,5]<-"SoCal"samps[18,5]<-"SoCal"#reorder sample file to match vcfsamps<-samps[match(colnames(vsub@gt)[-1],samps$sample_ID),]#convert vcfr to genogen<-vcfR2genlight(vsub)#assign popspop(gen)<-samps$Subspecies#calc FSTx<-stamppFst(gen)#reassignm<-x$Fsts#fill in upper triangle of matrixm[upper.tri(m)] <-t(m)[upper.tri(m)]#melt for plottingheat <- reshape::melt(m)
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
0.3 plot pairwise FST
Code
#reorder factors for plottingplot.order<-c("tresmariae","oratrix (west)","oratrix (east)","belizensis","SoCal","unknown")heat$X1<-factor(heat$X1, levels=plot.order)heat$X2<-factor(heat$X2, levels=plot.order)#plot with labelsggplot(data = heat, aes(x=X1, y=X2, fill=value)) +geom_tile(color ="black", size=.5)+geom_text(data=heat,aes(label=round(value, 2)), size=2.25)+theme_minimal()+scale_fill_gradient2(low ="white", high ="darkgrey", space ="Lab", name="Fst") +theme(axis.text.x =element_text(angle =45, hjust =1),axis.text.y =element_text(angle =45, hjust =1))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_text()`).
Code
#ggsave(file="~/Desktop/YHPA.fst.pdf", units="in",width=4,height=3) #saves g
0.4 use a loop to calculate the p-value associated with each pairwise Fst comparison based on 1K permutations
Code
#set up comparison dataframedf<-data.frame(X1=c("oratrix (west)","oratrix (east)","belizensis","SoCal","unknown","oratrix (east)","belizensis","SoCal","unknown","belizensis","SoCal","unknown","SoCal","unknown","unknown"),X2=c("tresmariae","tresmariae","tresmariae","tresmariae","tresmariae","oratrix (west)","oratrix (west)","oratrix (west)","oratrix (west)","oratrix (east)","oratrix (east)","oratrix (east)","belizensis","belizensis","SoCal"),empirical.Fst=c(rep(NA, times=15)),pval=c(rep(NA, times=15)))for (j in12:nrow(df)){#isolate pops of interest vsub2<-vsub[,c(TRUE,samps$Subspecies == df[j,1] | samps$Subspecies == df[j,2])]colnames(vsub2@gt) vsub2#subset sampling file to only these samples samps.sub<-samps[samps$sample_ID %in%colnames(vsub2@gt),]#calculate empirical FST gen<-vcfR2genlight(vsub2)#assign popspop(gen)<-samps.sub$Subspecies#calc FST x<-stamppFst(gen)#print empirical FST x$Fsts#loop to calculate FST with randomized sample assignment 1K times y<-c() #open empty vector to track results#loop that will randomize sample assignments while maintaining identical sample sizes, and store the resulting informationfor (i in1:1000){#assign pops based on random number generatorpop(gen)<-samps.sub$Subspecies[sample(1:nrow(samps.sub),size =nrow(samps.sub))]#Calc FST and store value y[i]<-stamppFst(gen)$Fsts[2,1]#inform that this iteration is complete#print(paste("iteration",i,"complete")) }#calc p-valuetable(y >= x$Fsts[2,1])mean(y >= x$Fsts[2,1])#make figureplot(density(y),main=paste("1K replicates of randomized sample assignments:\n",df[j,1],"vs.",df[j,2],", p =",round(mean(y >= x$Fsts[2,1]),4)),xlab ="Fst values")polygon(density(y), col ="slateblue1")abline(v=x$Fsts[2,1],lwd=2,lty=2,col="red")#store results in the dataframe df[j,3]<- x$Fsts[2,1] #store Fst df[j,4]<-mean(y >= x$Fsts[2,1]) #store pval#save the results to diskwrite.csv(y, file=paste0("~/Downloads/",df[j,1],"versus",df[j,2],".csv"), row.names = F)}#print the resultsdf
0.5 re-run the analysis using stored data (above chunk only needs to be run once)
Code
#set up comparison dataframedf<-data.frame(X1=c("oratrix (west)","oratrix (east)","belizensis","SoCal","unknown","oratrix (east)","belizensis","SoCal","unknown","belizensis","SoCal","unknown","SoCal","unknown","unknown"),X2=c("tresmariae","tresmariae","tresmariae","tresmariae","tresmariae","oratrix (west)","oratrix (west)","oratrix (west)","oratrix (west)","oratrix (east)","oratrix (east)","oratrix (east)","belizensis","belizensis","SoCal"),empirical.Fst=c(rep(NA, times=15)),pval=c(rep(NA, times=15)))for (j in1:nrow(df)){#isolate pops of interest vsub2<-vsub[,c(TRUE,samps$pop == df[j,1] | samps$pop == df[j,2])]colnames(vsub2@gt) vsub2#subset sampling file to only these samples samps.sub<-samps[samps$sample_ID %in%colnames(vsub2@gt),]#calculate empirical FST gen<-vcfR2genlight(vsub2)#assign popspop(gen)<-samps.sub$pop#calc FST x<-stamppFst(gen)#print empirical FST x$Fsts#read in the precomputed dataframes y<-read.csv(paste0("~/Downloads/",df[j,1],"versus",df[j,2],".csv")) y$x<-round(y$x, 8)#make figureplot(density(y$x),main=paste("1K replicates of randomized sample assignments:\n",df[j,1],"vs.",df[j,2],", p =",round(mean(y$x >=round(x$Fsts[2,1],8)),4)),xlab ="Fst values")polygon(density(y$x), col ="slateblue1")abline(v=x$Fsts[2,1],lwd=2,lty=2,col="red")#store results in the dataframe df[j,3]<- x$Fsts[2,1] #store Fst df[j,4]<-round(mean(y$x >=round(x$Fsts[2,1],8)),4) #store pval}
Because these FST values often just reflect sample size (e.g., comparisons with tresmariae are coming out with high p-values because small sample size in that population makes the probability of getting the accurate empirical sample assignments in the randomization procedure quite high - despite the fact that it is actually the most divergent population!) we chose not to include p-values in this manuscript.
Source Code
---title: "YHPA.FST.pvalues.via.randomization"format: html: code-fold: show code-tools: truetoc: truetoc-title: Document Contentsnumber-sections: trueembed-resources: true---### load R packages; read in vcf and sample sheet```{r}#| output: falselibrary(vcfR)library(ggplot2)library(SNPfiltR)library(StAMPP)library(adegenet)v<-read.vcfR("~/Downloads/yhpa_final_filtered.vcf.gz")samps<-read.csv("~/Downloads/Table1.for.manuscript.csv")#make new columnsamps[16,5]<-"SoCal"samps[17,5]<-"SoCal"samps[18,5]<-"SoCal"samps$pop<-samps$Subspecies```### remove single auropalliata sample and then calculate FST between groups```{r}#remove samplevsub<-v[,colnames(v@gt) !="ao_LSUMZ_39731"]vsub#remove invariant SNPsvsub<-min_mac(vsub, 1)vsub#subset sampling file and fix pop IDssamps<-samps[samps$sample_ID %in%colnames(vsub@gt),]samps[16,5]<-"SoCal"samps[17,5]<-"SoCal"samps[18,5]<-"SoCal"#reorder sample file to match vcfsamps<-samps[match(colnames(vsub@gt)[-1],samps$sample_ID),]#convert vcfr to genogen<-vcfR2genlight(vsub)#assign popspop(gen)<-samps$Subspecies#calc FSTx<-stamppFst(gen)#reassignm<-x$Fsts#fill in upper triangle of matrixm[upper.tri(m)] <-t(m)[upper.tri(m)]#melt for plottingheat <- reshape::melt(m)```### plot pairwise FST```{r}#reorder factors for plottingplot.order<-c("tresmariae","oratrix (west)","oratrix (east)","belizensis","SoCal","unknown")heat$X1<-factor(heat$X1, levels=plot.order)heat$X2<-factor(heat$X2, levels=plot.order)#plot with labelsggplot(data = heat, aes(x=X1, y=X2, fill=value)) +geom_tile(color ="black", size=.5)+geom_text(data=heat,aes(label=round(value, 2)), size=2.25)+theme_minimal()+scale_fill_gradient2(low ="white", high ="darkgrey", space ="Lab", name="Fst") +theme(axis.text.x =element_text(angle =45, hjust =1),axis.text.y =element_text(angle =45, hjust =1))#ggsave(file="~/Desktop/YHPA.fst.pdf", units="in",width=4,height=3) #saves g```### use a loop to calculate the p-value associated with each pairwise Fst comparison based on 1K permutations```{r, eval=FALSE}#set up comparison dataframedf<-data.frame(X1=c("oratrix (west)","oratrix (east)","belizensis","SoCal","unknown","oratrix (east)","belizensis","SoCal","unknown","belizensis","SoCal","unknown","SoCal","unknown","unknown"),X2=c("tresmariae","tresmariae","tresmariae","tresmariae","tresmariae","oratrix (west)","oratrix (west)","oratrix (west)","oratrix (west)","oratrix (east)","oratrix (east)","oratrix (east)","belizensis","belizensis","SoCal"),empirical.Fst=c(rep(NA, times=15)),pval=c(rep(NA, times=15)))for (j in12:nrow(df)){#isolate pops of interest vsub2<-vsub[,c(TRUE,samps$Subspecies == df[j,1] | samps$Subspecies == df[j,2])]colnames(vsub2@gt) vsub2#subset sampling file to only these samples samps.sub<-samps[samps$sample_ID %in%colnames(vsub2@gt),]#calculate empirical FST gen<-vcfR2genlight(vsub2)#assign popspop(gen)<-samps.sub$Subspecies#calc FST x<-stamppFst(gen)#print empirical FST x$Fsts#loop to calculate FST with randomized sample assignment 1K times y<-c() #open empty vector to track results#loop that will randomize sample assignments while maintaining identical sample sizes, and store the resulting informationfor (i in1:1000){#assign pops based on random number generatorpop(gen)<-samps.sub$Subspecies[sample(1:nrow(samps.sub),size =nrow(samps.sub))]#Calc FST and store value y[i]<-stamppFst(gen)$Fsts[2,1]#inform that this iteration is complete#print(paste("iteration",i,"complete")) }#calc p-valuetable(y >= x$Fsts[2,1])mean(y >= x$Fsts[2,1])#make figureplot(density(y),main=paste("1K replicates of randomized sample assignments:\n",df[j,1],"vs.",df[j,2],", p =",round(mean(y >= x$Fsts[2,1]),4)),xlab ="Fst values")polygon(density(y), col ="slateblue1")abline(v=x$Fsts[2,1],lwd=2,lty=2,col="red")#store results in the dataframe df[j,3]<- x$Fsts[2,1] #store Fst df[j,4]<-mean(y >= x$Fsts[2,1]) #store pval#save the results to diskwrite.csv(y, file=paste0("~/Downloads/",df[j,1],"versus",df[j,2],".csv"), row.names = F)}#print the resultsdf```### re-run the analysis using stored data (above chunk only needs to be run once)```{r}#set up comparison dataframedf<-data.frame(X1=c("oratrix (west)","oratrix (east)","belizensis","SoCal","unknown","oratrix (east)","belizensis","SoCal","unknown","belizensis","SoCal","unknown","SoCal","unknown","unknown"),X2=c("tresmariae","tresmariae","tresmariae","tresmariae","tresmariae","oratrix (west)","oratrix (west)","oratrix (west)","oratrix (west)","oratrix (east)","oratrix (east)","oratrix (east)","belizensis","belizensis","SoCal"),empirical.Fst=c(rep(NA, times=15)),pval=c(rep(NA, times=15)))for (j in1:nrow(df)){#isolate pops of interest vsub2<-vsub[,c(TRUE,samps$pop == df[j,1] | samps$pop == df[j,2])]colnames(vsub2@gt) vsub2#subset sampling file to only these samples samps.sub<-samps[samps$sample_ID %in%colnames(vsub2@gt),]#calculate empirical FST gen<-vcfR2genlight(vsub2)#assign popspop(gen)<-samps.sub$pop#calc FST x<-stamppFst(gen)#print empirical FST x$Fsts#read in the precomputed dataframes y<-read.csv(paste0("~/Downloads/",df[j,1],"versus",df[j,2],".csv")) y$x<-round(y$x, 8)#make figureplot(density(y$x),main=paste("1K replicates of randomized sample assignments:\n",df[j,1],"vs.",df[j,2],", p =",round(mean(y$x >=round(x$Fsts[2,1],8)),4)),xlab ="Fst values")polygon(density(y$x), col ="slateblue1")abline(v=x$Fsts[2,1],lwd=2,lty=2,col="red")#store results in the dataframe df[j,3]<- x$Fsts[2,1] #store Fst df[j,4]<-round(mean(y$x >=round(x$Fsts[2,1],8)),4) #store pval}```Because these FST values often just reflect sample size (e.g., comparisons with tresmariae are coming out with high p-values because small sample size in that population makes the probability of getting the accurate empirical sample assignments in the randomization procedure quite high - despite the fact that it is actually the most divergent population!) we chose not to include p-values in this manuscript.