1 Running Locator on YHPA Samples

1.0.1 Load libraries

Code
library(SNPfiltR)
This is SNPfiltR v.1.0.2

Detailed usage information is available at: devonderaad.github.io/SNPfiltR/ 

If you use SNPfiltR in your published work, please cite the following papers: 

DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 22, 2443-2453. http://doi.org/10.1111/1755-0998.13618 

Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
Code
library(vcfR)

   *****       ***   vcfR   ***       *****
   This is vcfR 1.15.0 
     browseVignettes('vcfR') # Documentation
     citation('vcfR') # Citation
   *****       *****      *****       *****
Code
library(ggplot2)
library(stringr)
library(StAMPP)
Loading required package: pegas
Loading required package: ape

Attaching package: 'pegas'
The following object is masked from 'package:ape':

    mst
The following objects are masked from 'package:vcfR':

    getINFO, write.vcf
Registered S3 method overwritten by 'ade4':
  method      from 
  print.amova pegas
Code
library(adegenet)
Loading required package: ade4

Attaching package: 'ade4'
The following object is masked from 'package:pegas':

    amova

   /// adegenet 2.1.11 is loaded ////////////

   > overview: '?adegenet'
   > tutorials/doc/questions: 'adegenetWeb()' 
   > bug reports/feature requests: adegenetIssues()
Code
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ dplyr::where()  masks ape::where()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(sf)
Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
Code
library(rnaturalearth)
library(geosphere)
library(ggtext)
library(ggnewscale)

1.0.2 Read in vcf and popmap

Code
vcfR<-read.vcfR("~/Dropbox/08_yhpa_072025/amazona.oratrix.final.filtered.snps.vcf.gz")
vcfR

popmap<-read.csv("~/Dropbox/08_yhpa_072025/yhpa_popmap.csv")
popmap<-popmap[popmap$id %in% colnames(vcfR@gt),]
popmap

#make sure sampling file matches vcf
popmap$id %in% colnames(vcfR@gt)[-1]
colnames(vcfR@gt)[-1] %in% popmap$id

2 Testing locator - trying to predict locality of ao_MLZ_48333 (Tamaulipas)

2.0.1 Creating vcf for locator

Using filtered vcf with 80% threshold and thinned created using oratrix_snp_filtering.qmd. This vcf corresponds to the samples in the sample data document locator uses to predict localities sample_data_tamaulipas_test.txt.

Code
#removing 7 Belize samples with the most missing data (only keeping 2 samples with the most data for balance)
miss<-missing_by_sample(vcfR)
No popmap provided

Code
#we need to remove socal samples (ao_MLZ_1105, ao_MLZ_70063, and ao_MLZ_70074) since we do not know their origin
#removing confiscated scp samples because we don't have their localities
test.vcf<-(vcfR)[, !colnames((vcfR)@gt) %in% c("ao_MLZ_1105", #socal
                                               "ao_MLZ_70063", #socal
                                               "ao_MLZ_70074", #socal
                                               "ao_BC_108", #belize
                                               "ao_BC_109", #belize
                                               "ao_BC_A113", #belize
                                               "ao_BC_A114", #belize
                                               "ao_BC_A116", #belize
                                               "ao_BC_A117", #belize
                                               "ao_BC_A118", #belize
                                               "ao_SP_1", #all SP samples - confiscated
                                               "ao_SP_2",
                                               "ao_SP_3",
                                               "ao_SP_4",
                                               "ao_SP_5",
                                               "ao_SP_6",
                                               "ao_SP_7",
                                               "ao_SP_8",
                                               "ao_SP_831",
                                               "ao_SP_832",
                                               "ao_SP_833",
                                               "ao_SP_834",
                                               "ao_SP_835",
                                               "ao_SP_836",
                                               "ao_SP_837",
                                               "ao_SP_838",
                                               "ao_SP_839",
                                               "ao_SP_840")]

test.vcf
***** Object of Class vcfR *****
18 samples
26 CHROMs
2,274 variants
Object size: 2.4 Mb
14.34 percent missing data
*****        *****         *****

2.0.2 Create popmap and save vcf

Code
#subset popmap to only include retained individuals
popmap<-popmap[popmap$id %in% colnames(test.vcf@gt),]

#make sure sampling file matches vcf
popmap$id %in% colnames(test.vcf@gt)[-1]
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE
Code
colnames(test.vcf@gt)[-1] %in% popmap$id
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE
Code
#reorder sampling file to match order of samples in vcf
sample.info<-popmap[match(colnames(test.vcf@gt)[-1], popmap$id),]
popmap$id == colnames(test.vcf@gt)[-1]
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE
Code
test.vcf.mac2<-min_mac(test.vcf, min.mac = 2)

42.3% of SNPs fell below a minor allele count of 2 and were removed from the VCF
Code
# vcfR::write.vcf(test.vcf.mac2, "~/Dropbox/08_yhpa_072025/final_locator/tamaulipas/tamaulipas.final.vcf.gz")

test.vcf.mac2
***** Object of Class vcfR *****
18 samples
26 CHROMs
1,312 variants
Object size: 1.6 Mb
14.32 percent missing data
*****        *****         *****
Code
#confirm sample sheet matches the samples in the vcf
test_sample_data<-read_delim("~/Dropbox/08_yhpa_072025/final_locator/tamaulipas/sample_data_tamaulipas_final.txt")
Rows: 18 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): sampleID
dbl (2): x, y

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
colnames(test_sample_data)<-c("sample_ID", "x", "y")

colnames(test.vcf.mac2@gt)
 [1] "FORMAT"         "ao_BC_107"      "ao_BC_A115"     "ao_LSUMZ_33050"
 [5] "ao_LSUMZ_39731" "ao_LSUMZ_43831" "ao_LSUMZ_43832" "ao_MLZ_35920"  
 [9] "ao_MLZ_40634"   "ao_MLZ_41497"   "ao_MLZ_48333"   "ao_MLZ_50773"  
[13] "ao_MLZ_50774"   "ao_MLZ_50775"   "ao_MLZ_59507"   "ao_UMMZ_103984"
[17] "ao_UMMZ_130517" "ao_UMMZ_95618"  "ao_UMMZ_95619" 
Code
test_sample_data$sample_ID
 [1] "ao_BC_107"      "ao_BC_A115"     "ao_LSUMZ_33050" "ao_LSUMZ_39731"
 [5] "ao_LSUMZ_43831" "ao_LSUMZ_43832" "ao_MLZ_35920"   "ao_MLZ_40634"  
 [9] "ao_MLZ_41497"   "ao_MLZ_48333"   "ao_MLZ_50773"   "ao_MLZ_50774"  
[13] "ao_MLZ_50775"   "ao_MLZ_59507"   "ao_UMMZ_103984" "ao_UMMZ_130517"
[17] "ao_UMMZ_95618"  "ao_UMMZ_95619" 
Code
test_sample_data$sample_ID %in% colnames(test.vcf.mac2@gt)[-1]
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE
Code
colnames(test.vcf.mac2@gt)[-1] %in% test_sample_data$sample_ID
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE

2.0.3 In command line tamaulipas_locator.sh using Occidental College’s HPC Cluster Bletchley

#!/bin/bash
#
#SBATCH --job-name=locator_array
#SBATCH --partition=mlz-medium
#SBATCH --nodelist=n027
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --time=24:00:00
#SBATCH --array=1-5
#SBATCH --output=locator_seed_%A_%a.out

# Define seeds
SEEDS=(835 163 321 958 841)

# Pick seed based on SLURM_ARRAY_TASK_ID
SEED=${SEEDS[$SLURM_ARRAY_TASK_ID-1]}

# Run Locator with all your parameters + current seed
locator \
  --vcf /home/ramirezb/oratrix_locator/final_data/tamaulipas.final.vcf.gz \
  --sample_data /home/ramirezb/oratrix_locator/final_data/sample_data_tamaulipas_final.txt \
  --out /home/ramirezb/oratrix_locator/final_run_oct2025/out_tamaulipas/oratrix_seed${SEED} \
  --bootstrap \
  --nboots 200 \
  --train_split 0.8 \
  --nlayers 3 \
  --seed $SEED

2.0.4 Testing to see if locator predicted the correct locality

Code
full_test_sample_data<-read.csv("~/Dropbox/08_yhpa_072025/oratrix_sample_data.csv")

#all data
all_data<-read_delim("~/Dropbox/08_yhpa_072025/final_locator/all_samples_sample_data.txt")
Rows: 54 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): sampleID
dbl (2): x, y

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
locator_test<-all_data[all_data$sampleID %in% c("ao_MLZ_48333"),]
locator_test$y<-as.numeric(locator_test$y)

locator_test$group<-"7"

2.0.5 Read in shapefiles

Code
cropped_yhpa_map<-st_read("~/Dropbox/08_yhpa_072025/iucn_shp_files/final_shapefiles/cropped_yhpa_map.shp")
Reading layer `cropped_yhpa_map' from data source 
  `/Users/brendaramirez/Library/CloudStorage/Dropbox/08_yhpa_072025/iucn_shp_files/final_shapefiles/cropped_yhpa_map.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 3 features and 168 fields
Geometry type: POLYGON
Dimension:     XY
Bounding box:  xmin: -92.22925 ymin: 15 xmax: -86 ymax: 18.49998
Geodetic CRS:  WGS 84
Code
cropped_mx_states<-st_read("~/Dropbox/08_yhpa_072025/iucn_shp_files/final_shapefiles/cropped_mx_states.shp")
Reading layer `cropped_mx_states' from data source 
  `/Users/brendaramirez/Library/CloudStorage/Dropbox/08_yhpa_072025/iucn_shp_files/final_shapefiles/cropped_mx_states.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 31 features and 121 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -109.45 ymin: 15.2371 xmax: -86.70059 ymax: 26.47262
Geodetic CRS:  WGS 84
Code
cropped_yhpa_shp<-st_read("~/Dropbox/08_yhpa_072025/iucn_shp_files/final_shapefiles/cropped_yhpa.shp")
Reading layer `cropped_yhpa' from data source 
  `/Users/brendaramirez/Library/CloudStorage/Dropbox/08_yhpa_072025/iucn_shp_files/final_shapefiles/cropped_yhpa.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 3 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -106.6964 ymin: 15.43637 xmax: -87.50127 ymax: 24.93329
Geodetic CRS:  WGS 84
Code
cropped_ynpa_shp<-st_read("~/Dropbox/08_yhpa_072025/iucn_shp_files/final_shapefiles/cropped_ynpa.shp")
Reading layer `cropped_ynpa' from data source 
  `/Users/brendaramirez/Library/CloudStorage/Dropbox/08_yhpa_072025/iucn_shp_files/final_shapefiles/cropped_ynpa.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 1 feature and 15 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -94.05951 ymin: 15.24002 xmax: -86 ymax: 16.44794
Geodetic CRS:  WGS 84
Code
ggplot() +
  geom_sf(data = cropped_yhpa_map, fill = "gray95", color = "black") +
  geom_sf(data = cropped_mx_states, fill="gray95") +
  geom_sf(data = cropped_yhpa_shp, fill = "cornsilk2", show.legend = FALSE) +
  geom_sf(data = cropped_ynpa_shp, fill = "cornsilk2", color = "black") +
  coord_sf(xlim = c(-109.45, -86), ylim = c(15.2, 26), expand = FALSE) +
  theme_classic()

2.0.6 Load in the predictions

Code
setwd("~/Dropbox/08_yhpa_072025/final_locator/tamaulipas/out_tamaulipas")
# Get list of all predloc files
files <- list.files(pattern = "predlocs.*\\.txt")
files
#remove full run
files <- files[-c(201)]

#This is the prediction from the full model trained on all SNPs (no resampling)
full<-read_delim("oratrix_seed835_bootFULL_predlocs.txt")

# Read and bind them - all bootstrapped predictions
all_preds <- files %>%
  map_df(~ read_delim(.x, delim = ",", col_names = TRUE), .id = "file")

individual_means <- all_preds %>%
  group_by(sampleID) %>%
  summarize(mean_long = mean(y, na.rm = TRUE),
            mean_lat = mean(x, na.rm = TRUE))

summary_stats<-all_preds %>%
  group_by(sampleID) %>%
  summarize(mean_long = mean(y, na.rm = TRUE),
            mean_lat = mean(x, na.rm = TRUE),
            sd_x = sd(x, na.rm = TRUE),
            sd_y = sd(y, na.rm = TRUE))

#calculate distance the predictions are from the actual locality
df_mean <- locator_test %>%
  inner_join(individual_means, by = "sampleID") %>%
  mutate(
    dist_km_mean = distHaversine(
      cbind(y, x),                 # true location (lon, lat)
      cbind(mean_long, mean_lat)   # mean prediction (lon, lat)
    ) / 1000
  )

df_full <- locator_test %>%
  inner_join(full, by = "sampleID") %>%
  mutate(
    dist_km_full = distHaversine(
      cbind(locator_test$y, locator_test$x),     # true location
      cbind(full$y, full$x)  # full prediction (renamed cols after join)
    ) / 1000
  )

distance_df <- df_mean %>%
  select(sampleID, dist_km_mean) %>%
  left_join(df_full %>% select(sampleID, dist_km_full), by = "sampleID")

distance_df

distance_line_means <- data.frame(
  true_x = locator_test$x,
  true_y = locator_test$y,
  pred_x = individual_means$mean_lat,
  pred_y = individual_means$mean_long
)

distance_line_full <- data.frame(
  true_x = locator_test$x,
  true_y = locator_test$y,
  pred_x = full$x,
  pred_y = full$y
)

3 Plotting ao_MLZ_48333 Prediction

Code
#showing 95% boostrap ellipse
# tamaulipas_ellipse<-
  ggplot() +
    
    geom_sf(data = cropped_yhpa_map, aes(fill = "map"), color = "black") +
    geom_sf(data = cropped_mx_states, fill="gray95") +
    geom_sf(data = cropped_yhpa_shp, aes(fill = "YHPA"), color = "black") +
    geom_sf(data = cropped_ynpa_shp, fill = "cornsilk2", color = "black") +
    coord_sf(xlim = c(-109, -86), ylim = c(15.3, 26), expand = FALSE) +
  
  scale_fill_manual(
    values = c("map" = "gray95",
               "YHPA" = "cornsilk2")
    ,
    breaks = c("map",
               "YHPA"),
    
    labels = c("map" = "outside observed range", "YHPA" = "observed range")
  ) +
  
  guides(
  fill = guide_legend(
    override.aes = list(
      shape = c(21, 21),    # NA for ellipse (no point)
      linetype = c(0, 0),        # 2 = dashed for ellipse
      colour = c("black", "black"),              
      size = 8.5
    )
  )
) +
  
  ggnewscale::new_scale_fill() +
    
  # Ellipse around bootstrapped points
  stat_ellipse(data = all_preds, 
               aes(x = y, y = x, 
                   group = sampleID, 
                   fill = "bootstrapped 95% confidence ellipse"), 
                   level = 0.95, 
                   linewidth = 1, 
                   linetype = 2, 
                   alpha = 0.5,
                   color = "gray40", 
                   inherit.aes = FALSE, 
                   show.legend = TRUE) +
    
  geom_point(data = individual_means, aes(x = mean_long, y = mean_lat, fill = "mean of 200 bootstraps"), 
       colour="black", pch=21, size = 10) +
    
  geom_point(data = full_test_sample_data, aes(x = y, y = x, fill = Subspecies), 
       colour="black", pch=21, size = 10) +
    
  geom_point(data = locator_test, aes(x = y, y = x, fill = "ao_MLZ_48333"),
       colour="black", pch=21, size = 10) +
    
  #adding in distance between true locality and prediction  
   #mean prediction: 361.0916   km
   geom_segment(data = distance_line_means,
               aes(x = true_y, y = true_x, xend = pred_y, yend = pred_x),
               color = "gray10", linetype = "dashed") +

  scale_color_manual(
  values = c("bootstrapped 95% confidence ellipse" = "gray20")
) +
  
scale_fill_manual(
  values = c(
    "auropalliata" = "#fff700",
    "belizensis" = "#ff99fd",
    "oratrix (east)" = "#ffab00",
    "oratrix (west)" = "#1E90FF",
    "tresmariae" = "#F75394",
    "ao_MLZ_48333" ="gray40",
    "mean of 200 bootstraps" = "black",
    "bootstrapped 95% confidence ellipse" = "gray20"
  ),
  breaks = c(
    "auropalliata",
    "tresmariae",
    "oratrix (west)",
    "oratrix (east)",
    "belizensis",
    "ao_MLZ_48333",
    "mean of 200 bootstraps",
    "bootstrapped 95% confidence ellipse"
  ),
  labels = c(
    "auropalliata" = "*A. auropalliata*",
    "tresmariae" = "*A. o. tresmariae*",
    "oratrix (west)" = "*A. o. oratrix* (west)",
    "oratrix (east)" = "*A. o. oratrix* (east)",
    "belizensis" = "*A. o. belizensis*",
    "ao_MLZ_48333" = "ao_MLZ_48333",
    "mean of 200 bootstraps" = "mean of bootstrap predictions",
    "bootstrapped 95% confidence ellipse" = "bootstrapped 95% confidence ellipse"
  )
  , guide = guide_legend(override.aes = list(size = 7.5))
) +
    
  guides(
    fill = guide_legend(
    override.aes = list(
      shape = c(21, 21, 21, 21, 21, 21, 21, 21),
      linetype = c(0, 0, 0, 0, 0, 0, 0, 2), 
      colour = c("black", "black", "black", "black", "black", "black", "black", "gray50"),       
      size = 9
    )
  )
) +
  
  geom_text(data = full_test_sample_data, aes(x = y, y = x, label = group),
  color = "black",
  size = 6.5, 
  family = "Arial"
  ) +
    
  geom_text(data = locator_test, aes(x = y, y = x, label = group),
  color = "black",
  size = 6.5, 
  family = "Arial"
  ) +
    
  theme_classic() +
  theme(legend.text = ggtext::element_markdown(
        family = "Arial",
        size = 20,
        color = "black",
        margin = margin(l = 0.5)),
        legend.title = element_blank(),
        # legend.position = c(0.95, 0.98),
        # legend.box = "horizontal",
        legend.justification = c("right", "top"),
        axis.title = element_blank()
    )
Code
# ggsave(tamaulipas_ellipse,
#       # filename="~/Dropbox/08_yhpa_072025/final_locator/cropped_figures/YHPA.figS1.updated.svg",
#       # device="svg",
#       filename="~/Dropbox/08_yhpa_072025/final_locator/cropped_figures/YHPA.figS1.updated.png",
#       device="png",
#       dpi = 400, width = 15, height = 9)
Code
knitr::include_graphics("~/Dropbox/08_yhpa_072025/final_figures/YHPA.figS1.updated.png")


4 Predicting Confiscated YHPA samples

Code
#we need to remove socal samples (ao_MLZ_1105, ao_MLZ_70063, and ao_MLZ_70074) since we do not know their origin
#need to downsample Belize again
#not trying to predict confiscated scp samples because we don't have their localities
yhpa.scp.vcf<-(vcfR)[, !colnames((vcfR)@gt) %in% c("ao_BC_108", #belize
                                                   "ao_BC_109", #belize
                                                   "ao_BC_A113", #belize
                                                   "ao_BC_A114", #belize
                                                   "ao_BC_A116", #belize
                                                   "ao_BC_A117", #belize
                                                   "ao_BC_A118", #belize
                                                   "ao_MLZ_1105", #socal
                                                   "ao_MLZ_70063", #socal
                                                   "ao_MLZ_70074")] #socal

yhpa.scp.vcf
***** Object of Class vcfR *****
36 samples
26 CHROMs
2,274 variants
Object size: 3.6 Mb
7.636 percent missing data
*****        *****         *****
Code
popmap<-read.csv("~/Dropbox/08_yhpa_072025/yhpa_popmap.csv")
popmap
               id                pop
1   ao_ANSP_90568           Yoro_Hon
2       ao_BC_107             Belize
3       ao_BC_108             Belize
4       ao_BC_109             Belize
5      ao_BC_A112             Belize
6      ao_BC_A113             Belize
7      ao_BC_A114             Belize
8      ao_BC_A115             Belize
9      ao_BC_A116             Belize
10     ao_BC_A117             Belize
11     ao_BC_A118             Belize
12 ao_LSUMZ_23890         Tabasco_MX
13 ao_LSUMZ_33050          Oaxaca_MX
14 ao_LSUMZ_39731         Chiapas_MX
15 ao_LSUMZ_43831          Colima_MX
16 ao_LSUMZ_43832          Colima_MX
17    ao_MLZ_1105       Socal_Parrot
18   ao_MLZ_35920        Veracruz_MX
19   ao_MLZ_39530      Tamaulipas_MX
20   ao_MLZ_40633      Tamaulipas_MX
21   ao_MLZ_40634      Tamaulipas_MX
22   ao_MLZ_40635      Tamaulipas_MX
23   ao_MLZ_41497 San Luis Potosi_MX
24   ao_MLZ_45517          Oaxaca_MX
25   ao_MLZ_48333      Tamaulipas_MX
26   ao_MLZ_50773        Guerrero_MX
27   ao_MLZ_50774        Guerrero_MX
28   ao_MLZ_50775        Guerrero_MX
29   ao_MLZ_59507        Veracruz_MX
30   ao_MLZ_70063       Socal_Parrot
31   ao_MLZ_70074       Socal_Parrot
32        ao_SP_1       Socal_Parrot
33        ao_SP_2       Socal_Parrot
34        ao_SP_3       Socal_Parrot
35        ao_SP_4       Socal_Parrot
36        ao_SP_5       Socal_Parrot
37        ao_SP_6       Socal_Parrot
38        ao_SP_7       Socal_Parrot
39        ao_SP_8       Socal_Parrot
40      ao_SP_831       Socal_Parrot
41      ao_SP_832       Socal_Parrot
42      ao_SP_833       Socal_Parrot
43      ao_SP_834       Socal_Parrot
44      ao_SP_835       Socal_Parrot
45      ao_SP_836       Socal_Parrot
46      ao_SP_837       Socal_Parrot
47      ao_SP_838       Socal_Parrot
48      ao_SP_839       Socal_Parrot
49      ao_SP_840       Socal_Parrot
50 ao_UMMZ_103984         Tabasco_MX
51 ao_UMMZ_130517       Michoacan_MX
52  ao_UMMZ_95618         Nayarit_MX
53  ao_UMMZ_95619         Nayarit_MX
Code
#subset popmap to only include retained individuals
popmap<-popmap[popmap$id %in% colnames(yhpa.scp.vcf@gt),]

#make sure sampling file matches vcf
popmap$id %in% colnames(yhpa.scp.vcf@gt)[-1]
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE TRUE
Code
colnames(yhpa.scp.vcf@gt)[-1] %in% popmap$id
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE TRUE
Code
#reorder sampling file to match order of samples in vcf
sample.info<-popmap[match(colnames(yhpa.scp.vcf@gt)[-1], popmap$id),]
popmap$id == colnames(yhpa.scp.vcf@gt)[-1]
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE TRUE
Code
yhpa.scp.vcf.mac2<-min_mac(yhpa.scp.vcf, min.mac = 2)

13.28% of SNPs fell below a minor allele count of 2 and were removed from the VCF
Code
# vcfR::write.vcf(yhpa.scp.vcf.mac2, "~/Dropbox/08_yhpa_072025/final_locator/yhpa/yhpa.final.vcf.gz")

yhpa.scp.vcf.mac2
***** Object of Class vcfR *****
36 samples
26 CHROMs
1,972 variants
Object size: 3.2 Mb
7.726 percent missing data
*****        *****         *****
Code
#confirm sample sheet matches the samples in the vcf
yhpa_sample_data<-read.delim("~/Dropbox/08_yhpa_072025/final_locator/yhpa/sample_data_yhpa_final.txt")

colnames(yhpa.scp.vcf.mac2@gt)[-1]
 [1] "ao_BC_107"      "ao_BC_A115"     "ao_LSUMZ_33050" "ao_LSUMZ_39731"
 [5] "ao_LSUMZ_43831" "ao_LSUMZ_43832" "ao_MLZ_35920"   "ao_MLZ_40634"  
 [9] "ao_MLZ_41497"   "ao_MLZ_48333"   "ao_MLZ_50773"   "ao_MLZ_50774"  
[13] "ao_MLZ_50775"   "ao_MLZ_59507"   "ao_SP_1"        "ao_SP_2"       
[17] "ao_SP_3"        "ao_SP_4"        "ao_SP_5"        "ao_SP_6"       
[21] "ao_SP_7"        "ao_SP_8"        "ao_SP_831"      "ao_SP_832"     
[25] "ao_SP_833"      "ao_SP_834"      "ao_SP_835"      "ao_SP_836"     
[29] "ao_SP_837"      "ao_SP_838"      "ao_SP_839"      "ao_SP_840"     
[33] "ao_UMMZ_103984" "ao_UMMZ_130517" "ao_UMMZ_95618"  "ao_UMMZ_95619" 
Code
yhpa_sample_data$sampleID
 [1] "ao_BC_107"      "ao_BC_A115"     "ao_LSUMZ_33050" "ao_LSUMZ_39731"
 [5] "ao_LSUMZ_43831" "ao_LSUMZ_43832" "ao_MLZ_35920"   "ao_MLZ_40634"  
 [9] "ao_MLZ_41497"   "ao_MLZ_48333"   "ao_MLZ_50773"   "ao_MLZ_50774"  
[13] "ao_MLZ_50775"   "ao_MLZ_59507"   "ao_SP_1"        "ao_SP_2"       
[17] "ao_SP_3"        "ao_SP_4"        "ao_SP_5"        "ao_SP_6"       
[21] "ao_SP_7"        "ao_SP_8"        "ao_SP_831"      "ao_SP_832"     
[25] "ao_SP_833"      "ao_SP_834"      "ao_SP_835"      "ao_SP_836"     
[29] "ao_SP_837"      "ao_SP_838"      "ao_SP_839"      "ao_SP_840"     
[33] "ao_UMMZ_103984" "ao_UMMZ_130517" "ao_UMMZ_95618"  "ao_UMMZ_95619" 
Code
yhpa_sample_data$sampleID %in% colnames(yhpa.scp.vcf.mac2@gt)[-1]
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE TRUE
Code
colnames(yhpa.scp.vcf.mac2@gt)[-1] %in% yhpa_sample_data$sampleID 
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE TRUE

In Bletchley run yhpa_locator.sh

#!/bin/bash
#
#SBATCH --job-name=locator_array
#SBATCH --partition=mlz-medium
#SBATCH --nodelist=n028
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --time=24:00:00
#SBATCH --array=1-5
#SBATCH --output=locator_seed_%A_%a.out

# Define seeds
SEEDS=(835 163 321 958 841)

# Pick seed based on SLURM_ARRAY_TASK_ID
SEED=${SEEDS[$SLURM_ARRAY_TASK_ID-1]}

# Run Locator with all your parameters + current seed
locator \
  --vcf /home/ramirezb/oratrix_locator/final_data/yhpa.final.vcf.gz \
  --sample_data /home/ramirezb/oratrix_locator/final_data/sample_data_yhpa_final.txt \
  --out /home/ramirezb/oratrix_locator/final_run_oct2025/out_yhpa/oratrix_seed${SEED} \
  --bootstrap \
  --nboots 200 \
  --train_split 0.8 \
  --nlayers 3 \
  --seed $SEED
Code
setwd("~/Dropbox/08_yhpa_072025/final_locator/yhpa/out_yhpa")
# Get list of all predloc files
yhpa_files <- list.files(pattern = "predlocs.*\\.txt")
yhpa_files
#remove full run
yhpa_files <- yhpa_files[-c(201)]

#This is the prediction from the full model trained on all SNPs (no resampling)
yhpa_full<-read_delim("oratrix_seed321_bootFULL_predlocs.txt")

# Read and bind them - all bootstrapped predictions
yhpa_all_preds <- yhpa_files %>%
  map_df(~ read_delim(.x, delim = ",", col_names = TRUE), .id = "file")

yhpa_individual_means <- yhpa_all_preds %>%
  group_by(sampleID) %>%
  summarize(mean_long = mean(x, na.rm = TRUE),
            mean_lat = mean(y, na.rm = TRUE))

yhpa_summary_stats<-yhpa_all_preds %>%
  group_by(sampleID) %>%
  summarize(mean_long = mean(x, na.rm = TRUE),
            mean_lat = mean(y, na.rm = TRUE),
            sd_x = sd(x, na.rm = TRUE),
            sd_y = sd(y, na.rm = TRUE))

5 Plotting Confiscated YHPA Predictions

Code
#showing 95% boostrap ellipse
# confiscated_ellipse<-
ggplot() +
  
    geom_sf(data = cropped_yhpa_map, aes(fill = "map"), color = "black") +
    geom_sf(data = cropped_mx_states, fill="gray95") +
    geom_sf(data = cropped_yhpa_shp, aes(fill = "YHPA"), color = "black") +
    geom_sf(data = cropped_ynpa_shp, fill = "cornsilk2", color = "black") +
    coord_sf(xlim = c(-109, -86), ylim = c(15.3, 26), expand = FALSE) +
  
  scale_fill_manual(
    values = c("map" = "gray95",
               "YHPA" = "cornsilk2")
    ,
    breaks = c("map",
               "YHPA"),
    
    labels = c("map" = "outside observed range", "YHPA" = "observed range")
  ) +
  
  guides(
  fill = guide_legend(
    override.aes = list(
      shape = c(21, 21),    # NA for ellipse (no point)
      linetype = c(0, 0),        # 2 = dashed for ellipse
      colour = c("black", "black"),              
      size = 8.5
    )
  )
) +
  
  ggnewscale::new_scale_fill() +
  
  # Ellipse around bootstrapped points
  stat_ellipse(data = yhpa_all_preds, 
               aes(x = y, y = x, 
                   group = sampleID, 
                   fill = "bootstrapped 95% confidence ellipse"), 
                   level = 0.95, 
                   linewidth = 1, 
                   linetype = 2, 
                   alpha = 0.5,
                   color = "#008B8B", 
                   inherit.aes = FALSE, 
                   show.legend = TRUE) +
  
  geom_point(data = yhpa_individual_means, aes(x = mean_lat, y = mean_long, fill = "mean of 200 bootstraps"), 
       colour="black", size = 10) +

  geom_point(data = full_test_sample_data, aes(x = y, y = x, fill = Subspecies), 
       colour="black",pch=21, size = 10) +
  
scale_color_manual(
  values = c("bootstrapped 95% confidence ellipse" = "#008B8B"),
) +

  scale_fill_manual(
  values = c(
    "auropalliata" = "#fff700",
    "belizensis" = "#ff99fd",
    "oratrix (east)" = "#ffab00",
    "oratrix (west)" = "#1E90FF",
    "tresmariae" = "#F75394",
    "mean of 200 bootstraps" = "black",
    "bootstrapped 95% confidence ellipse" = "#008B8B"
  ),
  breaks = c(
    "auropalliata",
    "tresmariae",
    "oratrix (west)",
    "oratrix (east)",
    "belizensis",
    "mean of 200 bootstraps",
    "bootstrapped 95% confidence ellipse"
  ),
  labels = c(
    "auropalliata" = "*A. auropalliata*",
    "tresmariae" = "*A. o. tresmariae*",
    "oratrix (west)" = "*A. o. oratrix* (west)",
    "oratrix (east)" = "*A. o. oratrix* (east)",
    "belizensis" = "*A. o. belizensis*",
    "mean of 200 bootstraps" = "mean of bootstrap predictions",
    "bootstrapped 95% confidence ellipse" = "bootstrapped 95% confidence ellipse"
  )
  , guide = guide_legend(override.aes = list(size = 7.5))
) +
  
guides(
  fill = guide_legend(
    override.aes = list(
      shape = c(21, 21, 21, 21, 21, 21, 21),    # NA for ellipse (no point)
      linetype = c(0, 0, 0, 0, 0, 0, 2),        # 2 = dashed for ellipse
      colour = c("black", "black", "black","black", "black", "black", "#008B8B"),              
      size = 9
    )
  )
) +
  
  geom_text(data = full_test_sample_data, aes(x = y, y = x, label = group),
  color = "black",
  size = 6.5, 
  family = "Arial"
  ) +
    
  geom_text(data = locator_test, aes(x = y, y = x, label = group),
  color = "black",
  size = 6.5, 
  family = "Arial"
  ) +

  theme_classic() +
  theme(legend.text = ggtext::element_markdown(
        family = "Arial",
        size = 20,
        color = "black",
        margin = margin(l = 0.5)),
        legend.title = element_blank(),
        # legend.position = c(0.95, 0.98),
        # legend.box = "horizontal",
        legend.justification = c("right", "top"),
        axis.title = element_blank()
    )
Code
# ggsave(
#        confiscated_ellipse,
#        filename="~/Dropbox/08_yhpa_072025/final_locator/cropped_figures/YHPA.fig3A.updated.svg",
#        device="svg",
#        # filename="~/Dropbox/08_yhpa_072025/final_locator/cropped_figures/YHPA.fig3A.updated.png",
#        # device="png",
#        dpi = 400, width = 15, height = 9)
Code
knitr::include_graphics("~/Dropbox/08_yhpa_072025/final_figures/YHPA.fig3A.updated.png")


6 Predicting southern California Rescue Samples

Code
#we want to find localities of southern california samples (ao_MLZ_1105, ao_MLZ_70063, and ao_MLZ_70074)
#need to downsample Belize again
#removing confiscated scp samples because we don't have their localities
socal.vcf<-(vcfR)[, !colnames((vcfR)@gt) %in% c("ao_BC_108", #belize
                                                "ao_BC_109", #belize
                                                "ao_BC_A113", #belize
                                                "ao_BC_A114", #belize
                                                "ao_BC_A116", #belize
                                                "ao_BC_A117", #belize
                                                "ao_BC_A118", #belize
                                                "ao_SP_1", #all SP samples confiscated
                                                "ao_SP_2",
                                                "ao_SP_3",
                                                "ao_SP_4",
                                                "ao_SP_5",
                                                "ao_SP_6",
                                                "ao_SP_7",
                                                "ao_SP_8",
                                                "ao_SP_831",
                                                "ao_SP_832",
                                                "ao_SP_833",
                                                "ao_SP_834",
                                                "ao_SP_835",
                                                "ao_SP_836",
                                                "ao_SP_837",
                                                "ao_SP_838",
                                                "ao_SP_839",
                                                "ao_SP_840")]

socal.vcf
***** Object of Class vcfR *****
21 samples
26 CHROMs
2,274 variants
Object size: 2.9 Mb
12.3 percent missing data
*****        *****         *****
Code
popmap<-read.csv("~/Dropbox/08_yhpa_072025/yhpa_popmap.csv")
popmap
               id                pop
1   ao_ANSP_90568           Yoro_Hon
2       ao_BC_107             Belize
3       ao_BC_108             Belize
4       ao_BC_109             Belize
5      ao_BC_A112             Belize
6      ao_BC_A113             Belize
7      ao_BC_A114             Belize
8      ao_BC_A115             Belize
9      ao_BC_A116             Belize
10     ao_BC_A117             Belize
11     ao_BC_A118             Belize
12 ao_LSUMZ_23890         Tabasco_MX
13 ao_LSUMZ_33050          Oaxaca_MX
14 ao_LSUMZ_39731         Chiapas_MX
15 ao_LSUMZ_43831          Colima_MX
16 ao_LSUMZ_43832          Colima_MX
17    ao_MLZ_1105       Socal_Parrot
18   ao_MLZ_35920        Veracruz_MX
19   ao_MLZ_39530      Tamaulipas_MX
20   ao_MLZ_40633      Tamaulipas_MX
21   ao_MLZ_40634      Tamaulipas_MX
22   ao_MLZ_40635      Tamaulipas_MX
23   ao_MLZ_41497 San Luis Potosi_MX
24   ao_MLZ_45517          Oaxaca_MX
25   ao_MLZ_48333      Tamaulipas_MX
26   ao_MLZ_50773        Guerrero_MX
27   ao_MLZ_50774        Guerrero_MX
28   ao_MLZ_50775        Guerrero_MX
29   ao_MLZ_59507        Veracruz_MX
30   ao_MLZ_70063       Socal_Parrot
31   ao_MLZ_70074       Socal_Parrot
32        ao_SP_1       Socal_Parrot
33        ao_SP_2       Socal_Parrot
34        ao_SP_3       Socal_Parrot
35        ao_SP_4       Socal_Parrot
36        ao_SP_5       Socal_Parrot
37        ao_SP_6       Socal_Parrot
38        ao_SP_7       Socal_Parrot
39        ao_SP_8       Socal_Parrot
40      ao_SP_831       Socal_Parrot
41      ao_SP_832       Socal_Parrot
42      ao_SP_833       Socal_Parrot
43      ao_SP_834       Socal_Parrot
44      ao_SP_835       Socal_Parrot
45      ao_SP_836       Socal_Parrot
46      ao_SP_837       Socal_Parrot
47      ao_SP_838       Socal_Parrot
48      ao_SP_839       Socal_Parrot
49      ao_SP_840       Socal_Parrot
50 ao_UMMZ_103984         Tabasco_MX
51 ao_UMMZ_130517       Michoacan_MX
52  ao_UMMZ_95618         Nayarit_MX
53  ao_UMMZ_95619         Nayarit_MX
Code
#subset popmap to only include retained individuals
popmap<-popmap[popmap$id %in% colnames(socal.vcf@gt),]

#make sure sampling file matches vcf
popmap$id %in% colnames(socal.vcf@gt)[-1]
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE
Code
colnames(socal.vcf@gt)[-1] %in% popmap$id
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE
Code
#reorder sampling file to match order of samples in vcf
sample.info<-popmap[match(colnames(socal.vcf@gt)[-1], popmap$id),]
popmap$id == colnames(socal.vcf@gt)[-1]
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE
Code
socal.vcf.mac2<-min_mac(socal.vcf, min.mac = 2)

35.62% of SNPs fell below a minor allele count of 2 and were removed from the VCF
Code
# vcfR::write.vcf(socal.vcf, "~/Dropbox/08_yhpa_072025/final_locator/socal/socal.final.vcf.gz")

socal.vcf.mac2
***** Object of Class vcfR *****
21 samples
26 CHROMs
1,464 variants
Object size: 2.1 Mb
12.33 percent missing data
*****        *****         *****
Code
#confirm sample sheet matches the samples in the vcf
socal_sample_data<-read_delim("~/Dropbox/08_yhpa_072025/final_locator/socal/sample_data_socal_final.txt")
Rows: 21 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): sampleID
dbl (2): x, y

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
colnames(socal.vcf.mac2@gt)[-1]
 [1] "ao_BC_107"      "ao_BC_A115"     "ao_LSUMZ_33050" "ao_LSUMZ_39731"
 [5] "ao_LSUMZ_43831" "ao_LSUMZ_43832" "ao_MLZ_1105"    "ao_MLZ_35920"  
 [9] "ao_MLZ_40634"   "ao_MLZ_41497"   "ao_MLZ_48333"   "ao_MLZ_50773"  
[13] "ao_MLZ_50774"   "ao_MLZ_50775"   "ao_MLZ_59507"   "ao_MLZ_70063"  
[17] "ao_MLZ_70074"   "ao_UMMZ_103984" "ao_UMMZ_130517" "ao_UMMZ_95618" 
[21] "ao_UMMZ_95619" 
Code
socal_sample_data$sampleID
 [1] "ao_BC_107"      "ao_BC_A115"     "ao_LSUMZ_33050" "ao_LSUMZ_39731"
 [5] "ao_LSUMZ_43831" "ao_LSUMZ_43832" "ao_MLZ_1105"    "ao_MLZ_35920"  
 [9] "ao_MLZ_40634"   "ao_MLZ_41497"   "ao_MLZ_48333"   "ao_MLZ_50773"  
[13] "ao_MLZ_50774"   "ao_MLZ_50775"   "ao_MLZ_59507"   "ao_MLZ_70063"  
[17] "ao_MLZ_70074"   "ao_UMMZ_103984" "ao_UMMZ_130517" "ao_UMMZ_95618" 
[21] "ao_UMMZ_95619" 
Code
socal_sample_data$sampleID %in% colnames(socal.vcf.mac2@gt)[-1]
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE
Code
colnames(socal.vcf.mac2@gt)[-1] %in% socal_sample_data$sampleID
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE

In Bletchley run final_locator_socal.sh

#!/bin/bash
#
#SBATCH --job-name=locator_array
#SBATCH --partition=mlz-unlimited
#SBATCH --nodelist=n029
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --time=24:00:00
#SBATCH --array=1-5
#SBATCH --output=locator_seed_%A_%a.out

# Define seeds
SEEDS=(835 163 321 958 841)

# Pick seed based on SLURM_ARRAY_TASK_ID
SEED=${SEEDS[$SLURM_ARRAY_TASK_ID-1]}

# Run Locator with all your parameters + current seed
locator \
  --vcf /home/ramirezb/oratrix_locator/final_data/socal.final.vcf.gz \
  --sample_data /home/ramirezb/oratrix_locator/final_data/sample_data_socal_final.txt \
  --out /home/ramirezb/oratrix_locator/final_run_oct2025/out_socal/oratrix_seed${SEED} \
  --bootstrap \
  --nboots 200 \
  --train_split 0.7 \
  --nlayers 6 \
  --seed $SEED
Code
setwd("~/Dropbox/08_yhpa_072025/final_locator/socal/out_socal")
# Get list of all predloc files
socal_files <- list.files(pattern = "predlocs.*\\.txt")
socal_files
#remove full run
socal_files <- socal_files[-c(201)]

#This is the prediction from the full model trained on all SNPs (no resampling)
socal_full<-read_delim("oratrix_seed321_bootFULL_predlocs.txt")

# Read and bind them - all bootstrapped predictions
socal_all_preds <- socal_files %>%
  map_df(~ read_delim(.x, delim = ",", col_names = TRUE), .id = "file")

socal_individual_means <- socal_all_preds %>%
  group_by(sampleID) %>%
  summarize(mean_long = mean(x, na.rm = TRUE),
            mean_lat = mean(y, na.rm = TRUE))

socal_summary_stats<-socal_all_preds %>%
  group_by(sampleID) %>%
  summarize(mean_long = mean(x, na.rm = TRUE),
            mean_lat = mean(y, na.rm = TRUE),
            sd_x = sd(x, na.rm = TRUE),
            sd_y = sd(y, na.rm = TRUE))

7 Plotting Rescue Predictions

Code
#showing 95% boostrap ellipse
#socal_ellipse<-
  ggplot() +
  
    geom_sf(data = cropped_yhpa_map, aes(fill = "map"), color = "black") +
    geom_sf(data = cropped_mx_states, fill="gray95") +
    geom_sf(data = cropped_yhpa_shp, aes(fill = "YHPA"), color = "black") +
    geom_sf(data = cropped_ynpa_shp, fill = "cornsilk2", color = "black") +
    coord_sf(xlim = c(-109, -86), ylim = c(15.3, 26), expand = FALSE) +
  
  scale_fill_manual(
    values = c("map" = "gray95",
               "YHPA" = "cornsilk2")
    ,
    breaks = c("map",
               "YHPA"),
    
    labels = c("map" = "outside observed range", "YHPA" = "observed range")
  ) +
  
    guides(
  fill = guide_legend(
    override.aes = list(
      shape = c(21, 21),    # NA for ellipse (no point)
      linetype = c(0, 0),        # 2 = dashed for ellipse
      colour = c("black", "black"),              
      size = 8.5
    )
  )
) +
  
  ggnewscale::new_scale_fill() +

   # Ellipse around bootstrapped points
    stat_ellipse(data = socal_all_preds, 
                 aes(x = y, y = x, 
                     group = sampleID, 
                     fill = "bootstrapped 95% confidence ellipse"),
                 level = 0.95, 
                 linewidth = 1, 
                 linetype = 2, 
                 alpha = 0.5, 
                 color = "#008B8B",
                 inherit.aes = FALSE, 
                 show.legend = TRUE) +
  
  geom_point(data = socal_individual_means, aes(x = mean_lat, y = mean_long, fill = "mean of 200 bootstraps"), 
             colour="black", size = 10) +

    geom_point(data = full_test_sample_data, aes(x = y, y = x, fill = Subspecies), 
       colour="black",pch=21, size = 10) +

    scale_color_manual(
    values = c("bootstrapped 95% confidence ellipse" = "#008B8B"),
  ) +
    
    scale_fill_manual(
     values = c(
    "auropalliata" = "#fff700",
    "belizensis" = "#ff99fd",
    "oratrix (east)" = "#ffab00",
    "oratrix (west)" = "#1E90FF",
    "tresmariae" = "#F75394",
    "mean of 200 bootstraps" = "black",
    "bootstrapped 95% confidence ellipse" = "#008B8B"
  ),
  breaks = c(
    "auropalliata",
    "tresmariae",
    "oratrix (west)",
    "oratrix (east)",
    "belizensis",
    "mean of 200 bootstraps",
    "bootstrapped 95% confidence ellipse"
  ),
  labels = c(
    "auropalliata" = "*A. auropalliata*",
    "tresmariae" = "*A. o. tresmariae*",
    "oratrix (west)" = "*A. o. oratrix* (west)",
    "oratrix (east)" = "*A. o. oratrix* (east)",
    "belizensis" = "*A. o. belizensis*",
    "mean of 200 bootstraps" = "mean of bootstrap predictions",
    "bootstrapped 95% confidence ellipse" = "bootstrapped 95% confidence ellipse"
  )
    , guide = guide_legend(override.aes = list(size = 7.5))
) +
  
  guides(
  fill = guide_legend(
    override.aes = list(
      shape = c(21, 21, 21, 21, 21, 21, 21),    # NA for ellipse (no point)
      linetype = c(0, 0, 0, 0, 0, 0, 2),        # 2 = dashed for ellipse
      colour = c("black", "black", "black", "black", "black", "black", "#008B8B"),              
      size = 9
    )
  )
) + 
    
  geom_text(data = full_test_sample_data, aes(x = y, y = x, label = group),
  color = "black",
  size = 6.5, 
  family = "Arial"
  ) +
    
  geom_text(data = locator_test, aes(x = y, y = x, label = group),
  color = "black",
  size = 6.5, 
  family = "Arial"
  ) +
      
  theme_classic() +
  theme(legend.text = ggtext::element_markdown(
        family = "Arial",
        size = 20,
        color = "black",
        margin = margin(l = 0.5)),
        legend.title = element_blank(),
        # legend.position = c(0.95, 0.98),
        # legend.box = "horizontal",
        legend.justification = c("right", "top"),
        axis.title = element_blank()
    )
Code
# ggsave(socal_ellipse,
#        filename="~/Dropbox/08_yhpa_072025/final_locator/cropped_figures/YHPA.fig3B.updated.svg",
#        device="svg",
#        # filename="~/Dropbox/08_yhpa_072025/final_locator/cropped_figures/YHPA.fig3B.updated.png",
#        # device="png",
#        dpi = 400, width = 15, height = 9)
Code
knitr::include_graphics("~/Dropbox/08_yhpa_072025/final_figures/YHPA.fig3B.updated.png")