This script was developed using a subset of advanced selections of the University of Florida blueberry breeding program. The population is made up of 138 individuals phenotyped for a moderately heritable trait and genotyped with targeted sequence capture that yielded ~40k SNPs after dosage allele calling. In this hands-on, the optimization of genomic selection (GS) was presented for optimizing (i) training size (TSO) and; (ii) marker size (MSO).
#On windows, install rtools
#On MAC Install Xcode from apple, and gfortran6.3 or gfortran8.2 from
#https://cran.r-project.org/bin/macosx/tools/
#Download and install gfortran-12.2-universal.pkg
#Please install also Rcpp and Rcpp-Armadillo packages.
#library(devtools)
#install_github("TheRocinante-lab/TrainSel")
#Load libraries
library(TrainSel)
## Loading required package: cluster
## Loading required package: parallel
##
## ***********************************************************
##
## This is 'TrainSel' package, v 2.0
## Copyright 2022 StatGen. All rights reserved.
## Please read the LICENSE file for more information
## ***********************************************************
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading objects:
## pheno_dat
## geno_dat
## Initial data:
## Number of Individuals: 138
## Number of Markers: 41050
##
## Missing data check:
## Total SNPs: 41050
## 58 SNPs dropped due to missing data threshold of 0.5
## Total of: 40992 SNPs
##
## MAF check:
## No SNPs with MAF below 0
##
## Heterozigosity data check:
## No SNPs with heterozygosity, missing threshold of = 0
##
## Summary check:
## Initial: 41050 SNPs
## Final: 40992 SNPs ( 58 SNPs removed)
##
## Completed! Time = 3.789 seconds
For TSO, two approaches was tested. Firstly, random selection of a
defined number of individuals (size). The sampling for size
(used as training set) was repeated rep
times. For the
second approach, CDMean algorithm was employed from TrainSel (Deniz Akdemir and Sánchez 2021) R package.
Example 1
Here, y is the vector of the phenotype while G is the Gmatrix. In
this example, a seed
is used to randomly sample individuals
used as training set (TS).
set.seed(123)
size = 25
y = pheno_dat$trait1
n = c(1:length(y))
rand = sample(n, size, F)
test = n[-rand]
yNA = y
yNA[test] = NA
#Fit - gblub
fit = mixed.solve(yNA, K=G)
yhat = fit$u[test]
r2 = cor(yhat, y[test],use="complete")
print(r2)
## [1] 0.3762877
In other to allow for fair representation of all individuals in the population, random sampling was employed to repeatedly select the individuals used as TS with different seeds (5 times). Ideally, this should be repeated about 100 times but kept to 5 for simplicity.
#Create a function for random TS
gblup_ts_rand = function(rep,size,y,G){
r2_gb = vector()
#Set seed to randomly sample rep numbers
set.seed(123)
mod.seed=sample(1:10000,rep,F)
for (i in 1:rep) {
set.seed(mod.seed[i])
n = c(1:length(y))
rand = sample(n, size, F)
test = n[-rand]
y[is.nan(y)]<-NA; yNA = y
yNA[test] = NA
#Fit - gblub
fit = mixed.solve(yNA, K=G)
yhat = fit$u[test]
r2_gb[i] = cor(yhat, y[test],use="complete")
}
return(list(gblup=r2_gb))
}
#Random selection
#25 individuals
rand_25 = gblup_ts_rand(rep=50,size=25,y=pheno_dat$trait1,G=G)
#50 individuals
rand_50 = gblup_ts_rand(rep=50,size=50,y=pheno_dat$trait1,G=G)
#75 individuals
rand_75 = gblup_ts_rand(rep=50,size=75,y=pheno_dat$trait1,G=G)
Mean coefficient of determination (CDMean) is the expected correlation between the true and predicted genotypic values. Read Laloë (1993) (Laloë 1993) and Rincent et al. (2012) (Rincent 2012) for more information. CDMean can be used to measure the suitability of a training set to make predictions over a target population (Fernández-González et al., 2022) (Fernández-González 2023). CDMean depends on the linear mixed models.
\[y = X\beta + Zu + e\]
Example 2
TSC = TrainSelControl()
TSC$niterations = 1000 #Number of iterations
#Select CDmean-Optimal subset of 25, 50, 75 samples
dataCDMEANopt = list(G = G, lambda = 1)
CDMEANOPT = function(soln, Data){
G = Data[["G"]]
lambda = Data[["lambda"]]
Vinv = solve(G[soln,soln]+lambda*diag(length(soln)))
outmat = (G[,soln]%*%(Vinv-(Vinv%*%Vinv)/sum(Vinv))%*%G[soln,])/G
return(mean(diag(outmat[-soln,-soln])))
}
#Set seed to randomly sample 5 numbers
set.seed(123)
mod.seed=sample(1:10000,5,F)
#Optimal 25
TSOUTCD_25 = list()
for (i in 1:length(mod.seed)) {
set.seed(mod.seed[i])
selected = TrainSel(Data = dataCDMEANopt,
Candidates = list(1:nrow(G)),
setsizes = c(25),
settypes = "UOS",
Stat = CDMEANOPT, control = TSC)
TSOUTCD_25[[i]] = rownames(G)[selected$BestSol_int]
}
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
TSOUTCD_25 = do.call('cbind',TSOUTCD_25)
write.csv(TSOUTCD_25, file = "TSOUTCD_25.csv", row.names = F)
#Optimal 50
TSOUTCD_50 = list()
for (i in 1:length(mod.seed)) {
set.seed(mod.seed[i])
selected = TrainSel(Data = dataCDMEANopt,
Candidates = list(1:nrow(G)),
setsizes = c(50),
settypes = "UOS",
Stat = CDMEANOPT, control = TSC)
TSOUTCD_50[[i]] = rownames(G)[selected$BestSol_int]
}
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
TSOUTCD_50 = do.call('cbind',TSOUTCD_50)
write.csv(TSOUTCD_50, file = "TSOUTCD_50.csv", row.names = F)
#Optimal 75
TSOUTCD_75 = list()
for (i in 1:length(mod.seed)) {
set.seed(mod.seed[i])
selected = TrainSel(Data = dataCDMEANopt,
Candidates = list(1:nrow(G)),
setsizes = c(75),
settypes = "UOS",
Stat = CDMEANOPT, control = TSC)
TSOUTCD_75[[i]] = rownames(G)[selected$BestSol_int]
}
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
## Convergence Achieved
## (no improv in the last 'minitbefstop' iters).
TSOUTCD_75 = do.call('cbind',TSOUTCD_75)
write.csv(TSOUTCD_75, file = "TSOUTCD_75.csv", row.names = F)
#Visualizing convergence
plot(selected$maxvec, xlab = "Iteration", ylab = "Obj-Func Max Value")
Load saved selections
## Rows: 25 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): V1, V2, V3, V4, V5
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 50 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): V1, V2, V3, V4, V5
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 75 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): V1, V2, V3, V4, V5
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Function to run gblup gs: opt and k-fold CVs
gs_opt
function runs gblup genomic prediction with the
five reps of selected individuals from optimization algorithm.
Additionally, gblup_gs
function runs 10 fold
cross-validation for the full dataset. This was used to compare the TSO
outputs.
gs_opt = function(G,train_ind,pheno,trait){
trait_id = which(names(pheno)==trait)
y = pheno[,trait_id]
r2_gb = vector()
reps = ncol(train_ind)
for (i in 1:reps) {
sel = train_ind[,i]
test = which(!pheno[,1] %in% sel)
yNA = y; yNA[test] = NA
#Fit - gblub
fit = mixed.solve(yNA, K=G)
yhat = fit$u[test]
r2_gb[i] = cor(yhat, y[test],use="complete")
}
return(list(gblup=r2_gb))
}
#10 Folds CV
gblup_gs = function(G,y,reps){
set.seed(123)
seed=sample(1:10000,reps,F)
rep_r2 = list();
for (i in 1:reps) {
set.seed(seed[i])
print(i)
folds=sample(1:10,size=length(y),replace=T)
y[is.nan(y)]<-NA
r2 = vector()
for (j in 1:max(folds)) {
test = which(folds==j)
yNA = y; yNA[test] = NA
#Fit
fit = mixed.solve(yNA, K=G)
yhat = fit$u[test]
r2[j] = cor(yhat, y[test],use="complete")
}
rep_r2[[i]] = r2
}
return(rep_r2)
}
cd_25 = gs_opt(G,TSOUTCD_25,pheno_dat,"trait1")
cd_50 = gs_opt(G,TSOUTCD_50,pheno_dat,"trait1")
cd_75 = gs_opt(G,TSOUTCD_75,pheno_dat,"trait1")
all = gblup_gs(G,pheno_dat$trait1,5)
size = c(25, 50, 75, "all")
random = c(mean(rand_25$gblup),mean(rand_50$gblup),
mean(rand_75$gblup),mean(unlist(all)))
cdmean = c(mean(cd_25$gblup,na.rm=T),mean(cd_50$gblup,na.rm=T),
mean(cd_75$gblup,na.rm=T),mean(unlist(all)))
tso = data.frame(Size=size,Random=random,CDMean=cdmean)
knitr::kable(tso)
Size | Random | CDMean |
---|---|---|
25 | 0.3233391 | 0.3998368 |
50 | 0.3940426 | 0.4173206 |
75 | 0.4138347 | 0.5312799 |
all | 0.4583492 | 0.4583492 |
Example 3
In this example, a defined number of snp markers were randomly sampled to fit genomic prediction models. However, there are other data-driven and genetic methods in literature (MAF, mismatch rate, LD, Call rate etc.) for informed mso (Kainer et al., 2018) (David Kainer 2018). In the example below, 10k snps were randomly selected.
set.seed(123)
#Select random index
size = 10000
sel = sample(1:ncol(geno_dat), size, replace = F)
rand_snp = geno_dat[,sel];
#Select snps
n=length(pheno_dat$trait1); k=10
#set folds
folds=sample(1:k,size=n,replace=T)
#construct g matrix
G_sel = Gmatrix(rand_snp,ploidy = 4)
## Initial data:
## Number of Individuals: 138
## Number of Markers: 10000
##
## Missing data check:
## Total SNPs: 10000
## 6 SNPs dropped due to missing data threshold of 0.5
## Total of: 9994 SNPs
##
## MAF check:
## No SNPs with MAF below 0
##
## Heterozigosity data check:
## No SNPs with heterozygosity, missing threshold of = 0
##
## Summary check:
## Initial: 10000 SNPs
## Final: 9994 SNPs ( 6 SNPs removed)
##
## Completed! Time = 0.685 seconds
y = pheno_dat$trait1
#10 cv
r2 = vector()
for(i in 1:max(folds)){
test = which(folds==i)
yNA = y
yNA[test] = NA
#Fit - gblup
fit = mixed.solve(yNA, K=G_sel)
yhat = fit$u[test]
r2[i] = cor(yhat, y[test],use="complete")
}
mean(r2,na.rm=T)
## [1] 0.5525211
In the example below, the snp markers were randomly selected with
different seeds repeatedly (5 times) to avoid bias using
gblup
and rrblup
models.
#random marker size selection: gblup
rand_gblup_probes = function(snp,pheno,trait,size){
reps = list()
#Set seed to randomly sample 5 numbers
set.seed(123)
mod.seed=sample(1:10000,5,F)
trait_id = which(names(pheno)==trait)
y = pheno[,trait_id]; y[is.nan(y)]<-NA
for (j in 1:length(mod.seed)) {
print(j)
#Set seed
set.seed(mod.seed[j])
#Select random index
sel = sample(1:ncol(snp), size, replace = F)
rand_snp = snp[,sel];
#Select snps
n=length(y); k=10
#set folds
folds=sample(1:k,size=n,replace=T)
#construct g matrix
G_sel = Gmatrix(rand_snp,ploidy = 4)
#10 cv
r2 = vector()
for(i in 1:max(folds)){
test = which(folds==i)
yNA = y
yNA[test] = NA
#Fit - gblup
fit = mixed.solve(yNA, K=G_sel)
yhat = fit$u[test]
r2[i] = cor(yhat, y[test],use="complete")
}
reps[[j]]=r2
}
print(dim(rand_snp));print(rand_snp[1:5,1:5])
return(reps)
}
#imputation function
imputation = function(dat){
if (any(is.na(dat))) {
imputvalue = apply(dat, 2, mean, na.rm = TRUE)
ix = which(is.na(dat), arr.ind = TRUE)
dat[ix] = imputvalue[ix[, 2]]
}
return(dat)
}
#random marker size selection: rrblup
rand_rr_probes = function(snp,pheno,trait,size){
reps = list()
#Impute missing data
snp = imputation(snp)
#Set seed to randomly sample 5 numbers
set.seed(123)
mod.seed=sample(1:10000,5,F)
trait_id = which(names(pheno)==trait)
y = pheno[,trait_id]; y[is.nan(y)]<-NA
for (j in 1:length(mod.seed)) {
print(j)
#Set seed
set.seed(mod.seed[j])
#Select random index
sel = sample(1:ncol(snp), size, replace = F)
rand_snp = snp[,sel];
#Select snps
n=length(y); k=10
#set folds
folds=sample(1:k,size=n,replace=T)
#10 cv
r2 = vector()
for(i in 1:max(folds)){
test = which(folds==i)
y_train=y[-test]; y_test = y[test]
x_train= rand_snp[-test,]; x_test= rand_snp[test,]
#Fit
fit = mixed.solve(y_train, x_train)
yhat = x_test %*% fit$u
r2[i] = cor(yhat, y_test,use="complete")
}
reps[[j]]=r2
}
return(reps)
}
rand_gblup_1k = rand_gblup_probes(geno_dat,pheno_dat,"trait1",1000)
rand_gblup_5k = rand_gblup_probes(geno_dat,pheno_dat,"trait1",5000)
rand_gblup_10k = rand_gblup_probes(geno_dat,pheno_dat,"trait1",10000)
rand_gblup_20k = rand_gblup_probes(geno_dat,pheno_dat,"trait1",20000)
rand_rr_1k = rand_rr_probes(geno_dat,pheno_dat,"trait1",1000)
rand_rr_5k = rand_rr_probes(geno_dat,pheno_dat,"trait1",5000)
rand_rr_10k = rand_rr_probes(geno_dat,pheno_dat,"trait1",10000)
rand_rr_20k = rand_rr_probes(geno_dat,pheno_dat,"trait1",20000)
size = c(1000, 5000, 10000, 20000)
gblup = c(mean(unlist(rand_gblup_1k),na.rm=T),mean(unlist(rand_gblup_5k),na.rm=T),
mean(unlist(rand_gblup_10k),na.rm=T),mean(unlist(rand_gblup_20k),na.rm=T))
rrblup = c(mean(unlist(rand_rr_1k),na.rm=T),mean(unlist(rand_rr_5k),na.rm=T),
mean(unlist(rand_rr_10k),na.rm=T),mean(unlist(rand_rr_20k),na.rm=T))
mso = data.frame(Size=size,GBlup=gblup,RRBlup=rrblup)
knitr::kable(mso)
Size | GBlup | RRBlup |
---|---|---|
1000 | 0.4335913 | 0.4340556 |
5000 | 0.4364223 | 0.4365127 |
10000 | 0.4840333 | 0.4840627 |
20000 | 0.4541483 | 0.4542154 |
University of Florida, mresende@ufl.edu↩︎
University of Florida, https://pauliben.github.io/pauladunola.github.io/index.html↩︎