Genomic-wide association study (GWAS)
This is an script developed using a population simulated in AlphaSimR package (Gaynor, Gorjanc, and Hickey 2021). One trait with high heritability and controlled by a few QTLs was simulated into two breeding populations. In addition, four GWAS models were implemented, as follows:
Model 1. Non Adjusted markers
Model 2. Adjusted by Q
Model 3. Adjusted by K
Model 4. Adjusted by K+Q
library(AlphaSimR) # For genomic simulation
library(rrBLUP) #For K and Q+K GWAS
library(qqman) #For Manhattan and Q-Q plots
library(ggfortify) #For plotting PCA
library(stringr) #For splitting strings
# Global parameters
#nQtlPerChr = 4 #Per pair of chromosome
#nSnpPerChr = 1000 # must be > nQtlPerChr
# Simulate mapping population
#set.seed(18556255)
#FOUNDERPOP = runMacs(nInd=200,
# nChr=10,
# segSites=nSnpPerChr,
# split=10)
# Simulate the trait of interest
#SP = SimParam$
# new(FOUNDERPOP)$
# restrSegSites(minSnpPerChr=nSnpPerChr,
# minQtlPerChr=nSnpPerChr,
# overlap=TRUE)$
# addTraitA(nQtlPerChr,gamma=TRUE)$
# addSnpChip(nSnpPerChr)$
# setVarE(H2=0.8) # Trait heritability
#pop = newPop(FOUNDERPOP)
In a way to create a population structure, we implemented two steps.
The first was to split the base genome into two. We used above
(runMacs
function) the argument split=20
,
which splits the base genome in two, 20 generations ago. The second, is
going to be the cycles of selections where each population go through
(below). As we did split the base genome before, we will create two
populations (popA and popB), where the popA takes the individuals from
1:500, and the second from 501:1000 out of 1000 individuals from the
base genome. Then, the first population (popA) go through 6 cycles of
selections and the second population (popB) face 4 cycles of selection.
As they initially differ, after the different number of cycles of
selection, it is likely that the allele frequency of the two populations
would be different.
#Create 2 sub populations
#popA = pop[1:100]
#popB = pop[101:200]
#Cycles of selection
#for(i in 1:4){
# popA = selectCross(popA,
# nInd=10,
# nCrosses=20,
# nProgeny=5)
# if(i<3){
# popB = selectCross(popB,
# nInd=10,
# nCrosses=20,
# nProgeny=5)
# }
#}
#Combining both pops
#pop = c(popA,popB)
For the GWAS analysis, we going to organize the data. We will need dosage matrix (coded -1,0,1), phenotypes values for the trait of interest, and the QTL positions.
#rawGeno = pullSnpGeno(pop)
# Genotypes/SNP matrix/dosage matrix
#rawGeno = rawGeno-1
#geno = as.data.frame(t(rawGeno))
#geno = data.frame(snp=row.names(geno),
# chr=rep(1:10,each=nSnpPerChr),
# pos=rep(1:nSnpPerChr,10),
# geno)
# Create "pheno" data.frames for rrBLUP
#pheno = data.frame(gid=names(geno)[-(1:3)],
# trait1=pop@pheno[,1])
# Tracking the information of population
#phenoQ = data.frame(gid=pheno$gid,
# subPop=factor(rep(c("a","b"),each=100)),
# trait1=pheno$trait1)
# Find QTL locations within SNP chip and chromosome position
#qtl = paste0(rep(1:10, each = 4), '_',SP$traits[[1]]@lociLoc)
#df = data.frame(eff = SP$traits[[1]]@addEff,
# Mkr = SP$traits[[1]]@lociLoc,
# qtl = qtl)
#write.csv(geno,file = "genofile.csv",row.names = F)
#write.csv(phenoQ, file = "phenofile.csv", row.names = F)
#write.csv(df, file = "qtl_pos.csv", row.names = F)
= read.csv("genofile.csv") #SNP data
X.org #X.org[1:5,1:5]
= t(X.org[,-c(1:3)]) #Transpose marker
X = as.matrix(X) #transforming to matrix
X colnames(X) = X.org$snp
1:5,1:5] X[
## 1_1 1_2 1_3 1_4 1_5
## X701 -1 1 1 -1 1
## X702 -1 1 1 -1 1
## X703 -1 1 1 -1 1
## X704 -1 1 1 -1 1
## X705 -1 1 1 -1 1
= read.csv("phenofile.csv") #Phenotypic data
dat str(dat)
## 'data.frame': 200 obs. of 3 variables:
## $ gid : chr "X701" "X702" "X703" "X704" ...
## $ subPop: chr "a" "a" "a" "a" ...
## $ trait1: num 2.99 4.13 4.34 3.67 4.55 ...
unique(dat$subPop)
## [1] "a" "b"
= read.csv("qtl_pos.csv") #Marker effect from simulation snp_qtl
In this step we will extract the SNPs from the core population and we will create a PCA to explore the structure in the data.
# Format genotypes for rrBLUP
= X+1
X2 = colMeans(X2)/2
freq = apply(cbind(freq,1-freq),1,min)
MAF
# Ploting PCA
<- prcomp(X)
pca_res
autoplot(pca_res,data=dat,colour="subPop")
For the GWAS analysis, we going to organize the data. We will need dosage matrix (coded -1,0,1), phenotypes values for the trait of interest, and the QTL positions.
= X.org geno
Single marker regression
\[y = S\alpha + e\]
#No adjustments----
= data.frame(snp=geno$snp,
model0 chr=geno$chr,
pos=geno$pos,
trait1=rep(NA,nrow(dat)))
$chr=as.numeric(model0$chr);model0$pos=as.numeric(model0$pos)
model0
for(i in 1:ncol(X)){
if(MAF[i]>=0.05){
#Fit linear model with lm() and extract p-value
= summary(lm(dat$trait1~X[,i]))
mod1 = mod1[[4]][2,4]
tmp #Check for markers confounded with structure
if(is.na(tmp)){
$trait1[i] = 1
model0else{
}$trait1[i] = tmp
model0
}else{
}$trait1[i] = 1
model0
}
}
#Account for p-value=0
$trait1[model0$trait1==0] = 1e-300 model0
Single marker regression with population as fixed effect
\[y = X\beta + S\alpha + e\]
#Adjust for population structure (Q)----
= model0
modelQ $chr=as.numeric(modelQ$chr);modelQ$pos=as.numeric(modelQ$pos)
modelQ
for(i in 1:ncol(X)){
if(MAF[i]>=0.05){
#Fit linear model with lm() and extract p-value
= summary(lm(dat$trait1~dat$subPop+X[,i]))[[4]]
mod2 if(dim(mod2)[1] == 2){
= NA
tmp else{
}= mod2[3,4]
tmp
}#Check for markers confounded with structure
if(is.na(tmp)){
$trait1[i] = 1
modelQelse{
}$trait1[i] = tmp
modelQ
}else{
}$trait1[i] = 1
modelQ
}
}#Account for p-value=0
$trait1[modelQ$trait1==0] = 1e-300 modelQ
Single marker regression with covariance information among the individuals. The matrix will be calculated internally by the function ’A.mat`.
\[y = S\alpha + Qv + e \]
#Adjust for kinship (K)----
= A.mat(X)
Kmat = GWAS(pheno=dat[,-2],geno=geno,K=Kmat,plot=FALSE) modelK
## [1] "GWAS for trait: trait1"
## [1] "Variance components estimated. Testing markers."
$trait1 = 10^(-modelK$trait1) #Revert to p-value modelK
\[y = X\beta + S\alpha + Qv + e \]
#Adjust for both structure and kinship (Q+K)----
= GWAS(pheno=dat,geno=geno,K=Kmat,fixed="subPop",plot=FALSE) modelQK
## [1] "GWAS for trait: trait1"
## [1] "Variance components estimated. Testing markers."
$trait1 = 10^(-modelQK$trait1) #Revert to p-value modelQK
# Assuming Bonferroni Threshold (0.05/m)
= -log10(0.05/ncol(X))
hline
# Manhattan plots
= par(mfrow=c(2,2),mai=c(.9,.9,0.9,0.9),
op mar=c(2.5,2.5,1,1)+0.1,
mgp = c(1.5,0.5,0))
manhattan(model0,chr="chr",bp="pos",p="trait1",snp="snp",highlight=snp_qtl$qtl,
main="Unadjusted", col = c("blue4", "orange3"),
suggestiveline = FALSE, genomewideline = hline)
manhattan(modelQ,chr="chr",bp="pos",p="trait1",snp="snp",highlight=snp_qtl$qtl,
main="Adjusted for Q", col = c("blue4", "orange3"),
suggestiveline = FALSE, genomewideline = hline)
manhattan(modelK,chr="chr",bp="pos",p="trait1",snp="snp",highlight=snp_qtl$qtl,
main="Adjusted for K", col = c("blue4", "orange3"),
suggestiveline = FALSE, genomewideline = hline)
manhattan(modelQK,chr="chr",bp="pos",p="trait1",snp="snp",highlight=snp_qtl$qtl,
main="Adjusted for Q+K", col = c("blue4", "orange3"),
suggestiveline = FALSE, genomewideline = hline)
#Q-Q plots
= par(mfrow=c(2,2),mai=c(.9,.9,0.9,0.9),
op mar=c(2.5,2.5,1,1)+0.1,
mgp = c(1.5,0.5,0))
qq(model0$trait1,main="Unadjusted", col = "blue4")
qq(modelQ$trait1,main="Adjusted for Q", col = "blue4")
qq(modelK$trait1,main="Adjusted for K", col = "blue4")
qq(modelQK$trait1,main="Adjusted for Q+K", col = "blue4")
###
## EXTRA 1: Extracting the effects
## Let's extract the effects of the highest peak
= GWAS(pheno=dat,geno=geno,K=Kmat,fixed="subPop",plot=FALSE) modelQK
## [1] "GWAS for trait: trait1"
## [1] "Variance components estimated. Testing markers."
which.max(modelQK$trait1) #SNP #7246
## [1] 7246
colnames(X)[7246]
## [1] "8_246"
= X[,7246]
h.snp = pca_res$x[,c(1:2)]
PCs
<- mixed.solve(y=dat$trait1,K=Kmat,X=cbind(h.snp,PCs))
model.full $beta #you have beta (fixed effects) with four effects, the 2 PCs and the SNP effect model.full
## h.snp PC1 PC2
## -2.061597463 0.005077491 0.021643665
$beta[1] #with the increase in one unit of X the phenotype reduces on -2.061597463 model.full
## h.snp
## -2.061597
## EXTRA 2: Extracting the % variance explained by the SNP
## There is no straightforward way to compute the % of the phenotypic variance explained by this SNP with rrBLUP package. So, to get such statistic you would need to use some tricks or use another software
## 1) Using a simpler model
= lm(dat$trait1 ~ h.snp + PCs) #so no K component
fit anova(fit)
## Analysis of Variance Table
##
## Response: dat$trait1
## Df Sum Sq Mean Sq F value Pr(>F)
## h.snp 1 28.807 28.8075 75.726 1.329e-15 ***
## PCs 2 13.679 6.8397 17.980 6.772e-08 ***
## Residuals 196 74.561 0.3804
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## % of variance explained by the SNP is 24.6%
28.8075 / (28.8075 + 13.679 + 74.561)
## [1] 0.246118
## 2) Using a two-step strategy
#### i) fit the linear mixed model
#### i) take out the polygenic effects from the data
#### ii) use transformed phenotype to extract variance %
<- mixed.solve(y=dat$trait1,K=Kmat,X=cbind(h.snp,PCs))
model.full = as.vector(dat$trait1) - model.full$u
Yhat = lm(Yhat ~ h.snp + PCs) #so no K component
fit anova(fit)
## Analysis of Variance Table
##
## Response: Yhat
## Df Sum Sq Mean Sq F value Pr(>F)
## h.snp 1 28.807 28.8075 75.726 1.329e-15 ***
## PCs 2 13.679 6.8397 17.980 6.772e-08 ***
## Residuals 196 74.561 0.3804
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## % of variance explained by the SNP is 24.6%
28.8075 / (28.8075 + 13.679 + 74.561)
## [1] 0.246118
Here, we will run different GWAS models described by Zhiwu Zwang using Genomic Association and Prediction Integrated Tool (GAPIT) R package. We will run FarmCPU and BLINK models using the simulated data described earlier. Gapit R package can be used by installation into R environment or calling the function from the source.
# Install an important dependency from GitHub with
#devtools::install_github("SFUStatgen/LDheatmap")
#install.packages("remotes")
#remotes::install_github("jiabowang/GAPIT3")
library(GAPIT)
#or
# loading packages for GAPIT and GAPIT functions
#source("https://zzlab.net/GAPIT/emma.txt")
#source("https://zzlab.net/GAPIT/gapit_functions.txt")
# from GAPIT user manual
#source("http://zzlab.net/GAPIT/GAPIT.library.R")
#source("http://zzlab.net/GAPIT/gapit_functions.txt")
Model 5. Adjusted by Psuedo QTNs (S)+K
Model 6. Adjusted by Psuedo QTNs (S)+S
We will need dosage matrix (coded 0,1,2), therefore, we will add 1 to the snp marker dataframe.
= dat[,-2] #Genotype id and response variable(s)
myY = data.frame(X.org[,1:3]) #Marker information: SNP name, chromosome and position
myGM = t(X.org[,-c(1:3)])+1 #change dosage to 0,1,2
XX = data.frame(dat["gid"],XX) #Genotype id and Imputed SNP marker
myGD rownames(myGD) = NULL
colnames(myGD)[2:c(ncol(XX)+1)] = X.org$snp
FarmCPU removes associated markers that is confounding from kinship by using it as fixed-effect
::include_graphics("FarmCPU.png") knitr
# s = Testing marker
# S = Psuedo QTNs
# K = Kinship
#myGAPIT_FarmCPU <- GAPIT(
# Y=myY, #fist column is individual ID,
# GD=myGD,
# GM=myGM,
# PCA.total=2, #Two groups
# model="FarmCPU", #c("MLM","MLMM","CLMM","FarmCPU","Blink"),
# SNP.MAF = 0.05,
# Geno.View.output = FALSE)
BLINK select most significantly associated maker as reference and eliminate remaining markers that are in LD with the most associated marker.
::include_graphics("BLINK.png") knitr
# s = Testing marker
# S = Psuedo QTNs
#myGAPIT_Blink <- GAPIT(
# Y=myY,
# GD=myGD,
# GM=myGM,
# PCA.total=2,
# model="Blink",
# SNP.MAF = 0.05,
# Geno.View.output = FALSE)
::include_graphics("data_sum.png") #Phenotype diagnosis and pca knitr
::include_graphics("gwas_plot.png") #manhattan plot and qqplot knitr
::include_graphics("gwas_pve.png") #Distribution of significant markers knitr
::include_graphics("gwas_sig.png") #Distribution of significant markers knitr
#snp_qtl2 = snp_qtl[order(snp_qtl$eff,decreasing = T),]
#snp_qtl2[which(snp_qtl2$qtl%in%c("8_250","2_896","7_804")),] #8_248
University of Florida, mresende@ufl.edu↩︎
University of Florida, deamorimpeixotom@ufl.edu↩︎
University of Florida, paul.adunola@ufl.edu↩︎