library(tidyverse)
load("~/Box/imlab-data/Projects/PTRS-PGRS-Rosetta/Data-From-Abe-Palmer-Lab/Rdata/genoGex.RData")
"%&%" = function(a,b) paste(a,b,sep="")
<- "/Users/natashasanthanam/Github/rat-genomic-analysis/data/"
wd
= geno[,match(rownames(gexAc_transpose), colnames(geno))]
geno_Ac = geno[,match(rownames(gexIl_transpose), colnames(geno))]
geno_Il = geno[,match(rownames(gexLh_transpose), colnames(geno))]
geno_Lh = geno[,match(rownames(gexPl_transpose), colnames(geno))]
geno_Pl = geno[,match(rownames(gexVo_transpose), colnames(geno))]
geno_Vo
<- cbind(phyMap$chr, phyMap$pos, rownames(geno_Ac), phyMap$refAllele, phyMap$effectAllele, geno_Ac)
Ac_bimbam <- cbind(phyMap$chr, phyMap$pos, rownames(geno_Il),phyMap$refAllele, phyMap$effectAllele, geno_Il)
Il_bimbam <- cbind(phyMap$chr, phyMap$pos, rownames(geno_Lh),phyMap$refAllele, phyMap$effectAllele, geno_Lh)
Lh_bimbam <- cbind(phyMap$chr, phyMap$pos, rownames(geno_Pl),phyMap$refAllele, phyMap$effectAllele, geno_Pl)
Pl_bimbam <- cbind(phyMap$chr, phyMap$pos, rownames(geno_Vo),phyMap$refAllele, phyMap$effectAllele, geno_Vo)
Vo_bimbam
write.table(Ac_bimbam, file = wd %&% "Ac_bimbam",quote=F,col.names=F,row.names=F)
write.table(Il_bimbam, file = wd %&% "Il_bimbam",quote=F,col.names=F,row.names=F)
write.table(Lh_bimbam, file = wd %&% "Lh_bimbam",quote=F,col.names=F,row.names=F)
write.table(Pl_bimbam, file = wd %&%"Pl_bimbam",quote=F,col.names=F,row.names=F)
write.table(Vo_bimbam, file = wd %&%"Vo_bimbam",quote=F,col.names=F,row.names=F)
01.Heritability_Sparsity
Generate Heritability and Sparsity Estimates for all 5 tissues
Calculate Cis Heritability within 1MB
First we create bimbam formats for genotypes from the original genotype file. The bimbam format is the input for gemma, which we will use for both heritability and sparisty estiamtes.
Collect list of individuals from the expression files
<- read_tsv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gtf.txt", col_names=TRUE)
gtf <- read.table("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gexAc_transpose.txt")
gexAc_transpose <- read.table("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gexIl_transpose.txt")
gexIl_transpose <- read.table("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gexLh_transpose.txt")
gexLh_transpose <- read.table("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gexPl_transpose.txt")
gexPl_transpose <- read.table("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gexVo_transpose.txt")
gexVo_transpose
<- colnames(gexAc_transpose)
ensidlist <- colnames(gexIl_transpose)
ensidlist_Il <- colnames(gexLh_transpose)
ensidlist_Lh <- colnames(gexPl_transpose)
ensidlist_Pl <- colnames(gexVo_transpose) ensidlist_Vo
Ac
set directory
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/phenotype_files/"
pheno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/genotype_files/"
ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/bim_bam/"
bim.dir #Read in bimbam file
<- bim.dir %&% "Ac_bimbam" ###get SNP position information###
bimbamfile <- read.table(bimbamfile) bimbam
Make local GRMs for each gene
setwd("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/")
for(i in 1:length(ensidlist)){
cat(i,"/",length(ensidlist),"\n")
<- ensidlist[i]
gene <- gtf[match(gene, gtf$Gene),]
geneinfo <- geneinfo[1]
chr <- chr$Chr
c <- geneinfo$Start - 1e6 ### 1Mb lower bound for cis-eQTLS
start <- geneinfo$End + 1e6 ### 1Mb upper bound for cis-eQTLs
end <- subset(bimbam, bimbam[,1]==c) ### pull snps on same chr
chrsnps <- subset(chrsnps,chrsnps[,2]>=start & chrsnps[,2]<=end) ### pull cis-SNP info
cissnps <- cissnps[,3:ncol(cissnps)]
snplist write.table(snplist, file= ge.dir %&% "tmp.Ac.geno" %&% gene, quote=F,col.names=F,row.names=F)
<- cbind(gexAc_transpose[,i])
geneexp write.table(geneexp, file= pheno.dir %&% "tmp.pheno." %&% gene, col.names=F, row.names = F, quote=F) #output pheno for gemma
<- "gemma -g " %&% ge.dir %&% "tmp.Ac.geno" %&% gene %&% " -p " %&% pheno.dir %&% "tmp.pheno." %&% gene %&% " -gk -o grm_Ac_" %&% gene
runGEMMAgrm system(runGEMMAgrm)
}
Now we do the above process for the rest of tissues
Il
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/phenotype_files/"
pheno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/genotype_files/"
ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/bim_bam/"
bim.dir #Read in bimbam file
<- bim.dir %&% "Il_bimbam" ###get SNP position information###
bimbamfile <- read.table(bimbamfile) bimbam
Make local GRMs for each gene
setwd("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/")
for(i in 1:length(ensidlist)){
cat(i,"/",length(ensidlist),"\n")
<- ensidlist[i]
gene <- gtf[match(gene, gtf$Gene),]
geneinfo <- geneinfo[1]
chr <- chr$Chr
c <- geneinfo$Start - 1e6 ### 1Mb lower bound for cis-eQTLS
start <- geneinfo$End + 1e6 ### 1Mb upper bound for cis-eQTLs
end <- subset(bimbam, bimbam[,1]==c) ### pull snps on same chr
chrsnps <- subset(chrsnps,chrsnps[,2]>=start & chrsnps[,2]<=end) ### pull cis-SNP info
cissnps <- cissnps[,3:ncol(cissnps)]
snplist write.table(snplist, file= ge.dir %&% "tmp.Il.geno" %&% gene, quote=F,col.names=F,row.names=F)
<- cbind(gexIl_transpose[,i])
geneexp write.table(geneexp, file= pheno.dir %&% "tmp.pheno." %&% gene, col.names=F, row.names = F, quote=F) #output pheno for gemma
<- "gemma -g " %&% ge.dir %&% "tmp.Il.geno" %&% gene %&% " -p " %&% pheno.dir %&% "tmp.pheno." %&% gene %&% " -gk -o grm_Il_" %&% gene
runGEMMAgrm system(runGEMMAgrm)
}
Lh
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/phenotype_files/"
pheno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/genotype_files/"
ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/bim_bam/"
bim.dir #Read in bimbam file
<- bim.dir %&% "Lh_bimbam" ###get SNP position information###
bimbamfile <- read.table(bimbamfile) bimbam
Make local GRMs for each gene
setwd("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/")
for(i in 1:length(ensidlist)){
cat(i,"/",length(ensidlist),"\n")
<- ensidlist[i]
gene <- gtf[match(gene, gtf$Gene),]
geneinfo <- geneinfo[1]
chr <- chr$Chr
c <- geneinfo$Start - 1e6 ### 1Mb lower bound for cis-eQTLS
start <- geneinfo$End + 1e6 ### 1Mb upper bound for cis-eQTLs
end <- subset(bimbam, bimbam[,1]==c) ### pull snps on same chr
chrsnps <- subset(chrsnps,chrsnps[,2]>=start & chrsnps[,2]<=end) ### pull cis-SNP info
cissnps <- cissnps[,3:ncol(cissnps)]
snplist write.table(snplist, file= ge.dir %&% "tmp.Lh.geno" %&% gene, quote=F,col.names=F,row.names=F)
<- cbind(gexLh_transpose[,i])
geneexp write.table(geneexp, file= pheno.dir %&% "tmp.pheno." %&% gene, col.names=F, row.names = F, quote=F) #output pheno for gemma
<- "gemma -g " %&% ge.dir %&% "tmp.Lh.geno" %&% gene %&% " -p " %&% pheno.dir %&% "tmp.pheno." %&% gene %&% " -gk -o grm_Lh_" %&% gene
runGEMMAgrm system(runGEMMAgrm)
}
Pl
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/phenotype_files/"
pheno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/genotype_files/"
ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/bim_bam/"
bim.dir #Read in bimbam file
<- bim.dir %&% "Pl_bimbam" ###get SNP position information###
bimbamfile <- read.table(bimbamfile) bimbam
Make local GRMs for each gene
setwd("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/")
for(i in 1:length(ensidlist)){
cat(i,"/",length(ensidlist),"\n")
<- ensidlist[i]
gene <- gtf[match(gene, gtf$Gene),]
geneinfo <- geneinfo[1]
chr <- chr$Chr
c <- geneinfo$Start - 1e6 ### 1Mb lower bound for cis-eQTLS
start <- geneinfo$End + 1e6 ### 1Mb upper bound for cis-eQTLs
end <- subset(bimbam, bimbam[,1]==c) ### pull snps on same chr
chrsnps <- subset(chrsnps,chrsnps[,2]>=start & chrsnps[,2]<=end) ### pull cis-SNP info
cissnps <- cissnps[,3:ncol(cissnps)]
snplist write.table(snplist, file= ge.dir %&% "tmp.Pl.geno" %&% gene, quote=F,col.names=F,row.names=F)
<- cbind(gexPl_transpose[,i])
geneexp write.table(geneexp, file= pheno.dir %&% "tmp.pheno." %&% gene, col.names=F, row.names = F, quote=F) #output pheno for gemma
<- "gemma -g " %&% ge.dir %&% "tmp.Pl.geno" %&% gene %&% " -p " %&% pheno.dir %&% "tmp.pheno." %&% gene %&% " -gk -o grm_Pl_" %&% gene
runGEMMAgrm system(runGEMMAgrm)
}
#Vo
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/phenotype_files/"
pheno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/genotype_files/"
ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/bim_bam/"
bim.dir #Read in bimbam file
<- bim.dir %&% "Vo_bimbam" ###get SNP position information###
bimbamfile <- read.table(bimbamfile) bimbam
Make local GRMs for each gene
setwd("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/")
for(i in 1:length(ensidlist)){
cat(i,"/",length(ensidlist),"\n")
<- ensidlist[i]
gene <- gtf[match(gene, gtf$Gene),]
geneinfo <- geneinfo[1]
chr <- chr$Chr
c <- geneinfo$Start - 1e6 ### 1Mb lower bound for cis-eQTLS
start <- geneinfo$End + 1e6 ### 1Mb upper bound for cis-eQTLs
end <- subset(bimbam, bimbam[,1]==c) ### pull snps on same chr
chrsnps <- subset(chrsnps,chrsnps[,2]>=start & chrsnps[,2]<=end) ### pull cis-SNP info
cissnps <- cissnps[,3:ncol(cissnps)]
snplist write.table(snplist, file= ge.dir %&% "tmp.Vo.geno" %&% gene, quote=F,col.names=F,row.names=F)
<- cbind(gexVo_transpose[,i])
geneexp write.table(geneexp, file= pheno.dir %&% "tmp.pheno." %&% gene, col.names=F, row.names = F, quote=F) #output pheno for gemma
<- "gemma -g " %&% ge.dir %&% "tmp.Vo.geno" %&% gene %&% " -p " %&% pheno.dir %&% "tmp.pheno." %&% gene %&% " -gk -o grm_Vo_" %&% gene
runGEMMAgrm system(runGEMMAgrm)
}
Sparsity Analysis
All the code above generates the local GRM for each phenotype (gene). With the GRM we then run gemma again to calculate both PVE (heritability) and PGE (sparsity). I used a badger template to calculate h2 and sparsity for each tissue. This steps takes a lot of computing power, so we use Badger. It takes approximatley 2-3 days to run.
The code to run badger is here
source("./Rat_Genomics_Paper_Pipeline/analysis/Sparsity_Badger_Template.Rmd")
GEMMA then generates a .hyp file for each phenotype or in our case gene of interest. The hyp file contains the posterior samples for the hyper-parameters (h, PVE, rho, PGE, pi and gamma) for every 10th iteration. For our purposes, we are interested in the PGE and PVE parameters.
To then calculate the point estimate and credible set for Proportion of Variance Explained (PVE) and Proportion of genetic variance explained by the sparse effects terms (PGE), we calculate the posterior probability for each gene for each tissue.
I generated a function that calculates the beta of the posterior distribution and can be found here
Ac
Find point estimate and 95% credible interval in CRI for PVE
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/output"
ge.dir <- list.files(path = ge.dir, pattern = ".hyp.txt", full.names = TRUE) files
<- as.data.frame(matrix(NA, 0, 4))
PVE_df
for(i in 1:length(files)){
<- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
gene <- read_tsv(files[i])
df
<- quantile(df$pve, 0.1)
q1 <- quantile(df$pve, 0.9)
q2 =list(p=.1 ,x=q1)
quantile1=list(p=.9, x=q2)
quantile2if(quantile1$x != quantile2$x) {
<- beta.select(quantile1, quantile2)
prior <- list(qbeta(0.025,prior[1], prior[2]), qbeta(0.975,prior[1], prior[2]))
credible_set
1] <- gene
PVE_df[i, 2] <- qbeta(0.5, prior[1], prior[2])
PVE_df[i, 3] <- credible_set[1]
PVE_df[i, 4] <- credible_set[2]
PVE_df[i,
}else
= i+1
i
}
write_tsv(PVE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/PVE_estimates.txt", col_names = FALSE )
<- as.data.frame(matrix(NA, 0, 4))
PGE_df for(i in 1:length(files)){
<- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
gene <- read_tsv(files[i])
df
<- quantile(df$pge, 0.5)
q1 <- quantile(df$pge, 0.9)
q2 =list(p=.5 ,x=q1)
quantile1=list(p=.9, x=q2)
quantile2if(quantile1$x != quantile2$x) {
<- beta.select(quantile1, quantile2)
prior <- list(qbeta(0.025,prior[1], prior[2]), qbeta(0.975,prior[1], prior[2]))
credible_set
1] <- gene
PGE_df[i, 2] <- qbeta(0.5, prior[1], prior[2])
PGE_df[i, 3] <- credible_set[1]
PGE_df[i, 4] <- credible_set[2]
PGE_df[i,
}else
= i+1
i
}
write_tsv(PGE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/PGE_estimates.txt", col_names = FALSE )
Il
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/output"
ge.dir <- list.files(path = ge.dir, pattern = ".hyp.txt", full.names = TRUE) files
<- as.data.frame(matrix(NA, 0, 4))
PVE_df
for(i in 1:length(files)){
<- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
gene <- read_tsv(files[i])
df
<- quantile(df$pve, 0.1)
q1 <- quantile(df$pve, 0.9)
q2 =list(p=.1 ,x=q1)
quantile1=list(p=.9, x=q2)
quantile2if(quantile1$x != quantile2$x) {
<- beta.select(quantile1, quantile2)
prior <- list(qbeta(0.025,prior[1], prior[2]), qbeta(0.975,prior[1], prior[2]))
credible_set
1] <- gene
PVE_df[i, 2] <- qbeta(0.5, prior[1], prior[2])
PVE_df[i, 3] <- credible_set[1]
PVE_df[i, 4] <- credible_set[2]
PVE_df[i,
}else
= i+1
i
}
write_tsv(PVE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/PVE_estimates.txt", col_names = FALSE )
<- as.data.frame(matrix(NA, 0, 4))
PGE_df for(i in 1:length(files)){
<- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
gene <- read_tsv(files[i])
df
<- quantile(df$pge, 0.5)
q1 <- quantile(df$pge, 0.9)
q2 =list(p=.5 ,x=q1)
quantile1=list(p=.9, x=q2)
quantile2if(quantile1$x != quantile2$x) {
<- beta.select(quantile1, quantile2)
prior <- list(qbeta(0.025,prior[1], prior[2]), qbeta(0.975,prior[1], prior[2]))
credible_set
1] <- gene
PGE_df[i, 2] <- qbeta(0.5, prior[1], prior[2])
PGE_df[i, 3] <- credible_set[1]
PGE_df[i, 4] <- credible_set[2]
PGE_df[i,
}else
= i+1
i
}
write_tsv(PGE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/PGE_estimates.txt", col_names = FALSE )
Lh
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/output"
ge.dir <- list.files(path = ge.dir, pattern = ".hyp.txt", full.names = TRUE) files
<- as.data.frame(matrix(NA, 0, 4))
PVE_df
for(i in 1:length(files)){
<- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
gene <- read_tsv(files[i])
df
<- quantile(df$pve, 0.1)
q1 <- quantile(df$pve, 0.9)
q2 =list(p=.1 ,x=q1)
quantile1=list(p=.9, x=q2)
quantile2if(quantile1$x != quantile2$x) {
<- beta.select(quantile1, quantile2)
prior <- list(qbeta(0.025,prior[1], prior[2]), qbeta(0.975,prior[1], prior[2]))
credible_set
1] <- gene
PVE_df[i, 2] <- qbeta(0.5, prior[1], prior[2])
PVE_df[i, 3] <- credible_set[1]
PVE_df[i, 4] <- credible_set[2]
PVE_df[i,
}else
= i+1
i
}
write_tsv(PVE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/PVE_estimates.txt", col_names = FALSE )
<- as.data.frame(matrix(NA, 0, 4))
PGE_df for(i in 1:length(files)){
<- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
gene <- read_tsv(files[i])
df
<- quantile(df$pge, 0.5)
q1 <- quantile(df$pge, 0.9)
q2 =list(p=.5 ,x=q1)
quantile1=list(p=.9, x=q2)
quantile2if(quantile1$x != quantile2$x) {
<- beta.select(quantile1, quantile2)
prior <- list(qbeta(0.025,prior[1], prior[2]), qbeta(0.975,prior[1], prior[2]))
credible_set
1] <- gene
PGE_df[i, 2] <- qbeta(0.5, prior[1], prior[2])
PGE_df[i, 3] <- credible_set[1]
PGE_df[i, 4] <- credible_set[2]
PGE_df[i,
}else
= i+1
i
}
write_tsv(PGE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/PGE_estimates.txt", col_names = FALSE )
Pl
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/output"
ge.dir <- list.files(path = ge.dir, pattern = ".hyp.txt", full.names = TRUE) files
<- as.data.frame(matrix(NA, 0, 4))
PVE_df
for(i in 1:length(files)){
<- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
gene <- read_tsv(files[i])
df
<- quantile(df$pve, 0.1)
q1 <- quantile(df$pve, 0.9)
q2 =list(p=.1 ,x=q1)
quantile1=list(p=.9, x=q2)
quantile2if(quantile1$x != quantile2$x) {
<- beta.select(quantile1, quantile2)
prior <- list(qbeta(0.025,prior[1], prior[2]), qbeta(0.975,prior[1], prior[2]))
credible_set
1] <- gene
PVE_df[i, 2] <- qbeta(0.5, prior[1], prior[2])
PVE_df[i, 3] <- credible_set[1]
PVE_df[i, 4] <- credible_set[2]
PVE_df[i,
}else
= i+1
i
}
write_tsv(PVE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/PVE_estimates.txt", col_names = FALSE )
<- as.data.frame(matrix(NA, 0, 4))
PGE_df for(i in 1:length(files)){
<- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
gene <- read_tsv(files[i])
df
<- quantile(df$pge, 0.5)
q1 <- quantile(df$pge, 0.9)
q2 =list(p=.5 ,x=q1)
quantile1=list(p=.9, x=q2)
quantile2if(quantile1$x != quantile2$x) {
<- beta.select(quantile1, quantile2)
prior <- list(qbeta(0.025,prior[1], prior[2]), qbeta(0.975,prior[1], prior[2]))
credible_set
1] <- gene
PGE_df[i, 2] <- qbeta(0.5, prior[1], prior[2])
PGE_df[i, 3] <- credible_set[1]
PGE_df[i, 4] <- credible_set[2]
PGE_df[i,
}else
= i+1
i
}
write_tsv(PGE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/PGE_estimates.txt", col_names = FALSE )
Vo
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/output"
ge.dir <- list.files(path = ge.dir, pattern = ".hyp.txt", full.names = TRUE) files
<- as.data.frame(matrix(NA, 0, 4))
PVE_df
for(i in 1:length(files)){
<- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
gene <- read_tsv(files[i])
df
<- quantile(df$pve, 0.1)
q1 <- quantile(df$pve, 0.9)
q2 =list(p=.1 ,x=q1)
quantile1=list(p=.9, x=q2)
quantile2if(quantile1$x != quantile2$x) {
<- beta.select(quantile1, quantile2)
prior <- list(qbeta(0.025,prior[1], prior[2]), qbeta(0.975,prior[1], prior[2]))
credible_set
1] <- gene
PVE_df[i, 2] <- qbeta(0.5, prior[1], prior[2])
PVE_df[i, 3] <- credible_set[1]
PVE_df[i, 4] <- credible_set[2]
PVE_df[i,
}else
= i+1
i
}
write_tsv(PVE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/PVE_estimates.txt", col_names = FALSE )
<- as.data.frame(matrix(NA, 0, 4))
PGE_df for(i in 1:length(files)){
<- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
gene <- read_tsv(files[i])
df
<- quantile(df$pge, 0.5)
q1 <- quantile(df$pge, 0.9)
q2 =list(p=.5 ,x=q1)
quantile1=list(p=.9, x=q2)
quantile2if(quantile1$x != quantile2$x) {
<- beta.select(quantile1, quantile2)
prior <- list(qbeta(0.025,prior[1], prior[2]), qbeta(0.975,prior[1], prior[2]))
credible_set
1] <- gene
PGE_df[i, 2] <- qbeta(0.5, prior[1], prior[2])
PGE_df[i, 3] <- credible_set[1]
PGE_df[i, 4] <- credible_set[2]
PGE_df[i,
}else
= i+1
i
}
write_tsv(PGE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/PGE_estimates.txt", col_names = FALSE )
Now we have PVE and PGE estimates for each tissue and can generate the fitted curve figure for Heritability.
Generate Fitted Heritability Curves with R2
I’m showing the curve for Ac tissue but I generated the one for all other tissues as well.
theme_set(theme_bw(base_size = 15))
<- "/Users/natashasanthanam/Box/imlab-data/Projects/PTRS-PGRS-Rosetta/Rat-Genomics/Tyson-PalmerLab_PredictDB_Results/sql/"
Data <- "/Users/natashasanthanam/Github/rat-genomic-analysis/data/" poly_dir
<- function(df){
load_pve <- df[order(df$point_estimate),]
df $index <- 1:nrow(df)
dfreturn(df)
}
First we get the R2 for the tissue from the model we generated
<- Data %&% "Ac_output_db.db"
filename <- dbDriver("SQLite")
sqlite.driver <- dbConnect(RSQLite::SQLite(), filename)
conn
<- dbGetQuery(conn, 'select * from extra')
extra dbDisconnect(conn)
Next we create the ordered plot for heritability for Ac. We use PVE as h2, the proportion of variance in phenotypes explained by typed genotypes.
<- read_csv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/human_PVE.csv") %>% rename(point_estimate = pve)
human_h2 <- read_tsv(poly_dir %&% "Ac_PVE_estimates.txt", col_names = FALSE)
PVE_Ac
$ensid = sapply(strsplit(human_h2$ensid, "\\."), `[`, 1)
human_h2colnames(PVE_Ac) <- c("gene", "point_estimate", "credible_set_1", "credible_set_2")
<- data.frame(rat_genes = PVE_Ac$gene, ensid = orth.rats[match(PVE_Ac$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 1]$ensembl_gene_id)
overlap <- overlap[na.omit(match(human_h2$ensid, overlap$ensid)), ]
overlap
<- PVE_Ac %>% filter(gene %in% overlap$rat_genes) PVE_Ac
<- inner_join(PVE_Ac, extra, by = "gene")
PVE_Ac
<- load_pve(PVE_Ac)
A_df_Ac <- (ggplot(data = A_df_Ac, aes(x = index))
plt_1 + geom_point(aes(x=index, y=R2), colour = "dodgerblue2", size = 0.2)
+ geom_line(aes(y = point_estimate), lwd=1.5)
+ geom_hline(yintercept = 0, linetype=2)
+ geom_ribbon(aes(ymax = credible_set_2, ymin = credible_set_1),
alpha = 0.25)
+ labs(x = 'Genes Sorted by Proportion of Variance Explained (PVE)',
y = 'PVE')
+ ylim(c(0,1))
+ annotate("text", x = 1250, y = 0.9, label = "Mean h2 = 0.098", size = 6)
+ annotate("text", x = 1250, y = 0.8, label = "Mean R2 = 0.085", size = 6))
plt_1
Generate Analagous Plot in Humans
Get R2 and H2 from Heather’s Models for Brain
sqlite3 genarch.dbon
.headers mode csv
.
.output human_PVE.csvselect gene, ensid, en_r2 ,pve,pve_CI from results where tissue = "DGN-WB";
.quit
$credible_set_1 = as.numeric(ifelse(substr(human_h2$pve_CI ,1,1) == "-", paste("-", sapply(strsplit(human_h2$pve_CI, "-"), `[`, 2), sep = ""), sapply(strsplit(human_h2$pve_CI, "-"), `[`, 1)))
human_h2$credible_set_2 = as.numeric(ifelse(substr(human_h2$pve_CI ,1,1) == "-", sapply(strsplit(human_h2$pve_CI, "-"), `[`, 3), sapply(strsplit(human_h2$pve_CI, "-"), `[`, 2)))
human_h2
<- human_h2 %>% filter(ensid %in% overlap$ensid)
human_h2
<- load_pve(human_h2)
hum_df <- (ggplot(data = hum_df, aes(x = index))
plt_2 + geom_point(aes(x=index, y=en_r2), colour = "maroon1", size = 0.2)
+ geom_line(aes(y = point_estimate), lwd = 1.5)
+ geom_hline(yintercept = 0, linetype=2)
+ geom_ribbon(aes(ymax = credible_set_2, ymin = credible_set_1),
alpha = 0.25)
+ labs(x = 'Genes Sorted by Proportion of Variance Explained (PVE)',
y = 'PVE')
+ ylim(c(0,1))
+ annotate("text", x = 1690, y = 0.9, label = "Mean h2 = 0.124", size = 6)
+ annotate("text", x = 1710, y = 0.8, label = "Mean R2 = 0.114", size = 6))
plt_2
<- tibble(gene)
all_h2 for(l in all) {
<- l %>% select(c(gene, point_estimate))
l <- full_join(all_h2, l, by = "gene")
all_h2 }