01.Heritability_Sparsity

Author

natasha.santhanam

Published

February 7, 2022

Generate Heritability and Sparsity Estimates for all 5 tissues

Calculate Cis Heritability within 1MB

First we create bimbam formats for genotypes from the original genotype file. The bimbam format is the input for gemma, which we will use for both heritability and sparisty estiamtes.

library(tidyverse)
load("~/Box/imlab-data/Projects/PTRS-PGRS-Rosetta/Data-From-Abe-Palmer-Lab/Rdata/genoGex.RData")
"%&%" = function(a,b) paste(a,b,sep="")
wd <- "/Users/natashasanthanam/Github/rat-genomic-analysis/data/"

geno_Ac = geno[,match(rownames(gexAc_transpose), colnames(geno))]
geno_Il = geno[,match(rownames(gexIl_transpose), colnames(geno))]
geno_Lh = geno[,match(rownames(gexLh_transpose), colnames(geno))]
geno_Pl = geno[,match(rownames(gexPl_transpose), colnames(geno))]
geno_Vo = geno[,match(rownames(gexVo_transpose), colnames(geno))]

Ac_bimbam <- cbind(phyMap$chr, phyMap$pos, rownames(geno_Ac), phyMap$refAllele, phyMap$effectAllele, geno_Ac)
Il_bimbam <- cbind(phyMap$chr, phyMap$pos, rownames(geno_Il),phyMap$refAllele, phyMap$effectAllele,  geno_Il)
Lh_bimbam <- cbind(phyMap$chr, phyMap$pos, rownames(geno_Lh),phyMap$refAllele, phyMap$effectAllele,  geno_Lh)
Pl_bimbam <- cbind(phyMap$chr, phyMap$pos, rownames(geno_Pl),phyMap$refAllele, phyMap$effectAllele,  geno_Pl)
Vo_bimbam <- cbind(phyMap$chr, phyMap$pos, rownames(geno_Vo),phyMap$refAllele, phyMap$effectAllele,  geno_Vo)

write.table(Ac_bimbam, file = wd %&% "Ac_bimbam",quote=F,col.names=F,row.names=F)
write.table(Il_bimbam, file = wd %&% "Il_bimbam",quote=F,col.names=F,row.names=F)
write.table(Lh_bimbam, file = wd %&% "Lh_bimbam",quote=F,col.names=F,row.names=F)
write.table(Pl_bimbam, file = wd %&%"Pl_bimbam",quote=F,col.names=F,row.names=F)
write.table(Vo_bimbam, file = wd %&%"Vo_bimbam",quote=F,col.names=F,row.names=F)

Collect list of individuals from the expression files

gtf <- read_tsv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gtf.txt", col_names=TRUE)
gexAc_transpose <- read.table("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gexAc_transpose.txt")
gexIl_transpose <- read.table("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gexIl_transpose.txt")
gexLh_transpose <- read.table("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gexLh_transpose.txt")
gexPl_transpose <- read.table("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gexPl_transpose.txt")
gexVo_transpose <- read.table("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/gexVo_transpose.txt")

ensidlist <- colnames(gexAc_transpose)
ensidlist_Il <- colnames(gexIl_transpose)
ensidlist_Lh <- colnames(gexLh_transpose)
ensidlist_Pl <- colnames(gexPl_transpose)
ensidlist_Vo <- colnames(gexVo_transpose)

Ac

set directory

pheno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/phenotype_files/"
ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/genotype_files/"
bim.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/bim_bam/"
#Read in bimbam file 
bimbamfile <- bim.dir %&% "Ac_bimbam" ###get SNP position information###
bimbam <- read.table(bimbamfile)

Make local GRMs for each gene

setwd("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/")
for(i in 1:length(ensidlist)){
    cat(i,"/",length(ensidlist),"\n")
    gene <- ensidlist[i]
    geneinfo <- gtf[match(gene, gtf$Gene),]
    chr <- geneinfo[1]
    c <- chr$Chr
    start <- geneinfo$Start - 1e6 ### 1Mb lower bound for cis-eQTLS
    end <- geneinfo$End + 1e6 ### 1Mb upper bound for cis-eQTLs
    chrsnps <- subset(bimbam, bimbam[,1]==c) ### pull snps on same chr
    cissnps <- subset(chrsnps,chrsnps[,2]>=start & chrsnps[,2]<=end) ### pull cis-SNP info
    snplist <- cissnps[,3:ncol(cissnps)]
    write.table(snplist, file= ge.dir %&% "tmp.Ac.geno" %&% gene, quote=F,col.names=F,row.names=F)
    
    geneexp <- cbind(gexAc_transpose[,i])
    write.table(geneexp, file= pheno.dir %&% "tmp.pheno." %&% gene, col.names=F, row.names = F, quote=F) #output pheno for gemma
    runGEMMAgrm <- "gemma -g " %&%  ge.dir %&% "tmp.Ac.geno" %&% gene %&% " -p " %&% pheno.dir %&% "tmp.pheno." %&%  gene  %&%  " -gk -o grm_Ac_" %&% gene
    system(runGEMMAgrm)
}

Now we do the above process for the rest of tissues

Il

pheno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/phenotype_files/"
ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/genotype_files/"
bim.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/bim_bam/"
#Read in bimbam file 
bimbamfile <- bim.dir %&% "Il_bimbam" ###get SNP position information###
bimbam <- read.table(bimbamfile)

Make local GRMs for each gene

setwd("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/")
for(i in 1:length(ensidlist)){
    cat(i,"/",length(ensidlist),"\n")
    gene <- ensidlist[i]
    geneinfo <- gtf[match(gene, gtf$Gene),]
    chr <- geneinfo[1]
    c <- chr$Chr
    start <- geneinfo$Start - 1e6 ### 1Mb lower bound for cis-eQTLS
    end <- geneinfo$End + 1e6 ### 1Mb upper bound for cis-eQTLs
    chrsnps <- subset(bimbam, bimbam[,1]==c) ### pull snps on same chr
    cissnps <- subset(chrsnps,chrsnps[,2]>=start & chrsnps[,2]<=end) ### pull cis-SNP info
    snplist <- cissnps[,3:ncol(cissnps)]    
    write.table(snplist, file= ge.dir %&% "tmp.Il.geno" %&% gene, quote=F,col.names=F,row.names=F)
    
    geneexp <- cbind(gexIl_transpose[,i])
    write.table(geneexp, file= pheno.dir %&% "tmp.pheno." %&% gene, col.names=F, row.names = F, quote=F) #output pheno for gemma
    runGEMMAgrm <- "gemma -g " %&%  ge.dir %&% "tmp.Il.geno" %&% gene %&% " -p " %&% pheno.dir %&% "tmp.pheno." %&%  gene  %&%  " -gk -o grm_Il_" %&% gene
    system(runGEMMAgrm)
}

Lh

pheno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/phenotype_files/"
ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/genotype_files/"
bim.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/bim_bam/"
#Read in bimbam file 
bimbamfile <- bim.dir %&% "Lh_bimbam" ###get SNP position information###
bimbam <- read.table(bimbamfile)

Make local GRMs for each gene

setwd("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/")
for(i in 1:length(ensidlist)){
    cat(i,"/",length(ensidlist),"\n")
    gene <- ensidlist[i]
    geneinfo <- gtf[match(gene, gtf$Gene),]
    chr <- geneinfo[1]
    c <- chr$Chr
    start <- geneinfo$Start - 1e6 ### 1Mb lower bound for cis-eQTLS
    end <- geneinfo$End + 1e6 ### 1Mb upper bound for cis-eQTLs
    chrsnps <- subset(bimbam, bimbam[,1]==c) ### pull snps on same chr
    cissnps <- subset(chrsnps,chrsnps[,2]>=start & chrsnps[,2]<=end) ### pull cis-SNP info
    snplist <- cissnps[,3:ncol(cissnps)]    
    write.table(snplist, file= ge.dir %&% "tmp.Lh.geno" %&% gene, quote=F,col.names=F,row.names=F)
    
    geneexp <- cbind(gexLh_transpose[,i])
    write.table(geneexp, file= pheno.dir %&% "tmp.pheno." %&% gene, col.names=F, row.names = F, quote=F) #output pheno for gemma
    runGEMMAgrm <- "gemma -g " %&%  ge.dir %&% "tmp.Lh.geno" %&% gene %&% " -p " %&% pheno.dir %&% "tmp.pheno." %&%  gene  %&%  " -gk -o grm_Lh_" %&% gene
    system(runGEMMAgrm)
}

Pl

pheno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/phenotype_files/"
ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/genotype_files/"
bim.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/bim_bam/"
#Read in bimbam file 
bimbamfile <- bim.dir %&% "Pl_bimbam" ###get SNP position information###
bimbam <- read.table(bimbamfile)

Make local GRMs for each gene

setwd("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/")
for(i in 1:length(ensidlist)){
    cat(i,"/",length(ensidlist),"\n")
    gene <- ensidlist[i]
    geneinfo <- gtf[match(gene, gtf$Gene),]
    chr <- geneinfo[1]
    c <- chr$Chr
    start <- geneinfo$Start - 1e6 ### 1Mb lower bound for cis-eQTLS
    end <- geneinfo$End + 1e6 ### 1Mb upper bound for cis-eQTLs
    chrsnps <- subset(bimbam, bimbam[,1]==c) ### pull snps on same chr
    cissnps <- subset(chrsnps,chrsnps[,2]>=start & chrsnps[,2]<=end) ### pull cis-SNP info
    snplist <- cissnps[,3:ncol(cissnps)]    
    write.table(snplist, file= ge.dir %&% "tmp.Pl.geno" %&% gene, quote=F,col.names=F,row.names=F)
    
    geneexp <- cbind(gexPl_transpose[,i])
    write.table(geneexp, file= pheno.dir %&% "tmp.pheno." %&% gene, col.names=F, row.names = F, quote=F) #output pheno for gemma
    runGEMMAgrm <- "gemma -g " %&%  ge.dir %&% "tmp.Pl.geno" %&% gene %&% " -p " %&% pheno.dir %&% "tmp.pheno." %&%  gene  %&%  " -gk -o grm_Pl_" %&% gene
    system(runGEMMAgrm)
}

#Vo

pheno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/phenotype_files/"
ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/genotype_files/"
bim.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/bim_bam/"
#Read in bimbam file 
bimbamfile <- bim.dir %&% "Vo_bimbam" ###get SNP position information###
bimbam <- read.table(bimbamfile)

Make local GRMs for each gene

setwd("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/")
for(i in 1:length(ensidlist)){
    cat(i,"/",length(ensidlist),"\n")
    gene <- ensidlist[i]
    geneinfo <- gtf[match(gene, gtf$Gene),]
    chr <- geneinfo[1]
    c <- chr$Chr
    start <- geneinfo$Start - 1e6 ### 1Mb lower bound for cis-eQTLS
    end <- geneinfo$End + 1e6 ### 1Mb upper bound for cis-eQTLs
    chrsnps <- subset(bimbam, bimbam[,1]==c) ### pull snps on same chr
    cissnps <- subset(chrsnps,chrsnps[,2]>=start & chrsnps[,2]<=end) ### pull cis-SNP info
    snplist <- cissnps[,3:ncol(cissnps)]    
    write.table(snplist, file= ge.dir %&% "tmp.Vo.geno" %&% gene, quote=F,col.names=F,row.names=F)
    
    geneexp <- cbind(gexVo_transpose[,i])
    write.table(geneexp, file= pheno.dir %&% "tmp.pheno." %&% gene, col.names=F, row.names = F, quote=F) #output pheno for gemma
    runGEMMAgrm <- "gemma -g " %&%  ge.dir %&% "tmp.Vo.geno" %&% gene %&% " -p " %&% pheno.dir %&% "tmp.pheno." %&%  gene  %&%  " -gk -o grm_Vo_" %&% gene
    system(runGEMMAgrm)
}

Sparsity Analysis

All the code above generates the local GRM for each phenotype (gene). With the GRM we then run gemma again to calculate both PVE (heritability) and PGE (sparsity). I used a badger template to calculate h2 and sparsity for each tissue. This steps takes a lot of computing power, so we use Badger. It takes approximatley 2-3 days to run.

The code to run badger is here

source("./Rat_Genomics_Paper_Pipeline/analysis/Sparsity_Badger_Template.Rmd")

GEMMA then generates a .hyp file for each phenotype or in our case gene of interest. The hyp file contains the posterior samples for the hyper-parameters (h, PVE, rho, PGE, pi and gamma) for every 10th iteration. For our purposes, we are interested in the PGE and PVE parameters.

To then calculate the point estimate and credible set for Proportion of Variance Explained (PVE) and Proportion of genetic variance explained by the sparse effects terms (PGE), we calculate the posterior probability for each gene for each tissue.

I generated a function that calculates the beta of the posterior distribution and can be found here

Ac

Find point estimate and 95% credible interval in CRI for PVE

ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/output"
files <- list.files(path = ge.dir, pattern = ".hyp.txt", full.names = TRUE)
PVE_df <- as.data.frame(matrix(NA, 0, 4)) 

for(i in 1:length(files)){
  gene <- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
  df <- read_tsv(files[i])
  
  q1 <- quantile(df$pve, 0.1)
  q2 <- quantile(df$pve, 0.9)
  quantile1=list(p=.1 ,x=q1)
  quantile2=list(p=.9, x=q2)
  if(quantile1$x != quantile2$x) {
  prior <- beta.select(quantile1, quantile2)
  credible_set <- list(qbeta(0.025,prior[1],  prior[2]), qbeta(0.975,prior[1],  prior[2]))
  
  PVE_df[i, 1] <- gene
  PVE_df[i, 2] <- qbeta(0.5, prior[1],  prior[2])
  PVE_df[i, 3] <- credible_set[1]
  PVE_df[i, 4] <- credible_set[2]
  }
  else 
    i = i+1
}

write_tsv(PVE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/PVE_estimates.txt", col_names = FALSE )

PGE_df <- as.data.frame(matrix(NA, 0, 4))
for(i in 1:length(files)){
  gene <- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
  df <- read_tsv(files[i])
  
  q1 <- quantile(df$pge, 0.5)
  q2 <- quantile(df$pge, 0.9)
  quantile1=list(p=.5 ,x=q1)
  quantile2=list(p=.9, x=q2)
  if(quantile1$x != quantile2$x) {
  prior <- beta.select(quantile1, quantile2)
  credible_set <- list(qbeta(0.025,prior[1],  prior[2]), qbeta(0.975,prior[1],  prior[2]))
  
  PGE_df[i, 1] <- gene
  PGE_df[i, 2] <- qbeta(0.5, prior[1],  prior[2])
  PGE_df[i, 3] <- credible_set[1]
  PGE_df[i, 4] <- credible_set[2]
  }
  else 
    i = i+1
}

write_tsv(PGE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Ac/PGE_estimates.txt", col_names = FALSE )

Il

ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/output"
files <- list.files(path = ge.dir, pattern = ".hyp.txt", full.names = TRUE)
PVE_df <- as.data.frame(matrix(NA, 0, 4)) 

for(i in 1:length(files)){
  gene <- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
  df <- read_tsv(files[i])
  
  q1 <- quantile(df$pve, 0.1)
  q2 <- quantile(df$pve, 0.9)
  quantile1=list(p=.1 ,x=q1)
  quantile2=list(p=.9, x=q2)
  if(quantile1$x != quantile2$x) {
  prior <- beta.select(quantile1, quantile2)
  credible_set <- list(qbeta(0.025,prior[1],  prior[2]), qbeta(0.975,prior[1],  prior[2]))
  
  PVE_df[i, 1] <- gene
  PVE_df[i, 2] <- qbeta(0.5, prior[1],  prior[2])
  PVE_df[i, 3] <- credible_set[1]
  PVE_df[i, 4] <- credible_set[2]
  }
  else 
    i = i+1
}

write_tsv(PVE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/PVE_estimates.txt", col_names = FALSE )

PGE_df <- as.data.frame(matrix(NA, 0, 4))
for(i in 1:length(files)){
  gene <- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
  df <- read_tsv(files[i])
  
  q1 <- quantile(df$pge, 0.5)
  q2 <- quantile(df$pge, 0.9)
  quantile1=list(p=.5 ,x=q1)
  quantile2=list(p=.9, x=q2)
  if(quantile1$x != quantile2$x) {
  prior <- beta.select(quantile1, quantile2)
  credible_set <- list(qbeta(0.025,prior[1],  prior[2]), qbeta(0.975,prior[1],  prior[2]))
  
  PGE_df[i, 1] <- gene
  PGE_df[i, 2] <- qbeta(0.5, prior[1],  prior[2])
  PGE_df[i, 3] <- credible_set[1]
  PGE_df[i, 4] <- credible_set[2]
  }
  else 
    i = i+1
}

write_tsv(PGE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Il/PGE_estimates.txt", col_names = FALSE )

Lh

ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/output"
files <- list.files(path = ge.dir, pattern = ".hyp.txt", full.names = TRUE)
PVE_df <- as.data.frame(matrix(NA, 0, 4)) 

for(i in 1:length(files)){
  gene <- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
  df <- read_tsv(files[i])
  
  q1 <- quantile(df$pve, 0.1)
  q2 <- quantile(df$pve, 0.9)
  quantile1=list(p=.1 ,x=q1)
  quantile2=list(p=.9, x=q2)
  if(quantile1$x != quantile2$x) {
  prior <- beta.select(quantile1, quantile2)
  credible_set <- list(qbeta(0.025,prior[1],  prior[2]), qbeta(0.975,prior[1],  prior[2]))
  
  PVE_df[i, 1] <- gene
  PVE_df[i, 2] <- qbeta(0.5, prior[1],  prior[2])
  PVE_df[i, 3] <- credible_set[1]
  PVE_df[i, 4] <- credible_set[2]
  }
  else 
    i = i+1
}

write_tsv(PVE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/PVE_estimates.txt", col_names = FALSE )

PGE_df <- as.data.frame(matrix(NA, 0, 4))
for(i in 1:length(files)){
  gene <- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
  df <- read_tsv(files[i])
  
  q1 <- quantile(df$pge, 0.5)
  q2 <- quantile(df$pge, 0.9)
  quantile1=list(p=.5 ,x=q1)
  quantile2=list(p=.9, x=q2)
  if(quantile1$x != quantile2$x) {
  prior <- beta.select(quantile1, quantile2)
  credible_set <- list(qbeta(0.025,prior[1],  prior[2]), qbeta(0.975,prior[1],  prior[2]))
  
  PGE_df[i, 1] <- gene
  PGE_df[i, 2] <- qbeta(0.5, prior[1],  prior[2])
  PGE_df[i, 3] <- credible_set[1]
  PGE_df[i, 4] <- credible_set[2]
  }
  else 
    i = i+1
}

write_tsv(PGE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Lh/PGE_estimates.txt", col_names = FALSE )

Pl

ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/output"
files <- list.files(path = ge.dir, pattern = ".hyp.txt", full.names = TRUE)
PVE_df <- as.data.frame(matrix(NA, 0, 4)) 

for(i in 1:length(files)){
  gene <- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
  df <- read_tsv(files[i])
  
  q1 <- quantile(df$pve, 0.1)
  q2 <- quantile(df$pve, 0.9)
  quantile1=list(p=.1 ,x=q1)
  quantile2=list(p=.9, x=q2)
  if(quantile1$x != quantile2$x) {
  prior <- beta.select(quantile1, quantile2)
  credible_set <- list(qbeta(0.025,prior[1],  prior[2]), qbeta(0.975,prior[1],  prior[2]))
  
  PVE_df[i, 1] <- gene
  PVE_df[i, 2] <- qbeta(0.5, prior[1],  prior[2])
  PVE_df[i, 3] <- credible_set[1]
  PVE_df[i, 4] <- credible_set[2]
  }
  else 
    i = i+1
}

write_tsv(PVE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/PVE_estimates.txt", col_names = FALSE )

PGE_df <- as.data.frame(matrix(NA, 0, 4))
for(i in 1:length(files)){
  gene <- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
  df <- read_tsv(files[i])
  
  q1 <- quantile(df$pge, 0.5)
  q2 <- quantile(df$pge, 0.9)
  quantile1=list(p=.5 ,x=q1)
  quantile2=list(p=.9, x=q2)
  if(quantile1$x != quantile2$x) {
  prior <- beta.select(quantile1, quantile2)
  credible_set <- list(qbeta(0.025,prior[1],  prior[2]), qbeta(0.975,prior[1],  prior[2]))
  
  PGE_df[i, 1] <- gene
  PGE_df[i, 2] <- qbeta(0.5, prior[1],  prior[2])
  PGE_df[i, 3] <- credible_set[1]
  PGE_df[i, 4] <- credible_set[2]
  }
  else 
    i = i+1
}

write_tsv(PGE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Pl/PGE_estimates.txt", col_names = FALSE )

Vo

ge.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/output"
files <- list.files(path = ge.dir, pattern = ".hyp.txt", full.names = TRUE)
PVE_df <- as.data.frame(matrix(NA, 0, 4)) 

for(i in 1:length(files)){
  gene <- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
  df <- read_tsv(files[i])
  
  q1 <- quantile(df$pve, 0.1)
  q2 <- quantile(df$pve, 0.9)
  quantile1=list(p=.1 ,x=q1)
  quantile2=list(p=.9, x=q2)
  if(quantile1$x != quantile2$x) {
  prior <- beta.select(quantile1, quantile2)
  credible_set <- list(qbeta(0.025,prior[1],  prior[2]), qbeta(0.975,prior[1],  prior[2]))
  
  PVE_df[i, 1] <- gene
  PVE_df[i, 2] <- qbeta(0.5, prior[1],  prior[2])
  PVE_df[i, 3] <- credible_set[1]
  PVE_df[i, 4] <- credible_set[2]
  }
  else 
    i = i+1
}

write_tsv(PVE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/PVE_estimates.txt", col_names = FALSE )

PGE_df <- as.data.frame(matrix(NA, 0, 4))
for(i in 1:length(files)){
  gene <- substr(sapply(strsplit(files[i],"/"), `[`, 11), 8, 25)
  df <- read_tsv(files[i])
  
  q1 <- quantile(df$pge, 0.5)
  q2 <- quantile(df$pge, 0.9)
  quantile1=list(p=.5 ,x=q1)
  quantile2=list(p=.9, x=q2)
  if(quantile1$x != quantile2$x) {
  prior <- beta.select(quantile1, quantile2)
  credible_set <- list(qbeta(0.025,prior[1],  prior[2]), qbeta(0.975,prior[1],  prior[2]))
  
  PGE_df[i, 1] <- gene
  PGE_df[i, 2] <- qbeta(0.5, prior[1],  prior[2])
  PGE_df[i, 3] <- credible_set[1]
  PGE_df[i, 4] <- credible_set[2]
  }
  else 
    i = i+1
}

write_tsv(PGE_df, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/GEMMA/Vo/PGE_estimates.txt", col_names = FALSE )

Now we have PVE and PGE estimates for each tissue and can generate the fitted curve figure for Heritability.

Generate Fitted Heritability Curves with R2

I’m showing the curve for Ac tissue but I generated the one for all other tissues as well.

theme_set(theme_bw(base_size = 15))
Data <- "/Users/natashasanthanam/Box/imlab-data/Projects/PTRS-PGRS-Rosetta/Rat-Genomics/Tyson-PalmerLab_PredictDB_Results/sql/"
poly_dir <- "/Users/natashasanthanam/Github/rat-genomic-analysis/data/"
load_pve <- function(df){
  df <- df[order(df$point_estimate),]
  df$index <- 1:nrow(df)
  return(df)
}

First we get the R2 for the tissue from the model we generated

filename <- Data %&% "Ac_output_db.db"
sqlite.driver <- dbDriver("SQLite")
conn <- dbConnect(RSQLite::SQLite(), filename)

extra <- dbGetQuery(conn, 'select * from extra')
dbDisconnect(conn)

Next we create the ordered plot for heritability for Ac. We use PVE as h2, the proportion of variance in phenotypes explained by typed genotypes.

human_h2 <- read_csv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/human_PVE.csv") %>% rename(point_estimate = pve) 
PVE_Ac <- read_tsv(poly_dir %&% "Ac_PVE_estimates.txt", col_names = FALSE)

human_h2$ensid = sapply(strsplit(human_h2$ensid, "\\."), `[`, 1)
colnames(PVE_Ac) <- c("gene", "point_estimate", "credible_set_1", "credible_set_2")

overlap <- data.frame(rat_genes = PVE_Ac$gene, ensid = orth.rats[match(PVE_Ac$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 1]$ensembl_gene_id) 
overlap <- overlap[na.omit(match(human_h2$ensid, overlap$ensid)), ]

PVE_Ac <- PVE_Ac %>% filter(gene %in% overlap$rat_genes)
PVE_Ac <- inner_join(PVE_Ac, extra, by = "gene")

A_df_Ac <- load_pve(PVE_Ac)
plt_1 <- (ggplot(data = A_df_Ac, aes(x = index))
          + geom_point(aes(x=index, y=R2), colour = "dodgerblue2", size = 0.2)
          + geom_line(aes(y = point_estimate), lwd=1.5)
          + geom_hline(yintercept = 0, linetype=2)
          + geom_ribbon(aes(ymax = credible_set_2, ymin = credible_set_1),
                         alpha = 0.25)
          + labs(x = 'Genes Sorted by Proportion of Variance Explained (PVE)',
                 y = 'PVE')
           + ylim(c(0,1))
           + annotate("text", x = 1250, y = 0.9, label = "Mean h2 =  0.098", size = 6)
           + annotate("text", x = 1250, y = 0.8, label = "Mean R2 =  0.085", size = 6)) 
plt_1

Generate Analagous Plot in Humans

Get R2 and H2 from Heather’s Models for Brain

sqlite3 genarch.db
.headers on
.mode csv
.output human_PVE.csv
select gene, ensid, en_r2 ,pve,pve_CI from results where tissue = "DGN-WB";
.quit
human_h2$credible_set_1 = as.numeric(ifelse(substr(human_h2$pve_CI ,1,1) == "-", paste("-", sapply(strsplit(human_h2$pve_CI, "-"), `[`, 2), sep = ""), sapply(strsplit(human_h2$pve_CI, "-"), `[`, 1)))
human_h2$credible_set_2 = as.numeric(ifelse(substr(human_h2$pve_CI ,1,1) == "-", sapply(strsplit(human_h2$pve_CI, "-"), `[`, 3), sapply(strsplit(human_h2$pve_CI, "-"), `[`, 2)))

human_h2 <- human_h2 %>% filter(ensid %in% overlap$ensid)

hum_df <- load_pve(human_h2)
plt_2  <- (ggplot(data = hum_df, aes(x = index))
          + geom_point(aes(x=index, y=en_r2), colour = "maroon1", size = 0.2)
          + geom_line(aes(y = point_estimate), lwd = 1.5)
          + geom_hline(yintercept = 0, linetype=2)
          + geom_ribbon(aes(ymax = credible_set_2, ymin = credible_set_1),
                         alpha = 0.25)
          + labs(x = 'Genes Sorted by Proportion of Variance Explained (PVE)',
                 y = 'PVE')
          + ylim(c(0,1))
           + annotate("text", x = 1690, y = 0.9, label = "Mean h2 =  0.124", size = 6)
           + annotate("text", x = 1710, y = 0.8, label = "Mean R2 =  0.114", size = 6))
plt_2 
all_h2 <- tibble(gene)
for(l in all) {
  l <- l %>% select(c(gene, point_estimate))
  all_h2 <- full_join(all_h2, l, by = "gene")
}
No matching items