<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/models//Ac_output_db.db"
fila <- dbDriver("SQLite")
sqlite.driver <- dbConnect(RSQLite::SQLite(), fila)
conn <- dbGetQuery(conn, 'select * from extra')
tempo
<- tempo[order(-tempo$R2),] tempo
Rat_Predictability_comp_Humans
Script to compare Heritability and Predictability Between Rats and Humans
#Figure comparing pred R2 vs obs R2 in Ac tissue in Rats
First find well expressed genes in Rats
Find Predicted Expression with separate Rat Genotypes
conda activate imlabtools
export METAXCAN=/gpfs/data/im-lab/nas40t2/natasha/GTEX_Analysis/MetaXcan/software
export GENO=/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files
export MODEL=/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/models
python3 $METAXCAN/Predict.py \
$MODEL/Ac_output_db.db \
--model_db_path \
--model_db_snp_key varID $GENO/BLA_NAcc2_PL2.vcf.gz \
--vcf_genotypes \
--vcf_mode genotyped "{}_{}_{}_{}" \
--on_the_fly_mapping METADATA \
--prediction_output Ac_NAcc2__predict.txt \
--prediction_summary_output Ac_NAcc2__summary.txt \
--verbosity 9 --throw
Mgmt
<- fread("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/Ac_NAcc2__predict.txt")
pred_expr <- fread("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/NAcc2.expr.iqn.bed.gz")
obs_expr
<- obs_expr %>% select(-c(`#chr`, start, end)) %>% pivot_longer(!gene_id, names_to = "FID", values_to = "count") %>% pivot_wider(names_from = gene_id, values_from = count)
obs_expr
<- as.data.frame(pred_expr[, c("FID", "ENSRNOG00000016038")])
Mgmt_pred <- as.data.frame(obs_expr[, c("FID", "ENSRNOG00000016038")])
Mgmt_obs $FID = as.character(Mgmt_pred$FID)
Mgmt_pred
<- Mgmt_pred[match(Mgmt_obs$FID, Mgmt_pred$FID),]
Mgmt_pred <- Mgmt_obs[match(Mgmt_pred$FID, Mgmt_obs$FID ), ]
Mgmt_obs
$FID = as.character(Mgmt_pred$FID)
Mgmt_pred<- inner_join(Mgmt_pred, Mgmt_obs, by = "FID") Mgmt
Polr3k
<- as.data.frame(pred_expr[, c("FID", "ENSRNOG00000017843")])
Polr3k_pred <- as.data.frame(obs_expr[, c("FID", "ENSRNOG00000017843")])
Polr3k_obs $FID = as.character(Polr3k_pred$FID)
Polr3k_pred
<- Polr3k_pred[match(Polr3k_obs$FID, Polr3k_pred$FID),]
Polr3k_pred <- Polr3k_obs[match(Polr3k_pred$FID, Polr3k_obs$FID ), ]
Polr3k_obs
$FID = as.character(Polr3k_pred$FID)
Polr3k_pred<- inner_join(Polr3k_pred, Polr3k_obs, by = "FID") Polr3k
Dhfr
<- as.data.frame(pred_expr[, c("FID", "ENSRNOG00000013521")])
Dhfr_pred <- as.data.frame(obs_expr[, c("FID", "ENSRNOG00000013521")])
Dhfr_obs $FID = as.character(Dhfr_pred$FID)
Dhfr_pred
<- Dhfr_pred[match(Dhfr_obs$FID, Dhfr_pred$FID),]
Dhfr_pred <- Dhfr_obs[match(Dhfr_pred$FID, Dhfr_obs$FID ), ]
Dhfr_obs
$FID = as.character(Dhfr_pred$FID)
Dhfr_pred<- inner_join(Dhfr_pred, Dhfr_obs, by = "FID") Dhfr
Plot pred vs obs expression with our models in NcAcc2 tissue
<- read_tsv("/Users/natashasanthanam/Downloads/model_perf_Mgmt.txt")
Mgmt <- read_tsv("/Users/natashasanthanam/Downloads/model_perf_Polr3k.txt")
Polr3k <- read_tsv("/Users/natashasanthanam/Downloads/model_perf_Dhfr.txt")
Dhfr
<- ggplot(Mgmt, aes(pred, obs)) + geom_point() + annotate("text", x = 0.4, y = 2.5, label = "Mgmt", size = 10, fontface ="bold.italic") + annotate("text", x = 0.53, y = 2, label = "R2 = 0.724", size = 8) + theme(aspect.ratio=1) + xlab("Predicted expression") + ylab("Observed Expression")
g1
<- ggplot(Polr3k, aes(pred, obs)) + geom_smooth(size = 1) + geom_point() + ggtitle("Polr3k") + annotate("text", x = -1.8, y = 2, label = "r2 = 0.796", size = 3) + theme(aspect.ratio=1)
g2
<- ggplot(Dhfr, aes(pred, obs)) + geom_point() + annotate("text", x = -0.4, y = 2.5, label = "Dhfr", size = 10, fontface =2) + annotate("text", x = -0.25, y = 2, label = "r2 = 0.554", size = 8) + theme(aspect.ratio=1) + xlab("Predicted expression") + ylab("Observed Expression") + coord_cartesian(xlim = c(NA, 1.25)) g3
Calculate Predicted vs Observed Expression for Mgmt in Humans using GTEx NA predictDB with Geuvadis genotypes (hg38) as well
conda activate imlabtools
export METAXCAN=/gpfs/data/im-lab/nas40t2/natasha/GTEX_Analysis/MetaXcan/software
export GENO=/gpfs/data/im-lab/nas40t2/Data/1000G/vcf_hg38/geuvadis
export MODEL=/gpfs/data/im-lab/nas40t2/Data/PredictDB/GTEx_v8/models_v1/eqtl/elastic_net_models
export DATA=/gpfs/data/im-lab/nas40t2/natasha/GTEX_Analysis/predixcan
python3 $METAXCAN/Predict.py \
$MODEL/en_Brain_Nucleus_accumbens_basal_ganglia.db \
--model_db_path \
--model_db_snp_key varID $GENO/ALL.chr*.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz \
--vcf_genotypes \
--vcf_mode genotyped "{}_{}_{}_{}_b38" \
--on_the_fly_mapping METADATA \
--prediction_output 1000G_NAcc2__predict.txt \
--prediction_summary_output 1000G_NAcc2__summary.txt \
--verbosity 9 --throw
RPS26 r2= 0.7352744
<- fread("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/1000G_NAcc2__predict.txt")
pred_expr <- fread("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz")
obs_expr
<- obs_expr %>% select(-c(Chr, Gene_Symbol, Coord)) %>% pivot_longer(!TargetID, names_to = "FID", values_to = "count") %>% pivot_wider(names_from = TargetID, values_from = count)
obs_expr colnames(obs_expr)[2:ncol(obs_expr)] = sapply(strsplit(colnames(obs_expr)[2:ncol(obs_expr)], "\\."), `[`, 1)
<- as.data.frame(pred_expr[, c("FID", "ENSG00000197728.9")])
RPS26_pred <- as.data.frame(obs_expr[, c("FID", "ENSG00000197728")])
RPS26_obs
<- RPS26_pred[match(RPS26_obs$FID, RPS26_pred$FID),]
RPS26_pred <- RPS26_obs[match(RPS26_pred$FID, RPS26_obs$FID ), ]
RPS26_obs
<- inner_join(RPS26_pred, RPS26_obs, by = "FID")
RPS26 colnames(RPS26) = c("FID", "pred", "obs")
<- read_tsv("/Users/natashasanthanam/Downloads/GTEx_RPS26.txt", col_names = TRUE)
RPS26_GTEx <- read_tsv("/Users/natashasanthanam/Downloads/GTEx_DHFR_results.txt", col_names = TRUE)
DHFR_GTEx
<- ggplot(DHFR_GTEx, aes(pred, obs)) + geom_point() + annotate("text", x = -0.45, y = 55, label = "DHFR", size = 10, fontface =2) + annotate("text", x = -0.4, y = 51, label = "r2 = 0.506", size = 8) + theme(aspect.ratio=1) + xlab("Predicted expression") + ylab("Observed Expression")
g4
<- ggplot(RPS26_GTEx, aes(pred, obs)) + geom_point() + annotate("text", x = -0.5, y = 385, label = "RPS26", size = 10, fontface ="bold.italic") + annotate("text", x = -0.45, y = 355, label = "R2 = 0.735", size = 8) + theme(aspect.ratio=1) + xlab("Predicted expression") + ylab("Observed Expression") g5
Correlation of predicted R2 between tissues in GTEx
We use Nucleus Accumbens, Hippocampus, Cerebellum, Cortex and Anterior Cingulate Cortex in GTEx
<- list( "en_Brain_Nucleus_accumbens_basal_ganglia.db", "en_Brain_Hippocampus.db", "en_Brain_Cerebellum.db", "en_Brain_Cortex.db", "en_Brain_Anterior_cingulate_cortex_BA24.db")
filelist <- "/gpfs/data/im-lab/nas40t2/Data/PredictDB/GTEx_v8/models_v1/eqtl/elastic_net_models/"
dir
<- data.frame(gene = as.character())
pred_tis for(fila in filelist) {
<- dir %&% fila
filename <- dbDriver("SQLite")
sqlite.driver <- dbConnect(RSQLite::SQLite(), filename)
conn <- substr(fila, 4, str_length(fila)-3)
tis <- dbGetQuery(conn, 'select * from extra') %>% select(c(gene, test_R2_avg))
extra colnames(extra)[2] = tis
<- full_join(pred_tis, extra, by = "gene")
pred_tis dbDisconnect(conn)
}
rownames(pred_tis) <- pred_tis$gene
<- pred_tis %>% select(-c(gene))
pred_tis saveRDS(pred_tis, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/pred_R2_betw_GTEx_brain_tissues.RDS")
<- readRDS("/Users/natashasanthanam/Downloads/pred_R2_betw_GTEx_brain_tissues.RDS")
pred_tis pairs(pred_tis)
Compare Heritability and Predictability in Humans and Rats
Get R2 and H2 from Heather’s Models for Brain
sqlite3 genarch.dbon
.headers mode csv
.
.output human_h2.csvselect gene, ensid, en_r2 ,h2,h2_ci from results where tissue = "DGN-WB";
.quit
<- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/Ac_PVE_estimates.txt", col_names = FALSE) %>% dplyr::rename(rat_herit = X2)
rat_h2
<- read_csv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/human_h2.csv")
human_h2 $ensid <- sapply(strsplit(human_h2$ensid, "\\."), `[`, 1)
human_h2summary(rat_h2$rat_herit)
summary(human_h2$h2)
Change gene names in rats
<- read_tsv("/Users/natashasanthanam/Downloads/ortholog_genes_rats_humans.tsv")
orth.rats <- rat_h2 %>% mutate(ensid = orth.rats[match(rat_h2$X1, orth.rats$rnorvegicus_homolog_ensembl_gene), 1]$ensembl_gene_id)
rat_h2
<- inner_join(rat_h2, human_h2, by = "ensid")
all_h2
cor.test(all_h2$rat_herit, all_h2$h2)
Compare Predictability (R2) between Humans and Rats
<- "/Users/natashasanthanam/Box/imlab-data/data-Github/rat-genomic-analysis/sql/Ac_output_db.db"
fila <- dbDriver("SQLite")
sqlite.driver <- dbConnect(RSQLite::SQLite(), fila)
conn <- dbGetQuery(conn, 'select * from extra')
rat_r2 dbDisconnect(conn)
<- rat_r2 %>% mutate(ensid = orth.rats[match(rat_r2$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 1]$ensembl_gene_id)
rat_r2 <- inner_join(rat_r2, human_h2, by = "ensid")
all_r2 cor.test(all_r2$R2, all_r2$en_r2)
Plot the Comparison between the two
<- rat_h2 %>% dplyr::rename(gene.x = X1) %>% mutate(rat_ci = paste(round(Ac_h2$X3, digits = 5), round(Ac_h2$X4, digits = 5), sep = "-")) %>% dplyr::select(c(gene.x, rat_herit, rat_ci))
Ac_h2
<- inner_join(all_r2, Ac_h2, by = "gene.x")
lg_df
<- lg_df %>% dplyr::select(c(ensid, gene.x, genename, R2, en_r2,rat_herit,h2, rat_ci, h2_ci))
lg_df colnames(lg_df) = c("ensembl_id", "rat_ensembl_id", "genename", "rat_r2", "human_r2", "rat_h2", "human_h2", "rat_h2_ci", "human_h2_ci")
# Plot h2 between rats and Humans with line for null
%>% mutate(human_shuffled_h2 = sample(human_h2, nrow(lg_df), replace=F)) %>% ggplot(aes(rat_h2, human_h2)) + geom_smooth(col = "darkgreen") + geom_smooth(aes(rat_h2, human_shuffled_h2),col="dark gray")+theme_bw()+ xlab("Rat Heritability") + ylab("Human Heritability")
lg_df
# Plot R2 between rats and Humans with line for null
%>% mutate(human_shuffled_r2 = sample(human_r2, nrow(lg_df), replace=F)) %>% ggplot(aes(rat_r2, human_r2)) + geom_smooth(col = "midnightblue") + xlab("Rat Predictability") + ylab("Human Predictability") + annotate("text", x = 0.1, y = 0.2, label = "R = 0.061", size = 8) lg_df
Compare Predictability Between Tissues
<- "/Users/natashasanthanam/Box/imlab-data/data-Github/rat-genomic-analysis/sql/"
box.dir <- c("Il_output_db.db", "Lh_output_db.db", "Ac_output_db.db", "Vo_output_db.db", "Pl_output_db.db")
filelist <- c("Infralimbic Cortex", "Lateral Habenula", "Nucleus accumbens", "Orbitofrontal Cortex", "Prelimibic Cortex")
tis.list
<- data.frame(gene = as.character())
pred_tis for(fila in filelist) {
<- box.dir %&% fila
filename <- dbDriver("SQLite")
sqlite.driver <- dbConnect(RSQLite::SQLite(), filename)
conn <- match(fila, filelist)
i <- tis.list[i]
tis <- dbGetQuery(conn, 'select * from extra') %>% select(c(gene, R2))
extra colnames(extra)[2] = tis
<- full_join(pred_tis, extra, by = "gene")
pred_tis dbDisconnect(conn)
}rownames(pred_tis) <- pred_tis$gene
<- pred_tis %>% select(-c(gene))
pred_tis
<- cor(pred_tis %>% na.omit)
d1 lower.tri(d1)] <- NA
d1[<- d1 %>%
d1 %>%
as.data.frame rownames_to_column(var = 'var1') %>%
gather(var2, value, -var1)
<- d1 %>% dplyr::rename(r = value) %>% na.omit()
d1
ggplot(data = d1, aes(x = var1, y = var2, fill = r)) +
geom_tile(color = "white")+
scale_fill_gradient(low="dodgerblue", high="dodgerblue4") +
theme_minimal() +
theme(panel.grid = element_blank()) + ylab("") + xlab("") + theme(axis.text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1)) + theme(aspect.ratio=1)
Calculate R2 across GTEx Tissues
<- "/gpfs/data/im-lab/nas40t2/Data/PredictDB/GTEx_v8/models_v1/eqtl/elastic_net_models/"
model.dir
<- c("en_Brain_Amygdala.db", "en_Brain_Cerebellum.db", "en_Brain_Nucleus_accumbens_basal_ganglia.db", "en_Brain_Spinal_cord_cervical_c-1.db", "en_Brain_Substantia_nigra.db") filelist
#pred_tis <- read_tsv(dir %&% "pred_R2_betw_GTEx_brain_tissues.txt", col_names = TRUE)
rownames(pred_tis) <- pred_tis$gene
<- pred_tis %>% select(-c(gene))
pred_tis colnames(pred_tis) = c("Amygdala", "Cerebellum", "Nucleus accumbens", "Spinal cord", "Substantia nigra")
<- cor(pred_tis %>% na.omit)
d2 lower.tri(d2)] <- NA
d2[<- d2 %>%
d2 %>%
as.data.frame rownames_to_column(var = 'var1') %>%
gather(var2, value, -var1)
<- d2 %>% dplyr::rename(r = value) %>% na.omit()
d2
ggplot(data = d2, aes(x = var1, y = var2, fill = r)) +
geom_tile(color = "white")+
scale_fill_gradient(low="maroon1", high="maroon4") +
theme_minimal() +
theme(panel.grid = element_blank()) + ylab("") + xlab("") + theme(axis.text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1)) + theme(aspect.ratio=1)
Check Number of SNPs used in Prediction in Rats vs Humans
summary(rat_r2$n.snps)
#human NAcc models
<- "/Users/natashasanthanam/Downloads/en_Brain_Nucleus_accumbens_basal_ganglia.db"
filename <- dbDriver("SQLite")
sqlite.driver <- dbConnect(RSQLite::SQLite(), filename)
conn <- dbGetQuery(conn, 'select * from extra')
human_r2
summary(human_r2$n.snps.in.model)
No matching items