library(tidyverse)
library(data.table)
library(RSQLite)
library(glmnet)
"%&%" = function(a,b) paste(a,b,sep="")
::source_gist("ee5f67abddd0b761ee24410ea71c41aa")
devtools::source_gist("38431b74c6c0bf90c12f")
devtools::source_gist("1e9053c8f35c30396429350a08f33ea7") devtools
Human_PTRS_Performance_Comparison
Generate Predicted Traits for Rats
Setup
Yanyu’s PTRS weights estimate the effect of genes on a given trait, in this case we pick height and BMI.
<- c("height", "bmi")
traits # folder with PrediXcan results
<- "/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/Results/PrediXcan/metabolic_traits/"
results.dir # folder with PTRS weights, predicted traits will output here
<- "/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/data/" data.dir
Data Wrangling
The orth.rats
file contains gives a dictionary between human genes and the corresponding gene in rats.
<- read_tsv(data.dir %&% "expression/ortholog_genes_rats_humans.tsv", col_names = TRUE) orth.rats
We first replace rat genes in the predicted expression results with their corresponding human genes, so that it could be compatible with PTRS weights.
<- read_tsv(results.dir %&% "rat_metabolic_Ac_best__predict.txt") %>% select(-c(FID))
pred_expr
#filter only for genes that have a human ortholog
<- pred_expr %>% select(c(IID, intersect(colnames(pred_expr), orth.rats$rnorvegicus_homolog_ensembl_gene) ))
pred_expr
#change name to human ensembl id in humans
colnames(pred_expr)[2:ncol(pred_expr)] <- orth.rats[match(colnames(pred_expr)[2:ncol(pred_expr)], orth.rats$rnorvegicus_homolog_ensembl_gene), 1] %>% .[["ensembl_gene_id"]]
Then we reformat the PTRS weight files, removing the Ensembl version from gene names.
= function(trait)
fn_weights
{<- read_tsv(data.dir %&% "PTRS_weights/weight_files/elastic_net_alpha_0.1_British.export_model/weights." %&% trait %&% ".tsv.gz")
weights $gene_id <- sapply(strsplit(weights$gene_id, "\\."), `[`, 1)
weightsrownames(weights) <- weights$gene_id
<- weights %>% rename(gene_name = gene_id)
weights return(weights)
}
We converted the predicted expression for rat genes to corresponding human gene names, matching the PTRS gene names. This lets us combine PTRS weights, trained from human transcriptomic data, with predicted transciptome of the rats using the fn_generate_trait function below. The output is the predicted height for individual rats.
In some ways, we can interpret generate_trait as the opposite of PrediXcan. Both start from the predicted transcriptome of a group of individuals, PrediXcan works backwards from values of their observed trait to compute association between genes and the trait, whereas fn_generate_trait assumes those associations to predict the trait for each individual. PTRS is particularly insightful in this case, because of its portability across different population groups. We hope this extends across species, motivating our final goal of comparing the performance of PTRS in humans and rats.
for(trait in traits) {
<- fn_weights(trait)
weights <- fn_generate_trait(pred_expr, weights)
pred_trait saveRDS(pred_trait, data.dir %&% "rat_pred_" %&% trait %&% "_w_Human_best_PTRS.RDS")
}
Evaluating Rat PTRS Performance
<- read_tsv(data.dir %&% "PTRS_weights/weight_files/elastic_net_alpha_0.1_British.export_model/weights.height.tsv.gz") %>% rename(gene_name = gene_id) weights
Human PTRS Results
The Personal Genome Project is a public resource of individual data from informed volunteers. PGP genetic data is publicly available, and we have processed trait information in an sqlite database. In Yanyu Liang’s development of PTRS, she generated PTRS weights using elastic net. We want to use PGP data to test Yanyu’s PTRS weights and compare to observed height. We previously generated predicted expression in Summary_PTRS_PGS.Rmd
Calculate Predicted Height in PGP using Individual PTRS Weights
<- read_tsv(data.dir %&% "PTRS_weights/PGP/PGP_Whole_Blood__predict.txt") %>% select(-c(FID)) pred_expr
We repeat the same method from before for predicting traits, but this time predicting height and BMI for PGP individuals.
<- fn_generate_trait(pred_expr, weights) pred_height_humans
Compare Performance to Observed Height in Personal Genomes
First, we load PGP phenotype data, stored in an sqlite database.
<- "~/Box/imlab-data/data-Github/web-data/2021-04-21-personal-genomes-project-data/repgp-data.sqlite3"
db <- dbDriver("SQLite")
sqlite.driver <- dbConnect(RSQLite::SQLite(), db)
conn dbListTables(conn)
<- dbGetQuery(conn, 'select * from users')
users dbDisconnect(conn)
<- users %>% select(c(sample, height)) %>% na.omit()
pheno = as.matrix(apply(weights[,2:ncol(weights)], 2, function(x) sum(x != 0 ))) n_genes
<- pheno[na.omit(match(rownames(pred_height_humans), pheno$sample)),]
pheno
<- pred_height_humans[na.omit(match(pheno$sample, rownames(pred_height_humans))), ]
tempo
<- data.frame(estimate = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())
height_df for(i in 1:ncol(tempo)){
1] <- cor.test(pheno$height, tempo[,i])$estimate
height_df[i,2] <- cor.test(pheno$height, tempo[,i])$p.value
height_df[i,3] <- paste("model", i, sep = "_")
height_df[i,4] <- n_genes[i,1]
height_df[i,5] <- cor.test(pheno$height, tempo[,i])$conf.int[1]
height_df[i,6] <- cor.test(pheno$height, tempo[,i])$conf.int[2]
height_df[i, }
Plot Performance
<- readRDS(data.dir %&% "corr_height_indiv_PTRS.RDS")
height_df = ggplot(height_df, aes(n_genes_in_model, estimate)) + geom_errorbar(aes(ymin = conf.int.min, ymax = conf.int.max ), width=0.2, color="gray") + geom_point(color = "purple", position="jitter") + geom_line(color = "purple") + xlab("Number of genes in each model") + ylab("Correlation Coefficient (r)") + ggtitle("Performance of PTRS for Height in Personal Genomes") + theme_bw()
p1 p1