<- "/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/data/"
data.dir <- read_tsv(data.dir %&% "expression/ortholog_genes_rats_humans.tsv", col_names = TRUE) orth.rats
06.PTRS_Creation
The orth.rats
file contains gives a dictionary between human genes and the corresponding gene in rats.
Individual PTRS creation
Yanyu’s PTRS weights estimate the effect of genes on a given trait, in this case we pick height and BMI.
<- c("height", "bmi") traits
<- "~/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/Results/" base.dir
We first replace rat genes in the predicted expression results with their corresponding human genes, so that it could be compatible with Yanyu’s PTRS weights.
<- read_tsv(base.dir %&% "prediXcan/metabolic_traits/rat_metabolic_Ac_best__predict.txt") %>% select(-c(FID))
pred_expr
#filter only for genes that have a human ortholog
<- pred_expr %>% select(c(IID, intersect(colnames(pred_expr), orth.rats$rnorvegicus_homolog_ensembl_gene) ))
pred_expr
#change name to human ensembl id in humans
colnames(pred_expr)[2:ncol(pred_expr)] <- orth.rats[match(colnames(pred_expr)[2:ncol(pred_expr)], orth.rats$rnorvegicus_homolog_ensembl_gene), 1] %>% .[["ensembl_gene_id"]]
Then we reformat the PTRS weight files, removing the Ensembl version from gene names.
= function(trait)
fn_weights
{<- read_tsv(data.dir %&% "PTRS_weights/weight_files/elastic_net_alpha_0.1_British.export_model/weights." %&% trait %&% ".tsv.gz")
weights $gene_id <- sapply(strsplit(weights$gene_id, "\\."), `[`, 1)
weightsrownames(weights) <- weights$gene_id
<- weights %>% rename(gene_name = gene_id)
weights return(weights)
}
We converted the predicted expression for rat genes to corresponding human gene names, matching the PTRS gene names. This lets us combine PTRS weights, trained from human transcriptomic data, with predicted transciptome of the rats using the fn_generate_trait function below. The output is the predicted height for individual rats.
In some ways, we can interpret generate_trait as the opposite of PrediXcan. Both start from the predicted transcriptome of a group of individuals, PrediXcan works backwards from values of their observed trait to compute association between genes and the trait, whereas fn_generate_trait assumes those associations to predict the trait for each individual. PTRS is particularly insightful in this case, because of its portability across different population groups. We hope this extends across species, motivating our final goal of comparing the performance of PTRS in humans and rats.
<- "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/"
dir
for(trait in traits) {
<- fn_weights(trait)
weights <- fn_generate_trait(pred_expr, weights)
pred_trait saveRDS(pred_trait, dir %&% "rat_pred_" %&% trait %&% "_w_Human_best_PTRS.RDS")
}
Compare all values across different models to actual BMI and Height
<- readRDS(dir %&% "rat_pred_height_w_Human_best_PTRS.RDS")
pred_height <- readRDS(dir %&% "rat_pred_bmi_w_Human_best_PTRS.RDS")
pred_BMI
<- read_tsv(dir %&% "all_names.txt", col_names = TRUE)
all_rats
<- read_csv(dir %&% "processed_obesity_rat_Palmer_phenotypes.csv")
pheno <- pheno %>% filter(!(rat_rfid %in% all_rats$ID)) pheno
<- fread("/Users/natashasanthanam/Downloads/weights.bmi.annot.tsv")
weights_bmi <- fread("/Users/natashasanthanam/Downloads/weights.height.annot.tsv")
weights_height
= as.matrix(apply(weights_bmi[,2:ncol(weights_bmi)], 2, function(x) sum(x != 0 )))
n_genes_bmi = as.matrix(apply(weights_height[,2:ncol(weights_height)], 2, function(x) sum(x != 0 ))) n_genes_height
#Create Dataframes with the correlation coefficient between trait in rats and ones predicted using PTRS from Humans
BMI with predicted BMI
<- pheno %>% dplyr::select(c(rat_rfid, bmi_bodylength_w_tail)) %>% na.omit()
bmi_with_tail <- pred_BMI[na.omit(match(bmi_with_tail$rat_rfid, rownames(pred_BMI))), ]
tempo
<- data.frame(R2 = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())
bmi_w_tail_df for(i in 1:ncol(tempo)){
= lm(bmi_with_tail$bmi_bodylength_w_tail ~ tempo[,i])
fit 1] <- summary(fit)$r.squared
bmi_w_tail_df[i,2] <- glance(fit)$p.value
bmi_w_tail_df[i,3] <- paste("model", i, sep = "_")
bmi_w_tail_df[i,4] <- n_genes_bmi[i,1]
bmi_w_tail_df[i,5] <- confint(fit)[1]
bmi_w_tail_df[i,6] <- confint(fit)[2]
bmi_w_tail_df[i,
}
<- data.frame(cor = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())
bmi_w_tail_cor for(i in 1:ncol(tempo)){
1] <- cor.test(bmi_with_tail$bmi_bodylength_w_tail, tempo[,i])$estimate
bmi_w_tail_cor[i,2] <- cor.test(bmi_with_tail$bmi_bodylength_w_tail, tempo[,i])$p.value
bmi_w_tail_cor[i,3] <- paste("model", i, sep = "_")
bmi_w_tail_cor[i,4] <- n_genes_bmi[i,1]
bmi_w_tail_cor[i,5] <- cor.test(bmi_with_tail$bmi_bodylength_w_tail, tempo[,i])$conf.int[1]
bmi_w_tail_cor[i,6] <- cor.test(bmi_with_tail$bmi_bodylength_w_tail, tempo[,i])$conf.int[2]
bmi_w_tail_cor[i,
}
<- inner_join(bmi_w_tail_cor, bmi_w_tail_df, by = "model")
total_bmi_df <- total_bmi_df %>% select(c(model, n_genes_in_model.x, cor, R2, pvalue.x, conf.int.min.x, conf.int.max.x )) total_bmi_df
Bodylength with Predicted Height
#Bodylength wit Tail vs predicted Height from Human PTRS weights
<- pheno %>% dplyr::select(c(rat_rfid, bodylength_w_tail)) %>% na.omit()
bodylength_w_tail <- pred_height[na.omit(match(bodylength_w_tail$rat_rfid, rownames(pred_height))), ]
tempo
<- data.frame(R2 = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())
bodylength_w_tail_df for(i in 1:ncol(tempo)){
= lm(bodylength_w_tail$bodylength_w_tail ~ tempo[,i])
fit 1] <- summary(fit)$r.squared
bodylength_w_tail_df[i,2] <- glance(fit)$p.value
bodylength_w_tail_df[i,3] <- paste("model", i, sep = "_")
bodylength_w_tail_df[i,4]<- n_genes_height[i,1]
bodylength_w_tail_df[i,5] <- confint(fit)[1]
bodylength_w_tail_df[i,6] <- confint(fit)[2]
bodylength_w_tail_df[i,
}
<- data.frame(cor = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())
bodylength_w_tail_cor for(i in 1:ncol(tempo)){
1] <- cor.test(bodylength_w_tail$bodylength_w_tail, tempo[,i])$estimate
bodylength_w_tail_cor[i,2] <- cor.test(bodylength_w_tail$bodylength_w_tail, tempo[,i])$p.value
bodylength_w_tail_cor[i,3] <- paste("model", i, sep = "_")
bodylength_w_tail_cor[i,4]<- n_genes_height[i,1]
bodylength_w_tail_cor[i,5] <- cor.test(bodylength_w_tail$bodylength_w_tail, tempo[,i])$conf.int[1]
bodylength_w_tail_cor[i,6] <- cor.test(bodylength_w_tail$bodylength_w_tail, tempo[,i])$conf.int[2]
bodylength_w_tail_cor[i,
}
<- bodylength_w_tail_cor %>% filter(n_genes_in_model != 0)
bodylength_w_tail_cor
<- inner_join(bodylength_w_tail_cor, bodylength_w_tail_df, by = "model")
total_height_df <- total_height_df %>% select(c(model, n_genes_in_model.x, cor, R2, pvalue.x, conf.int.min.x, conf.int.max.x )) total_height_df
Plot Results
ggplot(bodylength_w_tail_cor, aes(n_genes_in_model, cor)) + geom_errorbar(aes(ymin = conf.int.min, ymax = conf.int.max ), width=0.2, color="gray48") + ylim(c(0,0.1)) + geom_point(position="jitter") +geom_line() + xlab("Number of genes in each model") + ylab("Correlation Coefficient (r)")
ggplot(bmi_w_tail_cor, aes(n_genes_in_model, cor)) + geom_errorbar(aes(ymin = conf.int.min, ymax = conf.int.max ), width=0.2, color="gray48") + ylim(c(0 ,0.1)) + geom_point(position="jitter") +geom_line() + xlab("Number of genes in each model") + ylab("Correlation Coefficient (r)")
Violin plot
<- read_excel("/Users/natashasanthanam/Downloads/13059_2021_2591_MOESM5_ESM.xlsx")
ptrs_hum <- ptrs_hum[7:nrow(ptrs_hum),]
ptrs_hum colnames(ptrs_hum) = c("trait", "population", "PTRS_MESA_EUR", "PTRS_MESA_AFR", "PTRS_MESA_ALL")
<- ptrs_hum %>% filter(trait == "height")
ptrs_hum_height <- ptrs_hum_height %>% pivot_longer(c("PTRS_MESA_EUR", "PTRS_MESA_AFR", "PTRS_MESA_ALL"), names_to = "PTRS") %>% select(-c(trait)) ptrs_hum_height
ggplot(bodylength_w_tail_cor, aes(x=n_genes_in_model, y=cor)) +
geom_violin()
ggplot(bmi_w_tail_cor, aes(x=n_genes_in_model, y=cor)) +
geom_violin()
Create and Plot Negative Control
<- pheno %>% dplyr::select(c(rat_rfid, fasting_glucose)) %>% na.omit()
fasting_glucose <- pred_height[na.omit(match(fasting_glucose$rat_rfid, rownames(pred_height))), ]
tempo
<- data.frame(estimate = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())
neg_control_df
for(i in 1:ncol(tempo)){
1] <- cor.test(fasting_glucose$fasting_glucose, tempo[,i])$estimate
neg_control_df[i,2] <- cor.test(fasting_glucose$fasting_glucose, tempo[,i])$p.value
neg_control_df[i,3] <- paste("model", i, sep = "_")
neg_control_df[i,4]<- n_genes_height[i,1]
neg_control_df[i,5] <- cor.test(fasting_glucose$fasting_glucose, tempo[,i])$conf.int[1]
neg_control_df[i,6] <- cor.test(fasting_glucose$fasting_glucose, tempo[,i])$conf.int[2]
neg_control_df[i,
}
ggplot(neg_control_df, aes(n_genes_in_model, estimate)) + geom_errorbar(aes(ymin = conf.int.min, ymax = conf.int.max ), width=0.2, color="gray48") + geom_point(position="jitter") +geom_line() + xlab("Number of genes in each model") + ylab("Correlation Coefficient (r)")
Test PTRS top genes for Enrichment
<- weights_height %>% select(c(gene_name, model_5)) %>% filter(model_5 != 0) %>% select(c(gene_name))
top_ptrs_genes $gene_name = sapply(strsplit(top_ptrs_genes$gene_name , "\\."), `[`, 1)
top_ptrs_genes
<- top_ptrs_genes %>% mutate(rat_gene = orth.rats[match(top_ptrs_genes$gene_name, orth.rats$ensembl_gene_id), 3]$rnorvegicus_homolog_ensembl_gene)
top_ptrs_genes
<- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/rat_metabolic_traits_best_Ac_full_assocs.txt", col_names = TRUE)
full_df <- read_tsv("/Users/natashasanthanam/Downloads/rat_metabolic_MultiXcan_pval_assoc.txt", col_names = TRUE) pheno_Multi_rat
qqunif(full_df$pvalue, col= "dodgerblue4" )
qqpoints(full_df %>% filter(gene %in% top_ptrs_genes$rat_gene) %>% .[["pvalue"]],pch='+', col = "dodgerblue")
qqunif(pheno_Multi_rat$pvalue.bodylength_w_tail, col= "dodgerblue4" )
qqpoints(pheno_Multi_rat %>% filter(gene %in% top_ptrs_genes$rat_gene) %>% .[["pvalue.bodylength_w_tail"]],pch='+', col = "dodgerblue")
<- weights_height %>% filter(!(gene_name %in% top_ptrs_genes$gene_name)) %>% select(c(gene_name))
all_ptrs_genes <- all_ptrs_genes %>% mutate(rat_gene = orth.rats[match(all_ptrs_genes$gene_name, orth.rats$ensembl_gene_id), 3]$rnorvegicus_homolog_ensembl_gene)
all_ptrs_genes
qqunif(full_df %>% filter(gene %in% all_ptrs_genes$rat_gene) %>% .[["pvalue"]])
qqpoints(full_df %>% filter(gene %in% top_ptrs_genes$rat_gene) %>% .[["pvalue"]],pch='+', col = "dodgerblue")
<- full_df %>% mutate(human_gene = orth.rats[match(full_df$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 1]$ensembl_gene_id) full_df