06.PTRS_Creation

Author

Natasha Santhanam

Published

February 7, 2022

The orth.rats file contains gives a dictionary between human genes and the corresponding gene in rats.

data.dir <- "/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/data/"
orth.rats <- read_tsv(data.dir %&% "expression/ortholog_genes_rats_humans.tsv", col_names = TRUE)

Individual PTRS creation

Yanyu’s PTRS weights estimate the effect of genes on a given trait, in this case we pick height and BMI.

traits <- c("height", "bmi")
base.dir <- "~/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/Results/"

We first replace rat genes in the predicted expression results with their corresponding human genes, so that it could be compatible with Yanyu’s PTRS weights.

pred_expr <- read_tsv(base.dir %&% "prediXcan/metabolic_traits/rat_metabolic_Ac_best__predict.txt") %>% select(-c(FID))

#filter only for genes that have a human ortholog
pred_expr <- pred_expr %>% select(c(IID, intersect(colnames(pred_expr), orth.rats$rnorvegicus_homolog_ensembl_gene) ))

#change name to human ensembl id in humans
colnames(pred_expr)[2:ncol(pred_expr)] <- orth.rats[match(colnames(pred_expr)[2:ncol(pred_expr)], orth.rats$rnorvegicus_homolog_ensembl_gene), 1] %>% .[["ensembl_gene_id"]]

Then we reformat the PTRS weight files, removing the Ensembl version from gene names.

fn_weights = function(trait)
{
  weights <- read_tsv(data.dir %&% "PTRS_weights/weight_files/elastic_net_alpha_0.1_British.export_model/weights." %&% trait %&% ".tsv.gz")
  weights$gene_id <- sapply(strsplit(weights$gene_id, "\\."), `[`, 1)
  rownames(weights) <- weights$gene_id
  weights <- weights %>% rename(gene_name = gene_id)
  return(weights)
}

We converted the predicted expression for rat genes to corresponding human gene names, matching the PTRS gene names. This lets us combine PTRS weights, trained from human transcriptomic data, with predicted transciptome of the rats using the fn_generate_trait function below. The output is the predicted height for individual rats.

In some ways, we can interpret generate_trait as the opposite of PrediXcan. Both start from the predicted transcriptome of a group of individuals, PrediXcan works backwards from values of their observed trait to compute association between genes and the trait, whereas fn_generate_trait assumes those associations to predict the trait for each individual. PTRS is particularly insightful in this case, because of its portability across different population groups. We hope this extends across species, motivating our final goal of comparing the performance of PTRS in humans and rats.

dir <- "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/"

for(trait in traits) {
  weights <- fn_weights(trait)
  pred_trait <- fn_generate_trait(pred_expr, weights)
  saveRDS(pred_trait, dir %&% "rat_pred_" %&% trait %&% "_w_Human_best_PTRS.RDS")
}

Compare all values across different models to actual BMI and Height

pred_height <- readRDS(dir %&% "rat_pred_height_w_Human_best_PTRS.RDS") 
pred_BMI <- readRDS(dir %&% "rat_pred_bmi_w_Human_best_PTRS.RDS")

all_rats <- read_tsv(dir %&% "all_names.txt", col_names = TRUE)

pheno <- read_csv(dir %&% "processed_obesity_rat_Palmer_phenotypes.csv")
pheno <- pheno %>% filter(!(rat_rfid  %in% all_rats$ID))
weights_bmi <- fread("/Users/natashasanthanam/Downloads/weights.bmi.annot.tsv") 
weights_height <-  fread("/Users/natashasanthanam/Downloads/weights.height.annot.tsv")  

n_genes_bmi = as.matrix(apply(weights_bmi[,2:ncol(weights_bmi)], 2, function(x) sum(x != 0 )))
n_genes_height = as.matrix(apply(weights_height[,2:ncol(weights_height)], 2, function(x) sum(x != 0 )))

#Create Dataframes with the correlation coefficient between trait in rats and ones predicted using PTRS from Humans

BMI with predicted BMI

bmi_with_tail <- pheno %>% dplyr::select(c(rat_rfid, bmi_bodylength_w_tail)) %>% na.omit()
tempo <- pred_BMI[na.omit(match(bmi_with_tail$rat_rfid, rownames(pred_BMI))), ]

bmi_w_tail_df <- data.frame(R2 = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())
for(i in 1:ncol(tempo)){
  fit = lm(bmi_with_tail$bmi_bodylength_w_tail ~ tempo[,i])
  bmi_w_tail_df[i,1] <- summary(fit)$r.squared
  bmi_w_tail_df[i,2] <- glance(fit)$p.value
  bmi_w_tail_df[i,3] <- paste("model", i, sep = "_")
  bmi_w_tail_df[i,4] <- n_genes_bmi[i,1]
  bmi_w_tail_df[i,5] <- confint(fit)[1] 
  bmi_w_tail_df[i,6] <- confint(fit)[2]
}

bmi_w_tail_cor <- data.frame(cor = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())
for(i in 1:ncol(tempo)){
  bmi_w_tail_cor[i,1] <- cor.test(bmi_with_tail$bmi_bodylength_w_tail, tempo[,i])$estimate
  bmi_w_tail_cor[i,2] <- cor.test(bmi_with_tail$bmi_bodylength_w_tail, tempo[,i])$p.value
  bmi_w_tail_cor[i,3] <- paste("model", i, sep = "_")
  bmi_w_tail_cor[i,4] <- n_genes_bmi[i,1]
  bmi_w_tail_cor[i,5] <- cor.test(bmi_with_tail$bmi_bodylength_w_tail, tempo[,i])$conf.int[1]
  bmi_w_tail_cor[i,6] <- cor.test(bmi_with_tail$bmi_bodylength_w_tail, tempo[,i])$conf.int[2]
}

total_bmi_df <- inner_join(bmi_w_tail_cor, bmi_w_tail_df, by = "model")
total_bmi_df <- total_bmi_df %>% select(c(model, n_genes_in_model.x, cor, R2, pvalue.x, conf.int.min.x, conf.int.max.x ))

Bodylength with Predicted Height

#Bodylength wit Tail vs predicted Height from Human PTRS weights
bodylength_w_tail <- pheno %>% dplyr::select(c(rat_rfid, bodylength_w_tail)) %>% na.omit()
tempo <- pred_height[na.omit(match(bodylength_w_tail$rat_rfid, rownames(pred_height))), ]

bodylength_w_tail_df <- data.frame(R2 = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())
for(i in 1:ncol(tempo)){
  fit = lm(bodylength_w_tail$bodylength_w_tail ~ tempo[,i])
  bodylength_w_tail_df[i,1] <- summary(fit)$r.squared
  bodylength_w_tail_df[i,2] <- glance(fit)$p.value
  bodylength_w_tail_df[i,3] <- paste("model", i, sep = "_")
  bodylength_w_tail_df[i,4]<- n_genes_height[i,1]
  bodylength_w_tail_df[i,5] <- confint(fit)[1] 
  bodylength_w_tail_df[i,6] <- confint(fit)[2] 
}

bodylength_w_tail_cor <- data.frame(cor = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())
for(i in 1:ncol(tempo)){
  bodylength_w_tail_cor[i,1] <- cor.test(bodylength_w_tail$bodylength_w_tail, tempo[,i])$estimate
  bodylength_w_tail_cor[i,2] <- cor.test(bodylength_w_tail$bodylength_w_tail, tempo[,i])$p.value
  bodylength_w_tail_cor[i,3] <- paste("model", i, sep = "_")
  bodylength_w_tail_cor[i,4]<- n_genes_height[i,1]
  bodylength_w_tail_cor[i,5] <- cor.test(bodylength_w_tail$bodylength_w_tail, tempo[,i])$conf.int[1]
  bodylength_w_tail_cor[i,6] <- cor.test(bodylength_w_tail$bodylength_w_tail, tempo[,i])$conf.int[2]
}

bodylength_w_tail_cor <- bodylength_w_tail_cor %>% filter(n_genes_in_model != 0)

total_height_df <- inner_join(bodylength_w_tail_cor, bodylength_w_tail_df, by = "model")
total_height_df <- total_height_df %>% select(c(model, n_genes_in_model.x, cor, R2, pvalue.x, conf.int.min.x, conf.int.max.x ))

Plot Results

ggplot(bodylength_w_tail_cor, aes(n_genes_in_model, cor)) + geom_errorbar(aes(ymin = conf.int.min, ymax = conf.int.max ), width=0.2,  color="gray48") + ylim(c(0,0.1)) + geom_point(position="jitter") +geom_line() + xlab("Number of genes in each model") + ylab("Correlation Coefficient (r)") 


ggplot(bmi_w_tail_cor, aes(n_genes_in_model, cor)) + geom_errorbar(aes(ymin = conf.int.min, ymax = conf.int.max ), width=0.2,  color="gray48") + ylim(c(0 ,0.1)) + geom_point(position="jitter") +geom_line() +  xlab("Number of genes in each model") + ylab("Correlation Coefficient (r)") 

Violin plot

ptrs_hum <- read_excel("/Users/natashasanthanam/Downloads/13059_2021_2591_MOESM5_ESM.xlsx")
ptrs_hum <- ptrs_hum[7:nrow(ptrs_hum),]
colnames(ptrs_hum) = c("trait", "population", "PTRS_MESA_EUR", "PTRS_MESA_AFR", "PTRS_MESA_ALL")

ptrs_hum_height <- ptrs_hum %>% filter(trait == "height")
ptrs_hum_height <- ptrs_hum_height %>% pivot_longer(c("PTRS_MESA_EUR", "PTRS_MESA_AFR", "PTRS_MESA_ALL"), names_to = "PTRS") %>% select(-c(trait))
ggplot(bodylength_w_tail_cor, aes(x=n_genes_in_model, y=cor)) + 
  geom_violin()

ggplot(bmi_w_tail_cor, aes(x=n_genes_in_model, y=cor)) + 
  geom_violin()

Create and Plot Negative Control

fasting_glucose <- pheno %>% dplyr::select(c(rat_rfid, fasting_glucose)) %>% na.omit()
tempo <- pred_height[na.omit(match(fasting_glucose$rat_rfid, rownames(pred_height))), ]

neg_control_df <- data.frame(estimate = numeric(), pvalue = numeric(), model = character(), n_genes_in_model = numeric(), conf.int.min = numeric(), conf.int.max = numeric())

for(i in 1:ncol(tempo)){
  neg_control_df[i,1] <- cor.test(fasting_glucose$fasting_glucose, tempo[,i])$estimate
  neg_control_df[i,2] <- cor.test(fasting_glucose$fasting_glucose, tempo[,i])$p.value
  neg_control_df[i,3] <- paste("model", i, sep = "_")
  neg_control_df[i,4]<- n_genes_height[i,1]
  neg_control_df[i,5] <- cor.test(fasting_glucose$fasting_glucose, tempo[,i])$conf.int[1]
  neg_control_df[i,6] <- cor.test(fasting_glucose$fasting_glucose, tempo[,i])$conf.int[2]
}

ggplot(neg_control_df, aes(n_genes_in_model, estimate)) + geom_errorbar(aes(ymin = conf.int.min, ymax = conf.int.max ), width=0.2,  color="gray48") + geom_point(position="jitter") +geom_line() +  xlab("Number of genes in each model") + ylab("Correlation Coefficient (r)") 

Test PTRS top genes for Enrichment

top_ptrs_genes <- weights_height %>% select(c(gene_name, model_5)) %>% filter(model_5 != 0) %>% select(c(gene_name))
top_ptrs_genes$gene_name = sapply(strsplit(top_ptrs_genes$gene_name , "\\."), `[`, 1)

top_ptrs_genes <- top_ptrs_genes %>% mutate(rat_gene = orth.rats[match(top_ptrs_genes$gene_name, orth.rats$ensembl_gene_id), 3]$rnorvegicus_homolog_ensembl_gene)

full_df <- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/rat_metabolic_traits_best_Ac_full_assocs.txt", col_names = TRUE)
pheno_Multi_rat <- read_tsv("/Users/natashasanthanam/Downloads/rat_metabolic_MultiXcan_pval_assoc.txt", col_names = TRUE)
qqunif(full_df$pvalue, col= "dodgerblue4" )
qqpoints(full_df %>% filter(gene %in% top_ptrs_genes$rat_gene) %>% .[["pvalue"]],pch='+', col = "dodgerblue") 

qqunif(pheno_Multi_rat$pvalue.bodylength_w_tail, col= "dodgerblue4" )
qqpoints(pheno_Multi_rat %>% filter(gene %in% top_ptrs_genes$rat_gene) %>% .[["pvalue.bodylength_w_tail"]],pch='+', col = "dodgerblue") 


all_ptrs_genes <- weights_height %>%  filter(!(gene_name %in% top_ptrs_genes$gene_name)) %>% select(c(gene_name))
all_ptrs_genes <-  all_ptrs_genes %>% mutate(rat_gene = orth.rats[match(all_ptrs_genes$gene_name, orth.rats$ensembl_gene_id), 3]$rnorvegicus_homolog_ensembl_gene)

qqunif(full_df %>% filter(gene %in% all_ptrs_genes$rat_gene) %>% .[["pvalue"]])
qqpoints(full_df %>% filter(gene %in% top_ptrs_genes$rat_gene) %>% .[["pvalue"]],pch='+', col = "dodgerblue") 


full_df <- full_df %>% mutate(human_gene = orth.rats[match(full_df$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 1]$ensembl_gene_id)
No matching items