03.PrediXcan_MultiXcan

Author

Natasha Santhanam

Published

February 7, 2022

library(tidyverse)
library(broom)
library(data.table)
library(RSQLite)
library(qqman)
library(ggrepel)
library(devtools)
devtools::source_gist("0ddc9c0ea03245bb30efbe3e899897be")
"%&%" = function(a,b) paste(a,b,sep="")
args = commandArgs(trailingOnly=TRUE)
geno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/rat_genotypes_LD_pruned_0.95/"

Run PrediXcan with Metabolic Phenotype Data

Create model with right column names for PrediXcan (do for all 5 tissues)

Sabrina’s note: I have no idea what is going on here, I don’t know where bimbam file came from. I tried running this script from genotype files generated from previous steps in the pipeline, but it seems like a different file was inputed below. We may be able to ignore this step for now, since we already have PrediXcan formatted genotype from an earlier step in the pipeline.

filelist <- list.files(geno.dir, pattern = ".bimbam")
#ids for rats are in the phenotype file under rat_rfid

for(file in filelist) {
  tempo <- fread(geno.dir %&% file)
  tempo <- tempo %>% mutate(chr =  numextract(sapply(strsplit(tempo$V1, ":"), `[`, 1)), .before = V1) %>% mutate(pos = numextract(sapply(strsplit(tempo$V1, ":"), `[`, 2)), .before = V2) %>% mutate(maf = 0, .before = V4)
  write_tsv(tempo, geno.dir %&% substr(file, 1, nchar(file) - 7) %&% ".txt", col_names = FALSE)
}

tis = "Ac"
data.dir <- "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Results/"
filename <-  data.dir %&% "sql/" %&% tis %&% "_output_db.db"
sqlite.driver <- dbDriver("SQLite")
conn <- dbConnect(RSQLite::SQLite(), filename)
extra <- dbGetQuery(conn, 'select * from extra')
weights <- dbGetQuery(conn, 'select * from weights')
dbDisconnect(conn)
extra <- extra %>% select(c(gene, genename, n.snps, R2, pval)) %>% mutate(pred.perf.qval = NA) 
colnames(extra) <- c("gene", "genename", "n.snps.in.model", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval")
cor_df <- read_tsv(data.dir %&% "all_results_" %&% tis, col_names = TRUE) %>% select(c(gene, cvm))

extra <- extra %>% filter(pred.perf.R2 > 0.01)

extra <- full_join(extra, cor_df, by = "gene")
extra <- extra %>% filter(cvm >= 0 | is.na(cvm)) %>% select(-c(cvm)) %>% filter(!is.na(pred.perf.R2))
weights <- weights %>% filter(gene %in% extra$gene)

model_db = data.dir %&% "sql/" %&% tis %&% "_best_prediXcan_db.db"
conn <- dbConnect(RSQLite::SQLite(), model_db)
dbWriteTable(conn, "weights", weights)
dbWriteTable(conn, "extra", extra)

#check to see model is set up 
dbListTables(conn)
dbGetQuery(conn, 'SELECT * FROM weights') %>% head
dbGetQuery(conn, 'SELECT * FROM extra') %>% head
dbDisconnect(conn)

Note: I don’t know how these files were generated.

samples <- read_tsv(GENO %&% "samples_Rat_metab_phenos_file", col_names = FALSE)
all_rats <- read_tsv(PHENO %&% "all_names.txt", col_names = TRUE)
samples <- samples %>% filter(!(X1  %in% all_rats$ID))
write_tsv(samples, GENO %&% "samples_Rat_metab_abrv_phenos_file", col_names = FALSE)

Do for all 5 tissues

#run prediXcan
conda activate imlabtools
METAXCAN=/Users/sabrinami/Github/MetaXcan/software
GENO=/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/data/rat_genotypes_LD_pruned_0.95
MODEL=/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/Results/sql
OUTPUT=/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/Results/PrediXcan/metabolic_traits/

python $METAXCAN/Predict.py \
--model_db_path $MODEL/Ac_best_prediXcan_db.db \
--text_genotypes \
 $GENO/chr*.round2_impute2_3473.txt  \
--on_the_fly_mapping METADATA "{}_{}_{}_{}" \
--text_sample_ids $GENO/samples_Rat_metab_phenos_file \
--prediction_output $OUTPUT/rat_metabolic_Ac_best__predict.txt  \
--prediction_summary_output $OUTPUT/rat_metabolic_Ac_best__summary.txt \
--throw

Note: Stop here, we have all the results needed to continue to PTRS analysis.

pred_expr <- read_tsv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/rat_metabolic_" %&% tis %&% "_best__predict.txt", col_names = TRUE)

all_rats <- read_tsv(PHENO %&% "all_names.txt", col_names = TRUE)
pred_expr <- pred_expr %>% filter(!(FID  %in% all_rats$ID))

#write_tsv(pred_expr, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/rat_metabolic_" %&% tis %&% "_best__predict.txt", col_names = TRUE)

#run asssociation in prediXcan
PHENO = "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/"
RESULTS = "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/associations/"

pheno <- read_csv(PHENO %&% "processed_obesity_rat_Palmer_phenotypes.csv", col_names = TRUE) %>% filter(!(rat_rfid  %in% all_rats$ID))
write_tsv(pheno, PHENO %&% "processed_obesity_rat_Palmer_phenotypes_target_set.tsv", col_names = TRUE)

for(i in 2:length(colnames(pheno))){
trait <- colnames(pheno)[i]
runLOC <- "python3 " %&% METAXCAN %&% "/PrediXcanAssociation.py " %&% "--expression_file " %&%  "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/rat_metabolic_" %&% "Ac" %&% "__predict.txt --input_phenos_file " %&% PHENO %&% "processed_obesity_rat_Palmer_phenotypes_target_set.tsv " %&% "--input_phenos_column " %&% trait %&%  " --output " %&% RESULTS %&% "associations/" %&% "Ac" %&% "__association_" %&% trait %&% ".txt --verbosity 9 --throw"
system(runLOC)
}

Rscript --vanilla /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_trait_assoc_all_tis.R $tissue

qsub -v tissue=$tis metabolic_assoc_all_tissues.pbs

Format and Plot PrediXcan Results

results.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/associations"
filelist <- list.files(results.dir, pattern = "Ac__association_", full.names = TRUE)
full_df <- data.frame()

for(fila in filelist) {
  assoc_fila <- read_tsv(fila, col_names = TRUE)
  pheno_id <- substr(fila, 104, (str_length(fila) - 4))
  tempo <- cbind(assoc_fila, metabolic_trait=pheno_id) %>% select(-c(status))
  full_df <- rbind(full_df, tempo)
} 
#full_df <- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/rat_metabolic_traits_best_Ac_full_assocs.txt", col_names = TRUE)

tempo_df <- full_df %>% filter(pvalue <  9.279881e-06)

#428 sig genes
tempo_df %>% group_by(gene) %>% summarise(n = n())

#all 11 traits
tempo_df %>% group_by(metabolic_trait) %>% summarise(n = n())

Filter prediXcan results for Supplementary Table

full_df <- full_df %>% filter(metabolic_trait == "bmi_bodylength_w_tail"|metabolic_trait ==  "bodylength_w_tail"| metabolic_trait == "bodyweight" | metabolic_trait == "fasting_glucose" | metabolic_trait == "epifat" |  metabolic_trait == "retrofat" | metabolic_trait == "parafat")

full_df <- full_df %>% mutate(gene_name = orth.rats[match(full_df$gene, orth.rats$rnorvegicus_homolog_ensembl_gene),4]$rnorvegicus_homolog_associated_gene_name, .before = effect)

full_df$metabolic_trait[full_df$metabolic_trait == "bmi_bodylength_w_tail" ] <- "Body Mass Index (BMI) with tail"
full_df$metabolic_trait[full_df$metabolic_trait == "bodylength_w_tail" ] <- "Body length including tail"
full_df$metabolic_trait[full_df$metabolic_trait == "bodyweight" ] <- "Body weight"
full_df$metabolic_trait[full_df$metabolic_trait == "fasting_glucose" ] <- "Fasting Glucose"
full_df$metabolic_trait[full_df$metabolic_trait == "epifat" ] <- "Epididymal fat"
full_df$metabolic_trait[full_df$metabolic_trait == "retrofat" ] <- "Retroperitoneal fat"
full_df$metabolic_trait[full_df$metabolic_trait == "parafat" ] <- "Parametrial fat"

full_df <- full_df %>% mutate(bf_sig = ifelse(full_df$pvalue <= 9.279881e-06, "Yes", "No"))

human_height_genes <- read_tsv("/Users/natashasanthanam/Downloads/Human_phenomeXcan_all_traits.txt", col_names = TRUE)
human_height_genes <- human_height_genes %>% mutate(rat_gene = orth.rats[match(human_height_genes$gene_name, orth.rats$external_gene_name), 4]$rnorvegicus_homolog_associated_gene_name) %>% filter(pvalue_Height <= 0.01)


human_bmi_genes <- read_tsv("/Users/natashasanthanam/Downloads/Human_phenomeXcan_all_traits.txt", col_names = TRUE) 
colnames(human_bmi_genes)[2] = "pvalue_BMI"
human_bmi_genes <- human_bmi_genes %>% mutate(rat_gene = orth.rats[match(human_bmi_genes$gene_name, orth.rats$external_gene_name), 4]$rnorvegicus_homolog_associated_gene_name) %>% filter(pvalue_BMI <= 0.01 )

gene_annot <- readRDS("/Users/natashasanthanam/Github/rat-genomic-analysis/data/gene_annotation.RDS") %>% select(c("chr", "gene_id", "start", "end")) %>% rename(gene = gene_id)

tempo_manhatt <- inner_join(gene_annot, full_df, by = "gene")
tempo_manhatt$chr <- as.numeric(tempo_manhatt$chr)

bmi_manhat <- tempo_manhatt %>% filter(metabolic_trait == "Body Mass Index (BMI) with tail") 
bmi_manhat <- bmi_manhat %>% mutate(gene_name = orth.rats[match(bmi_manhat$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 4]$rnorvegicus_homolog_associated_gene_name)

height_manhat <- tempo_manhatt %>% filter(metabolic_trait == "Body length including tail") 
height_manhat <- height_manhat %>% mutate(gene_name = orth.rats[match(height_manhat$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 4]$rnorvegicus_homolog_associated_gene_name)

data_cum <- bmi_manhat %>% 
  group_by(chr) %>% 
  summarise(max_bp = as.numeric(max(start))) %>% 
  mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>% 
  select(chr, bp_add)

gwas_data <- bmi_manhat %>% 
  inner_join(data_cum, by = "chr") %>% 
  mutate(bp_cum = start + bp_add)


axis_set <- gwas_data %>% 
  group_by(chr) %>% 
  summarize(center = mean(bp_cum))

ylim <- gwas_data %>% 
  filter(pvalue == min(pvalue)) %>% 
  mutate(ylim = abs(floor(log10(pvalue))) + 2) %>% 
  pull(ylim)

sig <-  0.05/(5388)

bmi_manhplot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(pvalue), 
                                  color = as_factor(chr), size = -log10(pvalue))) +
  geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") + 
  geom_hline(yintercept = -log10(0.0001), color = "red", linetype = "dashed") + 
  geom_point(alpha = 0.75, shape = ifelse((gwas_data$zscore >= 4.863456), 17, ifelse(gwas_data$zscore <= -4.863456, 25, 19)), fill = "dodgerblue4") +
  geom_label_repel(aes(label=ifelse((pvalue <=  sig & gene_name %in% human_bmi_genes$rat_gene), gene_name, "")), size = 6) + 
  ylim(c(0,8)) + 
  scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) +
  scale_color_manual(values = rep(c("dodgerblue4", "midnightblue"), unique(length(axis_set$chr)))) +
  scale_size_continuous(range = c(0.5,3)) +
  labs(x = NULL, 
       y = expression(-log[10](italic(p)))) + 
  theme_minimal() +
  theme( 
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.text.x = element_text(angle = 90, size = 12), 
    axis.text.y = element_text( size = 12,  vjust = 0), 
    axis.title = element_text(size = 20))

data_cum <- height_manhat %>% 
  group_by(chr) %>% 
  summarise(max_bp = as.numeric(max(start))) %>% 
  mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>% 
  select(chr, bp_add)

gwas_data <- height_manhat %>% 
  inner_join(data_cum, by = "chr") %>% 
  mutate(bp_cum = start + bp_add)


axis_set <- gwas_data %>% 
  group_by(chr) %>% 
  summarize(center = mean(bp_cum))

ylim <- gwas_data %>% 
  filter(pvalue == min(pvalue)) %>% 
  mutate(ylim = abs(floor(log10(pvalue))) + 2) %>% 
  pull(ylim)

sig <-   0.05/(5388)

height_manhplot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(pvalue), 
                                  color = as_factor(chr), size = -log10(pvalue))) +
  geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") + 
  geom_hline(yintercept = -log10(0.0001), color = "red", linetype = "dashed") + 
  geom_point(alpha = 0.75, shape = ifelse((gwas_data$zscore >= 4.863456), 17, ifelse(gwas_data$zscore <= -4.863456, 25, 19)), fill = "dodgerblue4") +
  geom_label_repel(aes(label=ifelse((pvalue <=  sig & gene_name %in% human_height_genes$rat_gene), gene_name, "")), size = 6) + 
  ylim(c(0,10)) + 
  scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) +
  scale_color_manual(values = rep(c("dodgerblue4", "midnightblue"), unique(length(axis_set$chr)))) +
  scale_size_continuous(range = c(0.5,3)) +
  labs(x = NULL, 
       y = expression(-log[10](italic(p)))) + 
  theme_minimal() +
  theme( 
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.text.x = element_text(angle = 90, size = 12), 
    axis.text.y = element_text( size = 12,  vjust = 0), 
    axis.title = element_text(size = 20))

Run MultiXcan

Generate Folder with Predicted Expression Data for each Tissue

First have to remove potential overlap between genotypes used in predicted expression and those in phenotype file. Should only be around 60ish so not too big a deal

filelist <- list.files("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/expression", pattern = ".RDS", full.names = TRUE)
all_names <- data.frame(ID = as.character())
for(fila in filelist) {
  tempo <- readRDS(fila)
  tempo <- as.data.frame(rownames(tempo)) %>% rename(ID = `rownames(tempo)`)
  all_names <- full_join(tempo, all_names, by = "ID")
}

pheno <- read_csv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/processed_obesity_rat_Palmer_phenotypes.csv", col_names=TRUE)

pheno <- pheno %>% rename(ID = rat_rfid) %>% filter(!ID %in% all_names$ID)

Next have to remove overlap rats predicted expression as well

filelist <- list.files("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/", pattern = "__predict.txt", full.names = TRUE)

for(fila in filelist) {
  tempo <- fread(fila, header=TRUE)
  name <- substr(fila, 90,91)
  tempo <- tempo %>% filter(!FID %in% all_names$ID)
  tempo <- tempo[match(pheno$ID, tempo$FID),]
  write_tsv(tempo, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/expr/" %&% name %&% ".txt")
}

Run MultiXcan using the predicted expression from prediXcan across all 5 tissues to boost power

#!/bin/bash

#PBS -N multixcan
#PBS -S /bin/bash
#PBS -l walltime=4:00:00
#PBS -l mem=4gb
#PBS -l nodes=1:ppn=1

# SPECIFY LOGGING BEHAVIOR

#PBS -o /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/logs/${PBS_JOBNAME}.${PBS_JOBID}.log
#PBS -e /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/logs/${PBS_JOBNAME}.${PBS_JOBID}.err

module load gcc/6.2.0
source ~/.bashrc 
conda activate /gpfs/data/im-lab/nas40t2/bin/envs/tensorqtl/

echo "MultiXcan running on epifat"

python /gpfs/data/im-lab/nas40t2/natasha/GTEX_Analysis/MetaXcan/software/MulTiXcan.py \
        --expression_folder /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/expr \
        --expression_pattern "(.*).txt" \
        --input_phenos_file /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/metabolic_trait_phenos_MultiXcan.txt \
        --input_phenos_column bmi_bodylength_wo_tail \
        --output /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/results/bmi_bodylength_wo_tail_predict_assoc.txt \
        --pc_condition_number 10 \
        --mode linear \
        --verbosity 8 \
        --throw

Add Zscore to MutliXcan

Calculate most significant Zscore across all tisuses For each trait find most significant pvalue and take sign of that effect

pheno <- read_csv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/processed_obesity_rat_Palmer_phenotypes.csv", col_names=TRUE)

for(i in 2:ncol(pheno)) {
  trait <- colnames(pheno)[i]
  filelist <- list.files(data.dir %&% "prediXcan/metabolic_traits/associations/", pattern = trait %&% ".txt", full.names = TRUE)
  tempo <- data.frame(gene= as.character())
  for(fila in filelist) {
    tis <- substr(fila, 89, 90)
    df <- read_tsv(fila) %>% select(c(gene, effect, pvalue)) 
    new_eff <- paste("effect", tis, sep = "_")
    new_pval <- paste("pvalue", tis, sep = "_")
    colnames(df)[2] <- new_eff
    colnames(df)[3] <- new_pval 
    
   tempo <- full_join(tempo, df,  by = "gene")
  }
  most_sig = rowMins(as.matrix(tempo[,c(3,5,7,9,11)]))
  
  Ac <- tempo[na.omit(match(most_sig, tempo$pvalue_Ac)), c(1,2)] %>% rename(effect = effect_Ac )
  Il <- tempo[na.omit(match(most_sig, tempo$pvalue_Il)), c(1,4)] %>% rename(effect = effect_Il )
  Lh <- tempo[na.omit(match(most_sig, tempo$pvalue_Lh)), c(1,6)] %>% rename(effect = effect_Lh )
  Pl <- tempo[na.omit(match(most_sig, tempo$pvalue_Pl)), c(1,8)] %>% rename(effect = effect_Pl )
  Vo <- tempo[na.omit(match(most_sig, tempo$pvalue_Vo)), c(1,10)] %>% rename(effect = effect_Vo )
  
  df <- rbind(Ac, Il, Lh, Pl, Vo) 
  df <- df %>% mutate(sign = sign(effect))
  write_tsv(df, data.dir %&% "prediXcan/metabolic_traits/associations/most_sig_zscores/" %&% trait %&% "_avg_zscore.txt", col_names = FALSE)
}

Loci Analysis

devtools::source_gist("50a2bdc64e103e8321fefb9e712aa137")

gene_annot <- readRDS("/Users/natashasanthanam/Github/rat-genomic-analysis/data/gene_annotation.RDS") %>% select(c(chr, gene_id, gene_name, start, end))
height_loci <- full_df %>% filter(metabolic_trait == "Body length including tail") %>% filter(pvalue <= 9.279881e-06)

height_loci <- inner_join(gene_annot, height_loci %>% select(c(gene, pvalue)) %>% rename(gene_id = gene), by = "gene_id")
height_loci$chr = as.numeric(height_loci$chr)
height_loci <-  height_loci[order(height_loci$chr),]

height_distinct_loci <- fn_count_distinct_loci(height_loci)

bmi_loci <- full_df %>% filter(metabolic_trait == "Body Mass Index (BMI) with tail") %>% filter(pvalue <= 9.279881e-06)

bmi_loci <- inner_join(gene_annot, bmi_loci %>% select(c(gene, pvalue)) %>% rename(gene_id = gene), by = "gene_id")
bmi_loci$chr = as.numeric(bmi_loci$chr)
bmi_loci <-  bmi_loci[order(bmi_loci$chr),]

bmi_distinct_loci <- fn_count_distinct_loci(bmi_loci)

--- title: "03.PrediXcan_MultiXcan" author: "Natasha Santhanam" date: "2/7/2022" output: html_document --- ```{r setup, eval=FALSE} library(tidyverse) library(broom) library(data.table) library(RSQLite) library(qqman) library(ggrepel) library(devtools) devtools::source_gist("0ddc9c0ea03245bb30efbe3e899897be") "%&%" = function(a,b) paste(a,b,sep="") args = commandArgs(trailingOnly=TRUE) geno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/rat_genotypes_LD_pruned_0.95/" ``` ## Run PrediXcan with Metabolic Phenotype Data # Create model with right column names for PrediXcan (do for all 5 tissues) Sabrina's note: I have no idea what is going on here, I don't know where bimbam file came from. I tried running this script from genotype files generated from previous steps in the pipeline, but it seems like a different file was inputed below. We may be able to ignore this step for now, since we already have PrediXcan formatted genotype from an earlier step in the pipeline. ```{r create txt genotypes, eval=FALSE} filelist <- list.files(geno.dir, pattern = ".bimbam") #ids for rats are in the phenotype file under rat_rfid for(file in filelist) { tempo <- fread(geno.dir %&% file) tempo <- tempo %>% mutate(chr = numextract(sapply(strsplit(tempo$V1, ":"), `[`, 1)), .before = V1) %>% mutate(pos = numextract(sapply(strsplit(tempo$V1, ":"), `[`, 2)), .before = V2) %>% mutate(maf = 0, .before = V4) write_tsv(tempo, geno.dir %&% substr(file, 1, nchar(file) - 7) %&% ".txt", col_names = FALSE) } ``` ```{r change colnames of Ac prediction model, eval=FALSE} tis = "Ac" data.dir <- "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Results/" filename <- data.dir %&% "sql/" %&% tis %&% "_output_db.db" sqlite.driver <- dbDriver("SQLite") conn <- dbConnect(RSQLite::SQLite(), filename) extra <- dbGetQuery(conn, 'select * from extra') weights <- dbGetQuery(conn, 'select * from weights') dbDisconnect(conn) extra <- extra %>% select(c(gene, genename, n.snps, R2, pval)) %>% mutate(pred.perf.qval = NA) colnames(extra) <- c("gene", "genename", "n.snps.in.model", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval") cor_df <- read_tsv(data.dir %&% "all_results_" %&% tis, col_names = TRUE) %>% select(c(gene, cvm)) extra <- extra %>% filter(pred.perf.R2 > 0.01) extra <- full_join(extra, cor_df, by = "gene") extra <- extra %>% filter(cvm >= 0 | is.na(cvm)) %>% select(-c(cvm)) %>% filter(!is.na(pred.perf.R2)) weights <- weights %>% filter(gene %in% extra$gene) ``` ```{r create database connection, eval=FALSE} model_db = data.dir %&% "sql/" %&% tis %&% "_best_prediXcan_db.db" conn <- dbConnect(RSQLite::SQLite(), model_db) dbWriteTable(conn, "weights", weights) dbWriteTable(conn, "extra", extra) #check to see model is set up dbListTables(conn) dbGetQuery(conn, 'SELECT * FROM weights') %>% head dbGetQuery(conn, 'SELECT * FROM extra') %>% head dbDisconnect(conn) ``` Note: I don't know how these files were generated. ```{r filter samples for no overlap of rats, eval=FALSE} samples <- read_tsv(GENO %&% "samples_Rat_metab_phenos_file", col_names = FALSE) all_rats <- read_tsv(PHENO %&% "all_names.txt", col_names = TRUE) samples <- samples %>% filter(!(X1 %in% all_rats$ID)) write_tsv(samples, GENO %&% "samples_Rat_metab_abrv_phenos_file", col_names = FALSE) ``` # Do for all 5 tissues ```{bash run prediXcan, eval = FALSE} #run prediXcan conda activate imlabtools METAXCAN=/Users/sabrinami/Github/MetaXcan/software GENO=/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/data/rat_genotypes_LD_pruned_0.95 MODEL=/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/Results/sql OUTPUT=/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/Results/PrediXcan/metabolic_traits/ python $METAXCAN/Predict.py \ --model_db_path $MODEL/Ac_best_prediXcan_db.db \ --text_genotypes \ $GENO/chr*.round2_impute2_3473.txt \ --on_the_fly_mapping METADATA "{}_{}_{}_{}" \ --text_sample_ids $GENO/samples_Rat_metab_phenos_file \ --prediction_output $OUTPUT/rat_metabolic_Ac_best__predict.txt \ --prediction_summary_output $OUTPUT/rat_metabolic_Ac_best__summary.txt \ --throw ``` Note: Stop here, we have all the results needed to continue to PTRS analysis. ```{r } pred_expr <- read_tsv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/rat_metabolic_" %&% tis %&% "_best__predict.txt", col_names = TRUE) all_rats <- read_tsv(PHENO %&% "all_names.txt", col_names = TRUE) pred_expr <- pred_expr %>% filter(!(FID %in% all_rats$ID)) #write_tsv(pred_expr, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/rat_metabolic_" %&% tis %&% "_best__predict.txt", col_names = TRUE) ``` ```{r run associations, eval=FALSE} #run asssociation in prediXcan PHENO = "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/" RESULTS = "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/associations/" pheno <- read_csv(PHENO %&% "processed_obesity_rat_Palmer_phenotypes.csv", col_names = TRUE) %>% filter(!(rat_rfid %in% all_rats$ID)) write_tsv(pheno, PHENO %&% "processed_obesity_rat_Palmer_phenotypes_target_set.tsv", col_names = TRUE) for(i in 2:length(colnames(pheno))){ trait <- colnames(pheno)[i] runLOC <- "python3 " %&% METAXCAN %&% "/PrediXcanAssociation.py " %&% "--expression_file " %&% "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/rat_metabolic_" %&% "Ac" %&% "__predict.txt --input_phenos_file " %&% PHENO %&% "processed_obesity_rat_Palmer_phenotypes_target_set.tsv " %&% "--input_phenos_column " %&% trait %&% " --output " %&% RESULTS %&% "associations/" %&% "Ac" %&% "__association_" %&% trait %&% ".txt --verbosity 9 --throw" system(runLOC) } ``` ```{bash submit script for all tissues} Rscript --vanilla /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_trait_assoc_all_tis.R $tissue qsub -v tissue=$tis metabolic_assoc_all_tissues.pbs ``` Format and Plot PrediXcan Results ```{r format prediXcan assocs, eval=FALSE} results.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/associations" filelist <- list.files(results.dir, pattern = "Ac__association_", full.names = TRUE) full_df <- data.frame() for(fila in filelist) { assoc_fila <- read_tsv(fila, col_names = TRUE) pheno_id <- substr(fila, 104, (str_length(fila) - 4)) tempo <- cbind(assoc_fila, metabolic_trait=pheno_id) %>% select(-c(status)) full_df <- rbind(full_df, tempo) } #full_df <- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/rat_metabolic_traits_best_Ac_full_assocs.txt", col_names = TRUE) tempo_df <- full_df %>% filter(pvalue < 9.279881e-06) #428 sig genes tempo_df %>% group_by(gene) %>% summarise(n = n()) #all 11 traits tempo_df %>% group_by(metabolic_trait) %>% summarise(n = n()) ``` Filter prediXcan results for Supplementary Table ```{r} full_df <- full_df %>% filter(metabolic_trait == "bmi_bodylength_w_tail"|metabolic_trait == "bodylength_w_tail"| metabolic_trait == "bodyweight" | metabolic_trait == "fasting_glucose" | metabolic_trait == "epifat" | metabolic_trait == "retrofat" | metabolic_trait == "parafat") full_df <- full_df %>% mutate(gene_name = orth.rats[match(full_df$gene, orth.rats$rnorvegicus_homolog_ensembl_gene),4]$rnorvegicus_homolog_associated_gene_name, .before = effect) full_df$metabolic_trait[full_df$metabolic_trait == "bmi_bodylength_w_tail" ] <- "Body Mass Index (BMI) with tail" full_df$metabolic_trait[full_df$metabolic_trait == "bodylength_w_tail" ] <- "Body length including tail" full_df$metabolic_trait[full_df$metabolic_trait == "bodyweight" ] <- "Body weight" full_df$metabolic_trait[full_df$metabolic_trait == "fasting_glucose" ] <- "Fasting Glucose" full_df$metabolic_trait[full_df$metabolic_trait == "epifat" ] <- "Epididymal fat" full_df$metabolic_trait[full_df$metabolic_trait == "retrofat" ] <- "Retroperitoneal fat" full_df$metabolic_trait[full_df$metabolic_trait == "parafat" ] <- "Parametrial fat" full_df <- full_df %>% mutate(bf_sig = ifelse(full_df$pvalue <= 9.279881e-06, "Yes", "No")) ``` ```{r find genes that overlap in Humans, eval=FALSE} human_height_genes <- read_tsv("/Users/natashasanthanam/Downloads/Human_phenomeXcan_all_traits.txt", col_names = TRUE) human_height_genes <- human_height_genes %>% mutate(rat_gene = orth.rats[match(human_height_genes$gene_name, orth.rats$external_gene_name), 4]$rnorvegicus_homolog_associated_gene_name) %>% filter(pvalue_Height <= 0.01) human_bmi_genes <- read_tsv("/Users/natashasanthanam/Downloads/Human_phenomeXcan_all_traits.txt", col_names = TRUE) colnames(human_bmi_genes)[2] = "pvalue_BMI" human_bmi_genes <- human_bmi_genes %>% mutate(rat_gene = orth.rats[match(human_bmi_genes$gene_name, orth.rats$external_gene_name), 4]$rnorvegicus_homolog_associated_gene_name) %>% filter(pvalue_BMI <= 0.01 ) ``` ```{r plot prediXcan results as miami plot divided per trait} gene_annot <- readRDS("/Users/natashasanthanam/Github/rat-genomic-analysis/data/gene_annotation.RDS") %>% select(c("chr", "gene_id", "start", "end")) %>% rename(gene = gene_id) tempo_manhatt <- inner_join(gene_annot, full_df, by = "gene") tempo_manhatt$chr <- as.numeric(tempo_manhatt$chr) bmi_manhat <- tempo_manhatt %>% filter(metabolic_trait == "Body Mass Index (BMI) with tail") bmi_manhat <- bmi_manhat %>% mutate(gene_name = orth.rats[match(bmi_manhat$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 4]$rnorvegicus_homolog_associated_gene_name) height_manhat <- tempo_manhatt %>% filter(metabolic_trait == "Body length including tail") height_manhat <- height_manhat %>% mutate(gene_name = orth.rats[match(height_manhat$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 4]$rnorvegicus_homolog_associated_gene_name) ``` ```{r plot manhattan for BMI} data_cum <- bmi_manhat %>% group_by(chr) %>% summarise(max_bp = as.numeric(max(start))) %>% mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>% select(chr, bp_add) gwas_data <- bmi_manhat %>% inner_join(data_cum, by = "chr") %>% mutate(bp_cum = start + bp_add) axis_set <- gwas_data %>% group_by(chr) %>% summarize(center = mean(bp_cum)) ylim <- gwas_data %>% filter(pvalue == min(pvalue)) %>% mutate(ylim = abs(floor(log10(pvalue))) + 2) %>% pull(ylim) sig <- 0.05/(5388) bmi_manhplot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(pvalue), color = as_factor(chr), size = -log10(pvalue))) + geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") + geom_hline(yintercept = -log10(0.0001), color = "red", linetype = "dashed") + geom_point(alpha = 0.75, shape = ifelse((gwas_data$zscore >= 4.863456), 17, ifelse(gwas_data$zscore <= -4.863456, 25, 19)), fill = "dodgerblue4") + geom_label_repel(aes(label=ifelse((pvalue <= sig & gene_name %in% human_bmi_genes$rat_gene), gene_name, "")), size = 6) + ylim(c(0,8)) + scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) + scale_color_manual(values = rep(c("dodgerblue4", "midnightblue"), unique(length(axis_set$chr)))) + scale_size_continuous(range = c(0.5,3)) + labs(x = NULL, y = expression(-log[10](italic(p)))) + theme_minimal() + theme( legend.position = "none", panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), axis.text.x = element_text(angle = 90, size = 12), axis.text.y = element_text( size = 12, vjust = 0), axis.title = element_text(size = 20)) ``` ```{r plot manhattan for height} data_cum <- height_manhat %>% group_by(chr) %>% summarise(max_bp = as.numeric(max(start))) %>% mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>% select(chr, bp_add) gwas_data <- height_manhat %>% inner_join(data_cum, by = "chr") %>% mutate(bp_cum = start + bp_add) axis_set <- gwas_data %>% group_by(chr) %>% summarize(center = mean(bp_cum)) ylim <- gwas_data %>% filter(pvalue == min(pvalue)) %>% mutate(ylim = abs(floor(log10(pvalue))) + 2) %>% pull(ylim) sig <- 0.05/(5388) height_manhplot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(pvalue), color = as_factor(chr), size = -log10(pvalue))) + geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") + geom_hline(yintercept = -log10(0.0001), color = "red", linetype = "dashed") + geom_point(alpha = 0.75, shape = ifelse((gwas_data$zscore >= 4.863456), 17, ifelse(gwas_data$zscore <= -4.863456, 25, 19)), fill = "dodgerblue4") + geom_label_repel(aes(label=ifelse((pvalue <= sig & gene_name %in% human_height_genes$rat_gene), gene_name, "")), size = 6) + ylim(c(0,10)) + scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) + scale_color_manual(values = rep(c("dodgerblue4", "midnightblue"), unique(length(axis_set$chr)))) + scale_size_continuous(range = c(0.5,3)) + labs(x = NULL, y = expression(-log[10](italic(p)))) + theme_minimal() + theme( legend.position = "none", panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), axis.text.x = element_text(angle = 90, size = 12), axis.text.y = element_text( size = 12, vjust = 0), axis.title = element_text(size = 20)) ``` ## Run MultiXcan # Generate Folder with Predicted Expression Data for each Tissue First have to remove potential overlap between genotypes used in predicted expression and those in phenotype file. Should only be around 60ish so not too big a deal ```{r generate list of ids across all tissues, eval=FALSE} filelist <- list.files("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/expression", pattern = ".RDS", full.names = TRUE) all_names <- data.frame(ID = as.character()) for(fila in filelist) { tempo <- readRDS(fila) tempo <- as.data.frame(rownames(tempo)) %>% rename(ID = `rownames(tempo)`) all_names <- full_join(tempo, all_names, by = "ID") } ``` ```{r clean up pheno file, eval=FALSE} pheno <- read_csv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/processed_obesity_rat_Palmer_phenotypes.csv", col_names=TRUE) pheno <- pheno %>% rename(ID = rat_rfid) %>% filter(!ID %in% all_names$ID) ``` Next have to remove overlap rats predicted expression as well ```{r generate expr data, eval=FALSE} filelist <- list.files("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/", pattern = "__predict.txt", full.names = TRUE) for(fila in filelist) { tempo <- fread(fila, header=TRUE) name <- substr(fila, 90,91) tempo <- tempo %>% filter(!FID %in% all_names$ID) tempo <- tempo[match(pheno$ID, tempo$FID),] write_tsv(tempo, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/expr/" %&% name %&% ".txt") } ``` Run MultiXcan using the predicted expression from prediXcan across all 5 tissues to boost power ```{bash run MultiXcan} #!/bin/bash #PBS -N multixcan #PBS -S /bin/bash #PBS -l walltime=4:00:00 #PBS -l mem=4gb #PBS -l nodes=1:ppn=1 # SPECIFY LOGGING BEHAVIOR #PBS -o /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/logs/${PBS_JOBNAME}.${PBS_JOBID}.log #PBS -e /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/logs/${PBS_JOBNAME}.${PBS_JOBID}.err module load gcc/6.2.0 source ~/.bashrc conda activate /gpfs/data/im-lab/nas40t2/bin/envs/tensorqtl/ echo "MultiXcan running on epifat" python /gpfs/data/im-lab/nas40t2/natasha/GTEX_Analysis/MetaXcan/software/MulTiXcan.py \ --expression_folder /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/expr \ --expression_pattern "(.*).txt" \ --input_phenos_file /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/metabolic_trait_phenos_MultiXcan.txt \ --input_phenos_column bmi_bodylength_wo_tail \ --output /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/results/bmi_bodylength_wo_tail_predict_assoc.txt \ --pc_condition_number 10 \ --mode linear \ --verbosity 8 \ --throw ``` # Add Zscore to MutliXcan Calculate most significant Zscore across all tisuses For each trait find most significant pvalue and take sign of that effect ```{r find mean Z, eval=FALSE} pheno <- read_csv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/processed_obesity_rat_Palmer_phenotypes.csv", col_names=TRUE) for(i in 2:ncol(pheno)) { trait <- colnames(pheno)[i] filelist <- list.files(data.dir %&% "prediXcan/metabolic_traits/associations/", pattern = trait %&% ".txt", full.names = TRUE) tempo <- data.frame(gene= as.character()) for(fila in filelist) { tis <- substr(fila, 89, 90) df <- read_tsv(fila) %>% select(c(gene, effect, pvalue)) new_eff <- paste("effect", tis, sep = "_") new_pval <- paste("pvalue", tis, sep = "_") colnames(df)[2] <- new_eff colnames(df)[3] <- new_pval tempo <- full_join(tempo, df, by = "gene") } most_sig = rowMins(as.matrix(tempo[,c(3,5,7,9,11)])) Ac <- tempo[na.omit(match(most_sig, tempo$pvalue_Ac)), c(1,2)] %>% rename(effect = effect_Ac ) Il <- tempo[na.omit(match(most_sig, tempo$pvalue_Il)), c(1,4)] %>% rename(effect = effect_Il ) Lh <- tempo[na.omit(match(most_sig, tempo$pvalue_Lh)), c(1,6)] %>% rename(effect = effect_Lh ) Pl <- tempo[na.omit(match(most_sig, tempo$pvalue_Pl)), c(1,8)] %>% rename(effect = effect_Pl ) Vo <- tempo[na.omit(match(most_sig, tempo$pvalue_Vo)), c(1,10)] %>% rename(effect = effect_Vo ) df <- rbind(Ac, Il, Lh, Pl, Vo) df <- df %>% mutate(sign = sign(effect)) write_tsv(df, data.dir %&% "prediXcan/metabolic_traits/associations/most_sig_zscores/" %&% trait %&% "_avg_zscore.txt", col_names = FALSE) } ``` Loci Analysis ```{r function to count distinct genes} devtools::source_gist("50a2bdc64e103e8321fefb9e712aa137") ``` ```{r find distinct loci related to height} gene_annot <- readRDS("/Users/natashasanthanam/Github/rat-genomic-analysis/data/gene_annotation.RDS") %>% select(c(chr, gene_id, gene_name, start, end)) height_loci <- full_df %>% filter(metabolic_trait == "Body length including tail") %>% filter(pvalue <= 9.279881e-06) height_loci <- inner_join(gene_annot, height_loci %>% select(c(gene, pvalue)) %>% rename(gene_id = gene), by = "gene_id") height_loci$chr = as.numeric(height_loci$chr) height_loci <- height_loci[order(height_loci$chr),] height_distinct_loci <- fn_count_distinct_loci(height_loci) ``` ```{r find distinct loci related to BMI} bmi_loci <- full_df %>% filter(metabolic_trait == "Body Mass Index (BMI) with tail") %>% filter(pvalue <= 9.279881e-06) bmi_loci <- inner_join(gene_annot, bmi_loci %>% select(c(gene, pvalue)) %>% rename(gene_id = gene), by = "gene_id") bmi_loci$chr = as.numeric(bmi_loci$chr) bmi_loci <- bmi_loci[order(bmi_loci$chr),] bmi_distinct_loci <- fn_count_distinct_loci(bmi_loci) ```