library(tidyverse)
library(broom)
library(data.table)
library(RSQLite)
library(qqman)
library(ggrepel)
library(devtools)
::source_gist("0ddc9c0ea03245bb30efbe3e899897be")
devtools"%&%" = function(a,b) paste(a,b,sep="")
= commandArgs(trailingOnly=TRUE)
args <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/rat_genotypes_LD_pruned_0.95/" geno.dir
03.PrediXcan_MultiXcan
Run PrediXcan with Metabolic Phenotype Data
Create model with right column names for PrediXcan (do for all 5 tissues)
Sabrina’s note: I have no idea what is going on here, I don’t know where bimbam file came from. I tried running this script from genotype files generated from previous steps in the pipeline, but it seems like a different file was inputed below. We may be able to ignore this step for now, since we already have PrediXcan formatted genotype from an earlier step in the pipeline.
<- list.files(geno.dir, pattern = ".bimbam")
filelist #ids for rats are in the phenotype file under rat_rfid
for(file in filelist) {
<- fread(geno.dir %&% file)
tempo <- tempo %>% mutate(chr = numextract(sapply(strsplit(tempo$V1, ":"), `[`, 1)), .before = V1) %>% mutate(pos = numextract(sapply(strsplit(tempo$V1, ":"), `[`, 2)), .before = V2) %>% mutate(maf = 0, .before = V4)
tempo write_tsv(tempo, geno.dir %&% substr(file, 1, nchar(file) - 7) %&% ".txt", col_names = FALSE)
}
= "Ac"
tis <- "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Results/"
data.dir <- data.dir %&% "sql/" %&% tis %&% "_output_db.db"
filename <- dbDriver("SQLite")
sqlite.driver <- dbConnect(RSQLite::SQLite(), filename)
conn <- dbGetQuery(conn, 'select * from extra')
extra <- dbGetQuery(conn, 'select * from weights')
weights dbDisconnect(conn)
<- extra %>% select(c(gene, genename, n.snps, R2, pval)) %>% mutate(pred.perf.qval = NA)
extra colnames(extra) <- c("gene", "genename", "n.snps.in.model", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval")
<- read_tsv(data.dir %&% "all_results_" %&% tis, col_names = TRUE) %>% select(c(gene, cvm))
cor_df
<- extra %>% filter(pred.perf.R2 > 0.01)
extra
<- full_join(extra, cor_df, by = "gene")
extra <- extra %>% filter(cvm >= 0 | is.na(cvm)) %>% select(-c(cvm)) %>% filter(!is.na(pred.perf.R2))
extra <- weights %>% filter(gene %in% extra$gene) weights
= data.dir %&% "sql/" %&% tis %&% "_best_prediXcan_db.db"
model_db <- dbConnect(RSQLite::SQLite(), model_db)
conn dbWriteTable(conn, "weights", weights)
dbWriteTable(conn, "extra", extra)
#check to see model is set up
dbListTables(conn)
dbGetQuery(conn, 'SELECT * FROM weights') %>% head
dbGetQuery(conn, 'SELECT * FROM extra') %>% head
dbDisconnect(conn)
Note: I don’t know how these files were generated.
<- read_tsv(GENO %&% "samples_Rat_metab_phenos_file", col_names = FALSE)
samples <- read_tsv(PHENO %&% "all_names.txt", col_names = TRUE)
all_rats <- samples %>% filter(!(X1 %in% all_rats$ID))
samples write_tsv(samples, GENO %&% "samples_Rat_metab_abrv_phenos_file", col_names = FALSE)
Do for all 5 tissues
#run prediXcan
conda activate imlabtools
METAXCAN=/Users/sabrinami/Github/MetaXcan/software
GENO=/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/data/rat_genotypes_LD_pruned_0.95
MODEL=/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/Results/sql
OUTPUT=/Users/sabrinami/Box/imlab-data/data-Github/Rat_Genomics_Paper_Pipeline/Results/PrediXcan/metabolic_traits/
python $METAXCAN/Predict.py \
$MODEL/Ac_best_prediXcan_db.db \
--model_db_path \
--text_genotypes $GENO/chr*.round2_impute2_3473.txt \
"{}_{}_{}_{}" \
--on_the_fly_mapping METADATA $GENO/samples_Rat_metab_phenos_file \
--text_sample_ids $OUTPUT/rat_metabolic_Ac_best__predict.txt \
--prediction_output $OUTPUT/rat_metabolic_Ac_best__summary.txt \
--prediction_summary_output --throw
Note: Stop here, we have all the results needed to continue to PTRS analysis.
<- read_tsv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/rat_metabolic_" %&% tis %&% "_best__predict.txt", col_names = TRUE)
pred_expr
<- read_tsv(PHENO %&% "all_names.txt", col_names = TRUE)
all_rats <- pred_expr %>% filter(!(FID %in% all_rats$ID))
pred_expr
#write_tsv(pred_expr, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/rat_metabolic_" %&% tis %&% "_best__predict.txt", col_names = TRUE)
#run asssociation in prediXcan
= "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/"
PHENO = "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/associations/"
RESULTS
<- read_csv(PHENO %&% "processed_obesity_rat_Palmer_phenotypes.csv", col_names = TRUE) %>% filter(!(rat_rfid %in% all_rats$ID))
pheno write_tsv(pheno, PHENO %&% "processed_obesity_rat_Palmer_phenotypes_target_set.tsv", col_names = TRUE)
for(i in 2:length(colnames(pheno))){
<- colnames(pheno)[i]
trait <- "python3 " %&% METAXCAN %&% "/PrediXcanAssociation.py " %&% "--expression_file " %&% "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/rat_metabolic_" %&% "Ac" %&% "__predict.txt --input_phenos_file " %&% PHENO %&% "processed_obesity_rat_Palmer_phenotypes_target_set.tsv " %&% "--input_phenos_column " %&% trait %&% " --output " %&% RESULTS %&% "associations/" %&% "Ac" %&% "__association_" %&% trait %&% ".txt --verbosity 9 --throw"
runLOC system(runLOC)
}
Rscript --vanilla /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_trait_assoc_all_tis.R $tissue
qsub -v tissue=$tis metabolic_assoc_all_tissues.pbs
Format and Plot PrediXcan Results
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/associations"
results.dir <- list.files(results.dir, pattern = "Ac__association_", full.names = TRUE)
filelist <- data.frame()
full_df
for(fila in filelist) {
<- read_tsv(fila, col_names = TRUE)
assoc_fila <- substr(fila, 104, (str_length(fila) - 4))
pheno_id <- cbind(assoc_fila, metabolic_trait=pheno_id) %>% select(-c(status))
tempo <- rbind(full_df, tempo)
full_df
} #full_df <- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/rat_metabolic_traits_best_Ac_full_assocs.txt", col_names = TRUE)
<- full_df %>% filter(pvalue < 9.279881e-06)
tempo_df
#428 sig genes
%>% group_by(gene) %>% summarise(n = n())
tempo_df
#all 11 traits
%>% group_by(metabolic_trait) %>% summarise(n = n()) tempo_df
Filter prediXcan results for Supplementary Table
<- full_df %>% filter(metabolic_trait == "bmi_bodylength_w_tail"|metabolic_trait == "bodylength_w_tail"| metabolic_trait == "bodyweight" | metabolic_trait == "fasting_glucose" | metabolic_trait == "epifat" | metabolic_trait == "retrofat" | metabolic_trait == "parafat")
full_df
<- full_df %>% mutate(gene_name = orth.rats[match(full_df$gene, orth.rats$rnorvegicus_homolog_ensembl_gene),4]$rnorvegicus_homolog_associated_gene_name, .before = effect)
full_df
$metabolic_trait[full_df$metabolic_trait == "bmi_bodylength_w_tail" ] <- "Body Mass Index (BMI) with tail"
full_df$metabolic_trait[full_df$metabolic_trait == "bodylength_w_tail" ] <- "Body length including tail"
full_df$metabolic_trait[full_df$metabolic_trait == "bodyweight" ] <- "Body weight"
full_df$metabolic_trait[full_df$metabolic_trait == "fasting_glucose" ] <- "Fasting Glucose"
full_df$metabolic_trait[full_df$metabolic_trait == "epifat" ] <- "Epididymal fat"
full_df$metabolic_trait[full_df$metabolic_trait == "retrofat" ] <- "Retroperitoneal fat"
full_df$metabolic_trait[full_df$metabolic_trait == "parafat" ] <- "Parametrial fat"
full_df
<- full_df %>% mutate(bf_sig = ifelse(full_df$pvalue <= 9.279881e-06, "Yes", "No")) full_df
<- read_tsv("/Users/natashasanthanam/Downloads/Human_phenomeXcan_all_traits.txt", col_names = TRUE)
human_height_genes <- human_height_genes %>% mutate(rat_gene = orth.rats[match(human_height_genes$gene_name, orth.rats$external_gene_name), 4]$rnorvegicus_homolog_associated_gene_name) %>% filter(pvalue_Height <= 0.01)
human_height_genes
<- read_tsv("/Users/natashasanthanam/Downloads/Human_phenomeXcan_all_traits.txt", col_names = TRUE)
human_bmi_genes colnames(human_bmi_genes)[2] = "pvalue_BMI"
<- human_bmi_genes %>% mutate(rat_gene = orth.rats[match(human_bmi_genes$gene_name, orth.rats$external_gene_name), 4]$rnorvegicus_homolog_associated_gene_name) %>% filter(pvalue_BMI <= 0.01 ) human_bmi_genes
<- readRDS("/Users/natashasanthanam/Github/rat-genomic-analysis/data/gene_annotation.RDS") %>% select(c("chr", "gene_id", "start", "end")) %>% rename(gene = gene_id)
gene_annot
<- inner_join(gene_annot, full_df, by = "gene")
tempo_manhatt $chr <- as.numeric(tempo_manhatt$chr)
tempo_manhatt
<- tempo_manhatt %>% filter(metabolic_trait == "Body Mass Index (BMI) with tail")
bmi_manhat <- bmi_manhat %>% mutate(gene_name = orth.rats[match(bmi_manhat$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 4]$rnorvegicus_homolog_associated_gene_name)
bmi_manhat
<- tempo_manhatt %>% filter(metabolic_trait == "Body length including tail")
height_manhat <- height_manhat %>% mutate(gene_name = orth.rats[match(height_manhat$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 4]$rnorvegicus_homolog_associated_gene_name) height_manhat
<- bmi_manhat %>%
data_cum group_by(chr) %>%
summarise(max_bp = as.numeric(max(start))) %>%
mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>%
select(chr, bp_add)
<- bmi_manhat %>%
gwas_data inner_join(data_cum, by = "chr") %>%
mutate(bp_cum = start + bp_add)
<- gwas_data %>%
axis_set group_by(chr) %>%
summarize(center = mean(bp_cum))
<- gwas_data %>%
ylim filter(pvalue == min(pvalue)) %>%
mutate(ylim = abs(floor(log10(pvalue))) + 2) %>%
pull(ylim)
<- 0.05/(5388)
sig
<- ggplot(gwas_data, aes(x = bp_cum, y = -log10(pvalue),
bmi_manhplot color = as_factor(chr), size = -log10(pvalue))) +
geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
geom_hline(yintercept = -log10(0.0001), color = "red", linetype = "dashed") +
geom_point(alpha = 0.75, shape = ifelse((gwas_data$zscore >= 4.863456), 17, ifelse(gwas_data$zscore <= -4.863456, 25, 19)), fill = "dodgerblue4") +
geom_label_repel(aes(label=ifelse((pvalue <= sig & gene_name %in% human_bmi_genes$rat_gene), gene_name, "")), size = 6) +
ylim(c(0,8)) +
scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) +
scale_color_manual(values = rep(c("dodgerblue4", "midnightblue"), unique(length(axis_set$chr)))) +
scale_size_continuous(range = c(0.5,3)) +
labs(x = NULL,
y = expression(-log[10](italic(p)))) +
theme_minimal() +
theme(
legend.position = "none",
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.text.x = element_text(angle = 90, size = 12),
axis.text.y = element_text( size = 12, vjust = 0),
axis.title = element_text(size = 20))
<- height_manhat %>%
data_cum group_by(chr) %>%
summarise(max_bp = as.numeric(max(start))) %>%
mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>%
select(chr, bp_add)
<- height_manhat %>%
gwas_data inner_join(data_cum, by = "chr") %>%
mutate(bp_cum = start + bp_add)
<- gwas_data %>%
axis_set group_by(chr) %>%
summarize(center = mean(bp_cum))
<- gwas_data %>%
ylim filter(pvalue == min(pvalue)) %>%
mutate(ylim = abs(floor(log10(pvalue))) + 2) %>%
pull(ylim)
<- 0.05/(5388)
sig
<- ggplot(gwas_data, aes(x = bp_cum, y = -log10(pvalue),
height_manhplot color = as_factor(chr), size = -log10(pvalue))) +
geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
geom_hline(yintercept = -log10(0.0001), color = "red", linetype = "dashed") +
geom_point(alpha = 0.75, shape = ifelse((gwas_data$zscore >= 4.863456), 17, ifelse(gwas_data$zscore <= -4.863456, 25, 19)), fill = "dodgerblue4") +
geom_label_repel(aes(label=ifelse((pvalue <= sig & gene_name %in% human_height_genes$rat_gene), gene_name, "")), size = 6) +
ylim(c(0,10)) +
scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) +
scale_color_manual(values = rep(c("dodgerblue4", "midnightblue"), unique(length(axis_set$chr)))) +
scale_size_continuous(range = c(0.5,3)) +
labs(x = NULL,
y = expression(-log[10](italic(p)))) +
theme_minimal() +
theme(
legend.position = "none",
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.text.x = element_text(angle = 90, size = 12),
axis.text.y = element_text( size = 12, vjust = 0),
axis.title = element_text(size = 20))
Run MultiXcan
Generate Folder with Predicted Expression Data for each Tissue
First have to remove potential overlap between genotypes used in predicted expression and those in phenotype file. Should only be around 60ish so not too big a deal
<- list.files("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/expression", pattern = ".RDS", full.names = TRUE)
filelist <- data.frame(ID = as.character())
all_names for(fila in filelist) {
<- readRDS(fila)
tempo <- as.data.frame(rownames(tempo)) %>% rename(ID = `rownames(tempo)`)
tempo <- full_join(tempo, all_names, by = "ID")
all_names }
<- read_csv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/processed_obesity_rat_Palmer_phenotypes.csv", col_names=TRUE)
pheno
<- pheno %>% rename(ID = rat_rfid) %>% filter(!ID %in% all_names$ID) pheno
Next have to remove overlap rats predicted expression as well
<- list.files("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/prediXcan/metabolic_traits/", pattern = "__predict.txt", full.names = TRUE)
filelist
for(fila in filelist) {
<- fread(fila, header=TRUE)
tempo <- substr(fila, 90,91)
name <- tempo %>% filter(!FID %in% all_names$ID)
tempo <- tempo[match(pheno$ID, tempo$FID),]
tempo write_tsv(tempo, "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/expr/" %&% name %&% ".txt")
}
Run MultiXcan using the predicted expression from prediXcan across all 5 tissues to boost power
#!/bin/bash
#PBS -N multixcan
#PBS -S /bin/bash
#PBS -l walltime=4:00:00
#PBS -l mem=4gb
#PBS -l nodes=1:ppn=1
# SPECIFY LOGGING BEHAVIOR
#PBS -o /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/logs/${PBS_JOBNAME}.${PBS_JOBID}.log
#PBS -e /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/logs/${PBS_JOBNAME}.${PBS_JOBID}.err
module load gcc/6.2.0
source ~/.bashrc
conda activate /gpfs/data/im-lab/nas40t2/bin/envs/tensorqtl/
echo "MultiXcan running on epifat"
python /gpfs/data/im-lab/nas40t2/natasha/GTEX_Analysis/MetaXcan/software/MulTiXcan.py \
--expression_folder /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/expr \
--expression_pattern "(.*).txt" \
--input_phenos_file /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/metabolic_trait_phenos_MultiXcan.txt \
--input_phenos_column bmi_bodylength_wo_tail \
--output /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/MultiXcan/results/bmi_bodylength_wo_tail_predict_assoc.txt \
--pc_condition_number 10 \
--mode linear \
--verbosity 8 \
--throw
Add Zscore to MutliXcan
Calculate most significant Zscore across all tisuses For each trait find most significant pvalue and take sign of that effect
<- read_csv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Box_files/processed_obesity_rat_Palmer_phenotypes.csv", col_names=TRUE)
pheno
for(i in 2:ncol(pheno)) {
<- colnames(pheno)[i]
trait <- list.files(data.dir %&% "prediXcan/metabolic_traits/associations/", pattern = trait %&% ".txt", full.names = TRUE)
filelist <- data.frame(gene= as.character())
tempo for(fila in filelist) {
<- substr(fila, 89, 90)
tis <- read_tsv(fila) %>% select(c(gene, effect, pvalue))
df <- paste("effect", tis, sep = "_")
new_eff <- paste("pvalue", tis, sep = "_")
new_pval colnames(df)[2] <- new_eff
colnames(df)[3] <- new_pval
<- full_join(tempo, df, by = "gene")
tempo
}= rowMins(as.matrix(tempo[,c(3,5,7,9,11)]))
most_sig
<- tempo[na.omit(match(most_sig, tempo$pvalue_Ac)), c(1,2)] %>% rename(effect = effect_Ac )
Ac <- tempo[na.omit(match(most_sig, tempo$pvalue_Il)), c(1,4)] %>% rename(effect = effect_Il )
Il <- tempo[na.omit(match(most_sig, tempo$pvalue_Lh)), c(1,6)] %>% rename(effect = effect_Lh )
Lh <- tempo[na.omit(match(most_sig, tempo$pvalue_Pl)), c(1,8)] %>% rename(effect = effect_Pl )
Pl <- tempo[na.omit(match(most_sig, tempo$pvalue_Vo)), c(1,10)] %>% rename(effect = effect_Vo )
Vo
<- rbind(Ac, Il, Lh, Pl, Vo)
df <- df %>% mutate(sign = sign(effect))
df write_tsv(df, data.dir %&% "prediXcan/metabolic_traits/associations/most_sig_zscores/" %&% trait %&% "_avg_zscore.txt", col_names = FALSE)
}
Loci Analysis
::source_gist("50a2bdc64e103e8321fefb9e712aa137") devtools
<- readRDS("/Users/natashasanthanam/Github/rat-genomic-analysis/data/gene_annotation.RDS") %>% select(c(chr, gene_id, gene_name, start, end))
gene_annot <- full_df %>% filter(metabolic_trait == "Body length including tail") %>% filter(pvalue <= 9.279881e-06)
height_loci
<- inner_join(gene_annot, height_loci %>% select(c(gene, pvalue)) %>% rename(gene_id = gene), by = "gene_id")
height_loci $chr = as.numeric(height_loci$chr)
height_loci<- height_loci[order(height_loci$chr),]
height_loci
<- fn_count_distinct_loci(height_loci) height_distinct_loci
<- full_df %>% filter(metabolic_trait == "Body Mass Index (BMI) with tail") %>% filter(pvalue <= 9.279881e-06)
bmi_loci
<- inner_join(gene_annot, bmi_loci %>% select(c(gene, pvalue)) %>% rename(gene_id = gene), by = "gene_id")
bmi_loci $chr = as.numeric(bmi_loci$chr)
bmi_loci<- bmi_loci[order(bmi_loci$chr),]
bmi_loci
<- fn_count_distinct_loci(bmi_loci) bmi_distinct_loci