library(tidyverse)
library(devtools)
library(broom)
library(data.table)
02. Prediction_Model_Pipeline
Definitions
Data from here - genoGex.Rdata has everything we need in it There are 5 ‘gex’ RDS files which are the gene expressions for the 5 different tissues, the ‘gtf’ is the gene annotation, ‘phyMap’ is the snp annotation, and ‘geno’ is the genotype matrix
Our pipeline predicts expressions from the gene expression data and genotypes of the rats from the study.
load("~/Box/imlab-data/Projects/PTRS-PGRS-Rosetta/Data-From-Abe-Palmer-Lab/Rdata/genoGex.RData")
Gene Expression Files
First, we transpose each tissue’s gene expression file to fit the format expected by the PrediXcan pipeline scripts.
#transposing gene expression files for the 5 tissues
= gexAc$EnsemblGeneID
n <- as.data.frame(t(gexAc[,-1]))
gexAc_transpose colnames(gexAc_transpose) <- n
= gexIl$EnsemblGeneID
n <- as.data.frame(t(gexIl[,-1]))
gexIl_transpose colnames(gexIl_transpose) <- n
= gexLh$EnsemblGeneID
n <- as.data.frame(t(gexLh[,-1]))
gexLh_transpose colnames(gexLh_transpose) <- n
= gexPl$EnsemblGeneID
n <- as.data.frame(t(gexPl[,-1]))
gexPl_transpose colnames(gexPl_transpose) <- n
= gexVo$EnsemblGeneID
n <- as.data.frame(t(gexVo[,-1]))
gexVo_transpose colnames(gexVo_transpose) <- n
# Running inverse normalization on each gene expression
= function(x) {
invnorm if(is.null(dim(x))) res = invnorm.vector(x) else
=apply(x,2,invnorm.vector)
res
res
}= function(x) {yy = rank(x)/(length(x)+1); qnorm(yy)}
invnorm.vector = invnorm(gexAc_transpose)
gexAc_transpose = invnorm(gexIl_transpose)
gexIl_transpose = invnorm(gexLh_transpose)
gexLh_transpose = invnorm(gexPl_transpose)
gexPl_transpose = invnorm(gexVo_transpose) gexVo_transpose
Write to file.
# Writing the gene expression files to csv files to be used for PEER Factor analysis
write.table(gexAc_transpose, file = '/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gexAc.csv', sep = ",", col.names = TRUE, row.names = FALSE)
write.table(gexIl_transpose, file = '/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gexIl.csv', sep = ",", col.names = TRUE, row.names = FALSE)
write.table(gexLh_transpose, file = '/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gexLh.csv', sep = ",", col.names = TRUE, row.names = FALSE)
write.table(gexPl_transpose, file = '/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gexPl.csv', sep = ",", col.names = TRUE, row.names = FALSE)
write.table(gexVo_transpose, file = '/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gexVo.csv', sep = ",", col.names = TRUE, row.names = FALSE)
PEER analysis
Now we are performing PEER factor analysis on each tissue, choosing 7 factors.
peertool -f data/"gexAc.csv" -n 7 -o peer_Ac --has_header
peertool -f data/"gexIl.csv" -n 7 -o peer_Il --has_header
peertool -f data/"gexLh.csv" -n 7 -o peer_Lh --has_header
peertool -f data/"gexPl.csv" -n 7 -o peer_Pl --has_header
peertool -f data/"gexVo.csv" -n 7 -o peer_Vo --has_header
Later on, we examine these 7 factors, as well as other covariates, to interpret expression variability. # Gene, snp annotation files The prediction model pipeline also requires a gene annotation file as input. The code below generates it from the gene annotations provided by Palmer lab in ‘gtf’. We also collect snp info.
$gene_type = sub(".*?gene_biotype(.*?);.*", "\\1", gtf$Attr)
gtf$gene_name = sub(".*?gene_name(.*?);.*", "\\1", gtf$Attr)
gtf= subset(gtf, select = -c(Source, Feature, Score, Strand, Attr, Frame) )
gene_annotation = gene_annotation[, c("Chr","Gene", "gene_name", "Start", "End", "gene_type" )]
gene_annotation colnames(gene_annotation) = c("chr", "gene_id", "gene_name", "start", "end")
rownames(gene_annotation) = gene_annotation$gene_id
We have all the information needed to generate the predictions models. We are left to reorganize it to fit the pipeline. The specifics of each step is commented at the top of each block.
# Making the snp annotation in the correct format for the pipeline
<- within(phyMap, varID <- paste(Chr, Pos, Ref, Alt, sep="_"))
phyMap rownames(phyMap) = phyMap$varID
$rsid = phyMap$varID
phyMapcolnames(phyMap) = c("snp", "chr", "pos", "refAllele", "effectAllele", 'varID', "rsid")
# Splitting the snp annotation file by chromosome
<- setNames(split(phyMap, phyMap$chr), paste0("snp_annot.chr", unique(phyMap$chr)))
s list2env(s, globalenv())
The new genotype file combines the provided geno
file and combines information from the provided snp annotation file, phyMap
.
# writing the genotype file to a .txt file so that we can separate it by chromosome using our geneotype parse script.
rownames(geno) = rownames(phyMap)
write.table(geno, file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/genotype.txt", sep = "\t", col.names = TRUE, row.names = TRUE)
This python script separates the genotype file by chromosome.
#Splitting the genotype file by chromosome - run this from the rat_genomic_alaysis directory
python scripts/split_genotype_by_chr.py data/genotype.txt data/geno_by_chr/'genotype'
Covariate Files
We analyze sex, batch number, and batch center, as possible covariates, along with the 7 PEER factors.
# Loading the phenotype file in to create covariate files. For this we are selecting sex, batch number, and batch center as covariates as well as the 7 PEER factors we generate
load("~/Box/imlab-data/Projects/PTRS-PGRS-Rosetta/Data-From-Abe-Palmer-Lab/Final_P50_traits/P50_raw_trait_values.RData")
= raw_traits[match(rownames(gexAc_transpose), raw_traits$rfid), ]
covariatesAc = subset(covariatesAc, select = c(rfid, sex, batchnumber, center))
covariatesAc = raw_traits[match(rownames(gexIl_transpose), raw_traits$rfid), ]
covariatesIl = subset(covariatesIl, select = c(rfid, sex, batchnumber, center))
covariatesIl = raw_traits[match(rownames(gexLh_transpose), raw_traits$rfid), ]
covariatesLh = subset(covariatesLh, select = c(rfid, sex, batchnumber, center))
covariatesLh = raw_traits[match(rownames(gexPl_transpose), raw_traits$rfid), ]
covariatesPl = subset(covariatesPl, select = c(rfid, sex, batchnumber, center))
covariatesPl = raw_traits[match(rownames(gexVo_transpose), raw_traits$rfid), ]
covariatesVo = subset(covariatesVo, select = c(rfid, sex, batchnumber, center)) covariatesVo
# Reading the PEER factor output files to be appended to the covariate file and eventually regressed out of the expression files
= read.csv(file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/peer_Ac/X.csv", header = FALSE)
peer_factorsAc = read.csv(file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/peer_Il/X.csv", header = FALSE)
peer_factorsIl = read.csv(file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/peer_Lh/X.csv", header = FALSE)
peer_factorsLh = read.csv(file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/peer_Pl/X.csv", header = FALSE)
peer_factorsPl = read.csv(file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/peer_Vo/X.csv", header = FALSE) peer_factorsVo
For each tissue’s PEER factor output, set individual IDs as rownames and enumerate the PEER factors in the columns.
# Manipulating the PEER factor files so we can append to covariate file
colnames(peer_factorsAc) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
rownames(peer_factorsAc) = rownames(gexAc_transpose)
colnames(peer_factorsIl) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
rownames(peer_factorsIl) = rownames(gexIl_transpose)
colnames(peer_factorsLh) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
rownames(peer_factorsLh) = rownames(gexLh_transpose)
colnames(peer_factorsPl) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
rownames(peer_factorsPl) = rownames(gexPl_transpose)
colnames(peer_factorsVo) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
rownames(peer_factorsVo) = rownames(gexVo_transpose)
#initializing matrices to be filled with t-stats, p_vals, and residuals of the regression of each gene vs. the covariates for each tissue. t-stat and p-val matrices are just for diagnostics
#t_statsAc = matrix(nrow = 13, ncol = length(colnames(gexAc_transpose)))
#p_valsAc = matrix(nrow = 13, ncol = length(colnames(gexAc_transpose)))
= gexAc_transpose
expressionAc
#t_statsIl = matrix(nrow = 13, ncol = length(colnames(gexIl_transpose)))
#p_valsIl = matrix(nrow = 13, ncol = length(colnames(gexIl_transpose)))
= gexIl_transpose
expressionIl
#t_statsLh = matrix(nrow = 13, ncol = length(colnames(gexLh_transpose)))
#p_valsLh = matrix(nrow = 13, ncol = length(colnames(gexLh_transpose)))
= gexLh_transpose
expressionLh
#t_statsPl = matrix(nrow = 13, ncol = length(colnames(gexPl_transpose)))
#p_valsPl = matrix(nrow = 13, ncol = length(colnames(gexPl_transpose)))
= gexPl_transpose
expressionPl
#t_statsVo = matrix(nrow = 13, ncol = length(colnames(gexVo_transpose)))
#p_valsVo = matrix(nrow = 13, ncol = length(colnames(gexVo_transpose)))
= gexVo_transpose expressionVo
In the following code, we regress out the covariates and save the residuals as the new expression for each tissue.
for (i in 1:length(colnames(gexAc_transpose))) {
= lm(gexAc_transpose[,i] ~ covariatesAc$sex + covariatesAc$batchnumber + peer_factorsAc$PF1 + peer_factorsAc$PF2 + peer_factorsAc$PF3 + peer_factorsAc$PF4 + peer_factorsAc$PF5 + peer_factorsAc$PF6 + peer_factorsAc$PF7)
fit <- fit$residuals
expressionAc[,i] #t_statsAc[,i] <- tidy(fit)$statistic
#p_valsAc[,i] <- tidy(fit)$p.value
}
for (i in 1:length(colnames(gexIl_transpose))) {
= lm(gexIl_transpose[,i] ~ covariatesIl$sex + covariatesIl$batchnumber + peer_factorsIl$PF1 + peer_factorsIl$PF2 + peer_factorsIl$PF3 + peer_factorsIl$PF4 + peer_factorsIl$PF5 + peer_factorsIl$PF6 + peer_factorsIl$PF7)
fit <- fit$residuals
expressionIl[,i] #t_statsAc[,i] <- tidy(fit)$statistic
#p_valsAc[,i] <- tidy(fit)$p.value
}
for (i in 1:length(colnames(gexLh_transpose))) {
= lm(gexLh_transpose[,i] ~ covariatesLh$sex + covariatesLh$batchnumber + peer_factorsLh$PF1 + peer_factorsLh$PF2 + peer_factorsLh$PF3 + peer_factorsLh$PF4 + peer_factorsLh$PF5 + peer_factorsLh$PF6 + peer_factorsLh$PF7)
fit <- fit$residuals
expressionLh[,i] #t_statsAc[,i] <- tidy(fit)$statistic
#p_valsAc[,i] <- tidy(fit)$p.value
}
for (i in 1:length(colnames(gexPl_transpose))) {
= lm(gexPl_transpose[,i] ~ covariatesPl$sex + covariatesPl$batchnumber + peer_factorsPl$PF1 + peer_factorsPl$PF2 + peer_factorsPl$PF3 + peer_factorsPl$PF4 + peer_factorsPl$PF5 + peer_factorsPl$PF6 + peer_factorsPl$PF7)
fit <- fit$residuals
expressionPl[,i] #t_statsAc[,i] <- tidy(fit)$statistic
#p_valsAc[,i] <- tidy(fit)$p.value
}
for (i in 1:length(colnames(gexVo_transpose))) {
= lm(gexVo_transpose[,i] ~ covariatesVo$sex + covariatesVo$batchnumber + peer_factorsVo$PF1 + peer_factorsVo$PF2 + peer_factorsVo$PF3 + peer_factorsVo$PF4 + peer_factorsVo$PF5 + peer_factorsVo$PF6 + peer_factorsVo$PF7)
fit <- fit$residuals
expressionVo[,i] #t_statsAc[,i] <- tidy(fit)$statistic
#p_valsAc[,i] <- tidy(fit)$p.value
}
Write the processed expression data to file.
# Save expression as tsv
<- as.data.frame(expressionAc) %>% mutate(ID = rownames(expressionAc), .before = colnames(expressionAc))
Ac_expr
<- as.data.frame(expressionIl) %>% mutate(ID = rownames(expressionIl), .before = colnames(expressionIl))
Il_expr
<- as.data.frame(expressionLh) %>% mutate(ID = rownames(expressionLh), .before = colnames(expressionLh))
Lh_expr
<- as.data.frame(expressionPl) %>% mutate(ID = rownames(expressionPl), .before = colnames(expressionPl))
Pl_expr
<- as.data.frame(expressionVo) %>% mutate(ID = rownames(expressionVo), .before = colnames(expressionVo))
Vo_expr
"%&%" = function(a,b) paste(a,b,sep="")
<- list(Ac_expr, Il_expr, Lh_expr, Pl_expr, Vo_expr)
exprlist <- c("Ac", "Il", "Lh", "Pl", "Vo")
tis = 1
i for(l in exprlist) {
write_tsv(l, "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/"
%&% tis[i] %&% "_expression_transformed.tsv", col_names = TRUE)
<- i+1
i }
Save the expression RDS objects to be used as arguments in the script.
saveRDS(as.matrix(expressionAc), "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Ac_expression_transformed.RDS")
saveRDS(as.matrix(expressionIl), "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Il_expression_transformed.RDS")
saveRDS(as.matrix(expressionLh), "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Lh_expression_transformed.RDS")
saveRDS(as.matrix(expressionPl), "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Pl_expression_transformed.RDS")
saveRDS(as.matrix(expressionVo), "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Vo_expression_transformed.RDS")
Save the gene and SNP annotation as RDS objects to be used as arguments in the script.
#
<- list(snp_annot.chr1, snp_annot.chr2, snp_annot.chr3, snp_annot.chr4, snp_annot.chr5, snp_annot.chr6, snp_annot.chr7, snp_annot.chr8, snp_annot.chr9, snp_annot.chr10, snp_annot.chr11, snp_annot.chr12, snp_annot.chr13, snp_annot.chr14, snp_annot.chr15, snp_annot.chr16, snp_annot.chr17, snp_annot.chr18, snp_annot.chr19, snp_annot.chr20)
snp_files = 1
i for(l in snp_files) {
saveRDS(l, "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/snp_annot/" %&% "snp_annot.chr" %&% i %&% ".RDS")
<- i+1
i }
# Saving the gene annotation RDS object to be used as an argument in the script
saveRDS(gene_annotation, "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gene_annotation.RDS")
Metadata Files
# Creating the meta data file for each tissue
python scripts/create_meta_data.py --geno "data/genotype.txt" --expr "Ac_expression_transformed.tsv" --alpha 1 --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/Ac"
python scripts/create_meta_data.py --geno "data/genotype.txt" --expr "Il_expression_transformed.tsv" --alpha 1 --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/Il"
python scripts/create_meta_data.py --geno "data/genotype.txt" --expr "Lh_expression_transformed.tsv" --alpha 1 --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/Lh"
python scripts/create_meta_data.py --geno "data/genotype.txt" --expr "Pl_expression_transformed.tsv" --alpha 1 --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/Pl"
python scripts/create_meta_data.py --geno "data/genotype.txt" --expr "Vo_expression_transformed.tsv" --alpha 1 --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/Vo"
# Running the model training script for each tissue/chromosome pair
cd /Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline
for i in {1..20}
do
Rscript scripts/create_model.R 'Ac' $i 0.5 1000000
Rscript scripts/create_model.R 'Il' $i 0.5 1000000
Rscript scripts/create_model.R 'Lh' $i 0.5 1000000
Rscript scripts/create_model.R 'Pl' $i 0.5 1000000
Rscript scripts/create_model.R 'Vo' $i 0.5 1000000
done
# Concatenating all of the results files for each tissue
bash scripts/make_all_results.sh 'Ac' 'Results/all_results_Ac' 0.5 '1KG_snps'
bash scripts/make_all_betas.sh 'Ac' 'Results/all_betas_Ac' 0.5 '1KG_snps'
bash scripts/make_all_logs.sh 'Ac' 'Results/all_logs_Ac'
bash scripts/make_all_covariances.sh 'Ac' 'Results/all_covariances_Ac' 0.5 '1KG_snps'
bash scripts/make_all_results.sh 'Il' 'Results/all_results_Il' 0.5 '1KG_snps'
bash scripts/make_all_betas.sh 'Il' 'Results/all_betas_Il' 0.5 '1KG_snps'
bash scripts/make_all_logs.sh 'Il' 'Results/all_logs_Il'
bash scripts/make_all_covariances.sh 'Il' 'Results/all_covariances_Il' 0.5 '1KG_snps'
bash scripts/make_all_results.sh 'Lh' 'Results/all_results_Lh' 0.5 '1KG_snps'
bash scripts/make_all_betas.sh 'Lh' 'Results/all_betas_Lh' 0.5 '1KG_snps'
bash scripts/make_all_logs.sh 'Lh' 'Results/all_logs_Lh'
bash scripts/make_all_covariances.sh 'Lh' 'Results/all_covariances_Lh' 0.5 '1KG_snps'
bash scripts/make_all_results.sh 'Pl' 'Results/all_results_Pl' 0.5 '1KG_snps'
bash scripts/make_all_betas.sh 'Pl' 'Results/all_betas_Pl' 0.5 '1KG_snps'
bash scripts/make_all_logs.sh 'Pl' 'Results/all_logs_Pl'
bash scripts/make_all_covariances.sh 'Pl' 'Results/all_covariances_Pl' 0.5 '1KG_snps'
bash scripts/make_all_results.sh 'Vo' 'Results/all_results_Vo' 0.5 '1KG_snps'
bash scripts/make_all_betas.sh 'Vo' 'Results/all_betas_Vo' 0.5 '1KG_snps'
bash scripts/make_all_logs.sh 'Vo' 'Results/all_logs_Vo'
bash scripts/make_all_covariances.sh 'Vo' 'Results/all_covariances_Vo' 0.5 '1KG_snps'
# Putting these into sql lite databases
python scripts/make_sqlite_db.py --output "Results/sql/Ac_output_db.db" --results "Results/all_results_Ac" --construction "Results/all_logs_Ac" --betas "Results/all_betas_Ac" --meta "Results/allMetaData/Ac.allMetaData.txt"
python scripts/make_sqlite_db.py --output "Results/sql/Il_output_db.db" --results "Results/all_results_Il" --construction "Results/all_logs_Il" --betas "Results/all_betas_Il" --meta "Results/allMetaData/Il.allMetaData.txt"
python scripts/make_sqlite_db.py --output "Results/sql/Lh_output_db.db" --results "Results/all_results_Lh" --construction "Results/all_logs_Lh" --betas "Results/all_betas_Lh" --meta "Results/allMetaData/Lh.allMetaData.txt"
python scripts/make_sqlite_db.py --output "Results/sql/Pl_output_db.db" --results "Results/all_results_Pl" --construction "Results/all_logs_Pl" --betas "Results/all_betas_Pl" --meta "Results/allMetaData/Pl.allMetaData.txt"
python scripts/make_sqlite_db.py --output "Results/sql/Vo_output_db.db" --results "Results/all_results_Vo" --construction "Results/all_logs_Vo" --betas "Results/all_betas_Vo" --meta "Results/allMetaData/Vo.allMetaData.txt"