02. Prediction_Model_Pipeline

Author

Tyson Miller

Published

February 7, 2022

Definitions

library(tidyverse)
library(devtools)
library(broom)
library(data.table)

Data from here - genoGex.Rdata has everything we need in it There are 5 ‘gex’ RDS files which are the gene expressions for the 5 different tissues, the ‘gtf’ is the gene annotation, ‘phyMap’ is the snp annotation, and ‘geno’ is the genotype matrix

Our pipeline predicts expressions from the gene expression data and genotypes of the rats from the study.

load("~/Box/imlab-data/Projects/PTRS-PGRS-Rosetta/Data-From-Abe-Palmer-Lab/Rdata/genoGex.RData")

Gene Expression Files

First, we transpose each tissue’s gene expression file to fit the format expected by the PrediXcan pipeline scripts.

#transposing gene expression files for the 5 tissues 
n = gexAc$EnsemblGeneID
gexAc_transpose <- as.data.frame(t(gexAc[,-1]))
colnames(gexAc_transpose) <- n
n = gexIl$EnsemblGeneID
gexIl_transpose <- as.data.frame(t(gexIl[,-1]))
colnames(gexIl_transpose) <- n
n = gexLh$EnsemblGeneID
gexLh_transpose <- as.data.frame(t(gexLh[,-1]))
colnames(gexLh_transpose) <- n
n = gexPl$EnsemblGeneID
gexPl_transpose <- as.data.frame(t(gexPl[,-1]))
colnames(gexPl_transpose) <- n
n = gexVo$EnsemblGeneID
gexVo_transpose <- as.data.frame(t(gexVo[,-1]))
colnames(gexVo_transpose) <- n
# Running inverse normalization on each gene expression
invnorm = function(x) {
  if(is.null(dim(x))) res = invnorm.vector(x) else
  res=apply(x,2,invnorm.vector)
  res
}
invnorm.vector = function(x) {yy = rank(x)/(length(x)+1); qnorm(yy)}
gexAc_transpose = invnorm(gexAc_transpose)
gexIl_transpose = invnorm(gexIl_transpose)
gexLh_transpose = invnorm(gexLh_transpose)
gexPl_transpose = invnorm(gexPl_transpose)
gexVo_transpose = invnorm(gexVo_transpose)

Write to file.

# Writing the gene expression files to csv files to be used for PEER Factor analysis
write.table(gexAc_transpose, file = '/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gexAc.csv', sep = ",", col.names = TRUE, row.names = FALSE)
write.table(gexIl_transpose, file = '/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gexIl.csv', sep = ",", col.names = TRUE, row.names = FALSE)
write.table(gexLh_transpose, file = '/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gexLh.csv', sep = ",", col.names = TRUE, row.names = FALSE)
write.table(gexPl_transpose, file = '/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gexPl.csv', sep = ",", col.names = TRUE, row.names = FALSE)
write.table(gexVo_transpose, file = '/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gexVo.csv', sep = ",", col.names = TRUE, row.names = FALSE)

PEER analysis

Now we are performing PEER factor analysis on each tissue, choosing 7 factors.

peertool -f data/"gexAc.csv" -n 7 -o peer_Ac --has_header
peertool -f data/"gexIl.csv" -n 7 -o peer_Il --has_header
peertool -f data/"gexLh.csv" -n 7 -o peer_Lh --has_header
peertool -f data/"gexPl.csv" -n 7 -o peer_Pl --has_header
peertool -f data/"gexVo.csv" -n 7 -o peer_Vo --has_header

Later on, we examine these 7 factors, as well as other covariates, to interpret expression variability. # Gene, snp annotation files The prediction model pipeline also requires a gene annotation file as input. The code below generates it from the gene annotations provided by Palmer lab in ‘gtf’. We also collect snp info.

gtf$gene_type = sub(".*?gene_biotype(.*?);.*", "\\1", gtf$Attr)
gtf$gene_name = sub(".*?gene_name(.*?);.*", "\\1", gtf$Attr)
gene_annotation = subset(gtf, select = -c(Source, Feature, Score, Strand, Attr, Frame) )
gene_annotation = gene_annotation[, c("Chr","Gene", "gene_name", "Start", "End", "gene_type" )]
colnames(gene_annotation) = c("chr", "gene_id", "gene_name", "start", "end")
rownames(gene_annotation) = gene_annotation$gene_id

We have all the information needed to generate the predictions models. We are left to reorganize it to fit the pipeline. The specifics of each step is commented at the top of each block.

# Making the snp annotation in the correct format for the pipeline
phyMap <- within(phyMap,  varID <- paste(Chr, Pos, Ref, Alt, sep="_"))
rownames(phyMap) = phyMap$varID
phyMap$rsid = phyMap$varID
colnames(phyMap) = c("snp", "chr", "pos", "refAllele", "effectAllele", 'varID', "rsid")
# Splitting the snp annotation file by chromosome
s <- setNames(split(phyMap, phyMap$chr), paste0("snp_annot.chr", unique(phyMap$chr)))
list2env(s, globalenv())

The new genotype file combines the provided geno file and combines information from the provided snp annotation file, phyMap.

# writing the genotype file to a .txt file so that we can separate it by chromosome using our geneotype parse script.
rownames(geno) = rownames(phyMap)
write.table(geno, file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/genotype.txt", sep = "\t", col.names = TRUE, row.names = TRUE)

This python script separates the genotype file by chromosome.

#Splitting the genotype file by chromosome - run this from the rat_genomic_alaysis directory
python scripts/split_genotype_by_chr.py data/genotype.txt data/geno_by_chr/'genotype'

Covariate Files

We analyze sex, batch number, and batch center, as possible covariates, along with the 7 PEER factors.

# Loading the phenotype file in to create covariate files. For this we are selecting sex, batch number, and batch center as covariates as well as the 7 PEER factors we generate
load("~/Box/imlab-data/Projects/PTRS-PGRS-Rosetta/Data-From-Abe-Palmer-Lab/Final_P50_traits/P50_raw_trait_values.RData")
covariatesAc = raw_traits[match(rownames(gexAc_transpose), raw_traits$rfid), ]
covariatesAc = subset(covariatesAc, select = c(rfid, sex, batchnumber, center))
covariatesIl = raw_traits[match(rownames(gexIl_transpose), raw_traits$rfid), ]
covariatesIl = subset(covariatesIl, select = c(rfid, sex, batchnumber, center))
covariatesLh = raw_traits[match(rownames(gexLh_transpose), raw_traits$rfid), ]
covariatesLh = subset(covariatesLh, select = c(rfid, sex, batchnumber, center))
covariatesPl = raw_traits[match(rownames(gexPl_transpose), raw_traits$rfid), ]
covariatesPl = subset(covariatesPl, select = c(rfid, sex, batchnumber, center))
covariatesVo = raw_traits[match(rownames(gexVo_transpose), raw_traits$rfid), ]
covariatesVo = subset(covariatesVo, select = c(rfid, sex, batchnumber, center))
# Reading the PEER factor output files to be appended to the covariate file and eventually regressed out of the expression files
peer_factorsAc = read.csv(file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/peer_Ac/X.csv", header = FALSE)
peer_factorsIl = read.csv(file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/peer_Il/X.csv", header = FALSE)
peer_factorsLh = read.csv(file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/peer_Lh/X.csv", header = FALSE)
peer_factorsPl = read.csv(file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/peer_Pl/X.csv", header = FALSE)
peer_factorsVo = read.csv(file = "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/peer_Vo/X.csv", header = FALSE)

For each tissue’s PEER factor output, set individual IDs as rownames and enumerate the PEER factors in the columns.

# Manipulating the PEER factor files so we can append to covariate file

colnames(peer_factorsAc) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
rownames(peer_factorsAc) = rownames(gexAc_transpose)

colnames(peer_factorsIl) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
rownames(peer_factorsIl) = rownames(gexIl_transpose)

colnames(peer_factorsLh) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
rownames(peer_factorsLh) = rownames(gexLh_transpose)

colnames(peer_factorsPl) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
rownames(peer_factorsPl) = rownames(gexPl_transpose)

colnames(peer_factorsVo) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
rownames(peer_factorsVo) = rownames(gexVo_transpose)
#initializing matrices to be filled with t-stats, p_vals, and residuals of the regression of each gene vs. the covariates for each tissue. t-stat and p-val matrices are just for diagnostics

#t_statsAc = matrix(nrow = 13, ncol = length(colnames(gexAc_transpose)))
#p_valsAc = matrix(nrow = 13, ncol = length(colnames(gexAc_transpose)))
expressionAc = gexAc_transpose

#t_statsIl = matrix(nrow = 13, ncol = length(colnames(gexIl_transpose)))
#p_valsIl = matrix(nrow = 13, ncol = length(colnames(gexIl_transpose)))
expressionIl = gexIl_transpose

#t_statsLh = matrix(nrow = 13, ncol = length(colnames(gexLh_transpose)))
#p_valsLh = matrix(nrow = 13, ncol = length(colnames(gexLh_transpose)))
expressionLh = gexLh_transpose

#t_statsPl = matrix(nrow = 13, ncol = length(colnames(gexPl_transpose)))
#p_valsPl = matrix(nrow = 13, ncol = length(colnames(gexPl_transpose)))
expressionPl = gexPl_transpose

#t_statsVo = matrix(nrow = 13, ncol = length(colnames(gexVo_transpose)))
#p_valsVo = matrix(nrow = 13, ncol = length(colnames(gexVo_transpose)))
expressionVo = gexVo_transpose

In the following code, we regress out the covariates and save the residuals as the new expression for each tissue.

for (i in 1:length(colnames(gexAc_transpose))) {
    fit = lm(gexAc_transpose[,i] ~ covariatesAc$sex + covariatesAc$batchnumber + peer_factorsAc$PF1 + peer_factorsAc$PF2 + peer_factorsAc$PF3 + peer_factorsAc$PF4 + peer_factorsAc$PF5 + peer_factorsAc$PF6 + peer_factorsAc$PF7)
    expressionAc[,i] <- fit$residuals
    #t_statsAc[,i] <- tidy(fit)$statistic
    #p_valsAc[,i] <- tidy(fit)$p.value
}

for (i in 1:length(colnames(gexIl_transpose))) {
    fit = lm(gexIl_transpose[,i] ~ covariatesIl$sex + covariatesIl$batchnumber + peer_factorsIl$PF1 + peer_factorsIl$PF2 + peer_factorsIl$PF3 + peer_factorsIl$PF4 + peer_factorsIl$PF5 + peer_factorsIl$PF6 + peer_factorsIl$PF7)
    expressionIl[,i] <- fit$residuals
    #t_statsAc[,i] <- tidy(fit)$statistic
    #p_valsAc[,i] <- tidy(fit)$p.value
}

for (i in 1:length(colnames(gexLh_transpose))) {
    fit = lm(gexLh_transpose[,i] ~ covariatesLh$sex + covariatesLh$batchnumber + peer_factorsLh$PF1 + peer_factorsLh$PF2 + peer_factorsLh$PF3 + peer_factorsLh$PF4 + peer_factorsLh$PF5 + peer_factorsLh$PF6 + peer_factorsLh$PF7)
    expressionLh[,i] <- fit$residuals
    #t_statsAc[,i] <- tidy(fit)$statistic
    #p_valsAc[,i] <- tidy(fit)$p.value
}

for (i in 1:length(colnames(gexPl_transpose))) {
    fit = lm(gexPl_transpose[,i] ~ covariatesPl$sex + covariatesPl$batchnumber + peer_factorsPl$PF1 + peer_factorsPl$PF2 + peer_factorsPl$PF3 + peer_factorsPl$PF4 + peer_factorsPl$PF5 + peer_factorsPl$PF6 + peer_factorsPl$PF7)
    expressionPl[,i] <- fit$residuals
    #t_statsAc[,i] <- tidy(fit)$statistic
    #p_valsAc[,i] <- tidy(fit)$p.value
}

for (i in 1:length(colnames(gexVo_transpose))) {
    fit = lm(gexVo_transpose[,i] ~ covariatesVo$sex + covariatesVo$batchnumber + peer_factorsVo$PF1 + peer_factorsVo$PF2 + peer_factorsVo$PF3 + peer_factorsVo$PF4 + peer_factorsVo$PF5 + peer_factorsVo$PF6 + peer_factorsVo$PF7)
    expressionVo[,i] <- fit$residuals
    #t_statsAc[,i] <- tidy(fit)$statistic
    #p_valsAc[,i] <- tidy(fit)$p.value
  }

Write the processed expression data to file.

# Save expression as tsv
Ac_expr <- as.data.frame(expressionAc) %>% mutate(ID = rownames(expressionAc), .before = colnames(expressionAc))

Il_expr <- as.data.frame(expressionIl) %>% mutate(ID = rownames(expressionIl), .before = colnames(expressionIl))

Lh_expr <- as.data.frame(expressionLh) %>% mutate(ID = rownames(expressionLh), .before = colnames(expressionLh))

Pl_expr <- as.data.frame(expressionPl) %>% mutate(ID = rownames(expressionPl), .before = colnames(expressionPl))

Vo_expr <- as.data.frame(expressionVo) %>% mutate(ID = rownames(expressionVo), .before = colnames(expressionVo))

"%&%" = function(a,b) paste(a,b,sep="")
exprlist <- list(Ac_expr, Il_expr, Lh_expr, Pl_expr, Vo_expr)
tis <- c("Ac", "Il", "Lh", "Pl", "Vo")
i = 1
for(l in exprlist) {
write_tsv(l, "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/"
%&% tis[i] %&% "_expression_transformed.tsv", col_names = TRUE)
  i <- i+1
}

Save the expression RDS objects to be used as arguments in the script.

saveRDS(as.matrix(expressionAc), "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Ac_expression_transformed.RDS")

saveRDS(as.matrix(expressionIl), "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Il_expression_transformed.RDS")

saveRDS(as.matrix(expressionLh), "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Lh_expression_transformed.RDS")

saveRDS(as.matrix(expressionPl), "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Pl_expression_transformed.RDS")

saveRDS(as.matrix(expressionVo), "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/Vo_expression_transformed.RDS")

Save the gene and SNP annotation as RDS objects to be used as arguments in the script.

# 
snp_files <- list(snp_annot.chr1, snp_annot.chr2, snp_annot.chr3, snp_annot.chr4, snp_annot.chr5, snp_annot.chr6, snp_annot.chr7, snp_annot.chr8, snp_annot.chr9, snp_annot.chr10, snp_annot.chr11, snp_annot.chr12, snp_annot.chr13, snp_annot.chr14, snp_annot.chr15, snp_annot.chr16, snp_annot.chr17, snp_annot.chr18, snp_annot.chr19, snp_annot.chr20)
i = 1
for(l in snp_files) {
  saveRDS(l, "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/snp_annot/" %&% "snp_annot.chr" %&% i %&% ".RDS")
  i <- i+1
}
# Saving the gene annotation RDS object to be used as an argument in the script
saveRDS(gene_annotation, "/Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline/data/gene_annotation.RDS")

Metadata Files

# Creating the meta data file for each tissue 
python scripts/create_meta_data.py --geno "data/genotype.txt" --expr "Ac_expression_transformed.tsv" --alpha 1 --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/Ac"

python scripts/create_meta_data.py --geno "data/genotype.txt" --expr "Il_expression_transformed.tsv" --alpha 1 --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/Il"

python scripts/create_meta_data.py --geno "data/genotype.txt" --expr "Lh_expression_transformed.tsv" --alpha 1 --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/Lh"

python scripts/create_meta_data.py --geno "data/genotype.txt" --expr "Pl_expression_transformed.tsv" --alpha 1 --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/Pl"

python scripts/create_meta_data.py --geno "data/genotype.txt" --expr "Vo_expression_transformed.tsv" --alpha 1 --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/Vo"
# Running the model training script for each tissue/chromosome pair
cd /Users/sabrinami/Github/Rat_Genomics_Paper_Pipeline
for i in {1..20}
do
  Rscript scripts/create_model.R 'Ac' $i 0.5 1000000
  Rscript scripts/create_model.R 'Il' $i 0.5 1000000
  Rscript scripts/create_model.R 'Lh' $i 0.5 1000000
  Rscript scripts/create_model.R 'Pl' $i 0.5 1000000
  Rscript scripts/create_model.R 'Vo' $i 0.5 1000000
done
  
# Concatenating all of the results files for each tissue
bash scripts/make_all_results.sh 'Ac' 'Results/all_results_Ac' 0.5 '1KG_snps'
bash scripts/make_all_betas.sh 'Ac' 'Results/all_betas_Ac' 0.5 '1KG_snps'
bash scripts/make_all_logs.sh 'Ac' 'Results/all_logs_Ac'
bash scripts/make_all_covariances.sh 'Ac' 'Results/all_covariances_Ac' 0.5 '1KG_snps'

bash scripts/make_all_results.sh 'Il' 'Results/all_results_Il' 0.5 '1KG_snps'
bash scripts/make_all_betas.sh 'Il' 'Results/all_betas_Il' 0.5 '1KG_snps'
bash scripts/make_all_logs.sh 'Il' 'Results/all_logs_Il'
bash scripts/make_all_covariances.sh 'Il' 'Results/all_covariances_Il' 0.5 '1KG_snps' 

bash scripts/make_all_results.sh 'Lh' 'Results/all_results_Lh' 0.5 '1KG_snps'
bash scripts/make_all_betas.sh 'Lh' 'Results/all_betas_Lh' 0.5 '1KG_snps'
bash scripts/make_all_logs.sh 'Lh' 'Results/all_logs_Lh'
bash scripts/make_all_covariances.sh 'Lh' 'Results/all_covariances_Lh' 0.5 '1KG_snps'

bash scripts/make_all_results.sh 'Pl' 'Results/all_results_Pl' 0.5 '1KG_snps'
bash scripts/make_all_betas.sh 'Pl' 'Results/all_betas_Pl' 0.5 '1KG_snps'
bash scripts/make_all_logs.sh 'Pl' 'Results/all_logs_Pl'
bash scripts/make_all_covariances.sh 'Pl' 'Results/all_covariances_Pl' 0.5 '1KG_snps'

bash scripts/make_all_results.sh 'Vo' 'Results/all_results_Vo' 0.5 '1KG_snps'
bash scripts/make_all_betas.sh 'Vo' 'Results/all_betas_Vo' 0.5 '1KG_snps'
bash scripts/make_all_logs.sh 'Vo' 'Results/all_logs_Vo'
bash scripts/make_all_covariances.sh 'Vo' 'Results/all_covariances_Vo' 0.5 '1KG_snps'
# Putting these into sql lite databases
python scripts/make_sqlite_db.py --output "Results/sql/Ac_output_db.db" --results "Results/all_results_Ac" --construction "Results/all_logs_Ac" --betas "Results/all_betas_Ac" --meta "Results/allMetaData/Ac.allMetaData.txt"

python scripts/make_sqlite_db.py --output "Results/sql/Il_output_db.db" --results "Results/all_results_Il" --construction "Results/all_logs_Il" --betas "Results/all_betas_Il" --meta "Results/allMetaData/Il.allMetaData.txt"

python scripts/make_sqlite_db.py --output "Results/sql/Lh_output_db.db" --results "Results/all_results_Lh" --construction "Results/all_logs_Lh" --betas "Results/all_betas_Lh" --meta "Results/allMetaData/Lh.allMetaData.txt"

python scripts/make_sqlite_db.py --output "Results/sql/Pl_output_db.db" --results "Results/all_results_Pl" --construction "Results/all_logs_Pl" --betas "Results/all_betas_Pl" --meta "Results/allMetaData/Pl.allMetaData.txt"

python scripts/make_sqlite_db.py --output "Results/sql/Vo_output_db.db" --results "Results/all_results_Vo" --construction "Results/all_logs_Vo" --betas "Results/all_betas_Vo" --meta "Results/allMetaData/Vo.allMetaData.txt"
No matching items