library(tidyverse)
library(devtools)
library(broom)
library(data.table)
library(RSQLite)
"%&%" = function(a,b) paste(a,b,sep="")
dir <- "/gpfs/data/im-lab/nas40t2/Data/GTEx/V8/GTEx_Analysis_v8_eQTL_expression_matrices/"
geno.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/"GTEx_comparison_rat_n_genes
Compare number of genes predicted between Rat and GTEx prediction models
Generate GTEx prediction models using the same pipeline as in Rats
Gather Genotype, Gene epression data, snp annotation file and gtf (made with ensembl notation)
First convert genotypes to single coded format to be read by pipeline
plink --bfile /gpfs/data/im-lab/nas40t2/Data/GTEx/V8/genotype/plink_files/GTEx_maf_0.01 --geno 0.02 --mind 0.02 --maf 0.05 --make-bed --out GTEx_compplink --bfile GTEx_comp --recode A-transpose --out GTEx_single_codegeno <- fread(geno.dir %&% "genos/GTEx_single_code.traw")
gex <- read_tsv(dir %&% TISSUE %&% ".v8.normalized_expression.bed.gz") %>% select(-c(`#chr`, start, end))Genotype has to have SNP in first column
geno <- geno %>% select(c(SNP, CHR, colnames(geno)[4:ncol(geno)]))transposing gene expression files and inverse normalize
gex <- gex %>% pivot_longer(!gene_id, names_to = "IID", values_to = "count") %>% pivot_wider(names_from = gene_id, values_from = count)
invnorm = function(x) {
if(is.null(dim(x))) res = invnorm.vector(x) else
res=apply(x,2,invnorm.vector)
res
}
invnorm.vector = function(x) {yy = rank(x)/(length(x)+1); qnorm(yy)}
gex_transpose <- gex %>% select(-c(IID))
gex_transpose = invnorm(gex_transpose)
rownames(gex_transpose) = gex$IID
write.table(gex_transpose, file = '/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/expression/gexWB.csv', sep = ",", col.names = TRUE, row.names = FALSE)Format gene annotation
gtf <- fread("/gpfs/data/im-lab/nas40t2/natasha/GTEX_Analysis/annotations_gencode_v26.tsv")
gene_annotation <- gtf %>% select(c(chromosome, gene_id, gene_name, start, end))
rownames(gene_annotation) = gtf$gene_idFormat snp annotation
phyMap <- read_tsv(geno.dir %&% "genos/GTEx_comp.bim", col_names = FALSE) %>% mutate(varID = X2) %>% select(c(X2, X1, X4, X5, X6, varID))
phyMap$rsid = phyMap$varID
colnames(phyMap) = c("snp", "chr", "pos", "refAllele", "effectAllele", "varID", "rsid")
rownames(phyMap) = phyMap$varID# Splitting the snp annotation file by chromosome
s <- setNames(split(phyMap, phyMap$chr), paste0("snp_annot.chr", unique(phyMap$chr)))
list2env(s, globalenv())#Splitting the genotype file by chromosome - run this from the rat_genomic_alaysis directory
python /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/rat-genomic-analysis/scripts/split_genotype_by_chr.py GTEx_genotype.txt /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/genos/geno_by_chr/'genotype'# Now we are performing PEER factor analysis on each tissue choosing 7 factors
peertool -f expression/"gexWB.csv" -n 7 -o peer_GTEx --has_headerRead in Peer Factors
peer_factors= read.csv(file = geno.dir %&% "peer_GTEx/X.csv", header = FALSE)# Manipulating the PEER factor files so we can append to covariate file
rownames(peer_factors) = gex$IID
colnames(peer_factors) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')#initializing matrices to be filled with t-stats, p_vals, and residuals of the regression of each gene vs. the covariates for each tissue. t-stat and p-val matrices are just for diagnostics
# t_stats = matrix(nrow = 13, ncol = length(colnames(gex)))
# p_vals = matrix(nrow = 13, ncol = length(colnames(gex)))
expression = gex_transpose# Regressing out the covariates and saving the residuals as the new expression for each tissue
for (i in 1:length(colnames(gex_transpose))) {
fit = lm(gex_transpose[,i] ~ peer_factors$PF1 + peer_factors$PF2 + peer_factors$PF3 + peer_factors$PF4 + peer_factors$PF5 + peer_factors$PF6 + peer_factors$PF7)
expression[,i] <- fit$residuals
}# Saving the expression RDS objects to be used as arguments in the script
saveRDS(as.matrix(expression), geno.dir %&% TISSUE %&% "_expression_transformed.RDS")Find intersection of genes across all GTEx tissues and Rats
filelist = list.files(geno.dir %&% "expression" , pattern = "expression_transformed.RDS", full.names = TRUE)
names <- data.frame(gene = readRDS(filelist[1]) %>% colnames())
for(fila in filelist[2:length(filelist)]) {
tempo <- data.frame(gene = readRDS(fila) %>% colnames())
names <- inner_join(names, tempo, by = "gene")
}
names$gene = sapply(strsplit(names$gene, "\\."), `[`, 1)
#do the same for all 5 rat tissues
filelist = list.files("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/expression/", pattern = "expression_transformed.RDS", full.names = TRUE)
rat_names <- data.frame(gene = readRDS(filelist[1]) %>% colnames())
for(fila in filelist[2:length(filelist)]) {
tempo <- data.frame(gene = readRDS(fila) %>% colnames())
rat_names <- inner_join(rat_names, tempo, by = "gene")
}
#change gene id in rats to human notation
orth.rats <- read_tsv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/expression/ortholog_genes_rats_humans.tsv")
rat_names <- rat_names %>% mutate(human_gene = orth.rats[match(rat_names$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 1]$ensembl_gene_id)
overlap = data.frame(human_gene = intersect(rat_names$human_gene, names$gene)) #12,230 overlap
overlap <- overlap %>% mutate(rat_gene = orth.rats[match(overlap$human_gene, orth.rats$ensembl_gene_id), 3]$rnorvegicus_homolog_ensembl_gene)Filter for only overlapped genes in expression files
filelist = list.files(geno.dir %&% "expression", pattern = "expression_transformed.RDS", full.names = TRUE)
for(fila in filelist) {
tempo <- readRDS(fila)
colnames(tempo) = sapply(strsplit(colnames(tempo), "\\."), `[`, 1)
tempo <- tempo[, overlap$human_gene]
tis = substr(fila, 73, str_length(fila) - 27)
saveRDS(tempo, geno.dir %&% txis %&% "_expression_abrv.RDS")
}#set snp annotation rownames
rownames(snp_annot.chr1) = snp_annot.chr1$varID
rownames(snp_annot.chr2) = snp_annot.chr2$varID
rownames(snp_annot.chr3) = snp_annot.chr3$varID
rownames(snp_annot.chr4) = snp_annot.chr4$varID
rownames(snp_annot.chr5) = snp_annot.chr5$varID
rownames(snp_annot.chr6) = snp_annot.chr6$varID
rownames(snp_annot.chr7) = snp_annot.chr7$varID
rownames(snp_annot.chr8) = snp_annot.chr8$varID
rownames(snp_annot.chr9) = snp_annot.chr9$varID
rownames(snp_annot.chr10) = snp_annot.chr10$varID
rownames(snp_annot.chr11) = snp_annot.chr11$varID
rownames(snp_annot.chr12) = snp_annot.chr12$varID
rownames(snp_annot.chr13) = snp_annot.chr13$varID
rownames(snp_annot.chr14) = snp_annot.chr14$varID
rownames(snp_annot.chr15) = snp_annot.chr15$varID
rownames(snp_annot.chr16) = snp_annot.chr16$varID
rownames(snp_annot.chr17) = snp_annot.chr17$varID
rownames(snp_annot.chr18) = snp_annot.chr18$varID
rownames(snp_annot.chr19) = snp_annot.chr19$varID
rownames(snp_annot.chr20) = snp_annot.chr20$varID# Saving the SNP annotation RDS objects to be used as arguments in the script - too lazy to write a for loop
saveRDS(snp_annot.chr1, geno.dir %&% "snp_annot/snp_annot.chr1.RDS")
saveRDS(snp_annot.chr2, geno.dir %&% "snp_annot/snp_annot.chr2.RDS")
saveRDS(snp_annot.chr3, geno.dir %&% "snp_annot/snp_annot.chr3.RDS")
saveRDS(snp_annot.chr4, geno.dir %&% "snp_annot/snp_annot.chr4.RDS")
saveRDS(snp_annot.chr5, geno.dir %&% "snp_annot/snp_annot.chr5.RDS")
saveRDS(snp_annot.chr6, geno.dir %&% "snp_annot/snp_annot.chr6.RDS")
saveRDS(snp_annot.chr7, geno.dir %&% "snp_annot/snp_annot.chr7.RDS")
saveRDS(snp_annot.chr8, geno.dir %&% "snp_annot/snp_annot.chr8.RDS")
saveRDS(snp_annot.chr9, geno.dir %&% "snp_annot/snp_annot.chr9.RDS")
saveRDS(snp_annot.chr10, geno.dir %&% "snp_annot/snp_annot.chr10.RDS")
saveRDS(snp_annot.chr11, geno.dir %&% "snp_annot/snp_annot.chr11.RDS")
saveRDS(snp_annot.chr12, geno.dir %&% "snp_annot/snp_annot.chr12.RDS")
saveRDS(snp_annot.chr13, geno.dir %&% "snp_annot/snp_annot.chr13.RDS")
saveRDS(snp_annot.chr14, geno.dir %&% "snp_annot/snp_annot.chr14.RDS")
saveRDS(snp_annot.chr15, geno.dir %&% "snp_annot/snp_annot.chr15.RDS")
saveRDS(snp_annot.chr16, geno.dir %&% "snp_annot/snp_annot.chr16.RDS")
saveRDS(snp_annot.chr17, geno.dir %&% "snp_annot/snp_annot.chr17.RDS")
saveRDS(snp_annot.chr18, geno.dir %&% "snp_annot/snp_annot.chr18.RDS")
saveRDS(snp_annot.chr19, geno.dir %&% "snp_annot/snp_annot.chr19.RDS")
saveRDS(snp_annot.chr20, geno.dir %&% "snp_annot/snp_annot.chr20.RDS")# Creating the meta data file for each tissue
python /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/rat-genomic-analysis/scripts/create_meta_data.py --geno "genos/GTEx_genotype.txt" --expr "Ovary_expression_transformed.RDS" --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/GTEx_ovary" # Running the model training script for each tissue/chromosome pair
Rscript --vanilla /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/GTEx_TW_CV_elasticNet.R $chr $alpha
a=0.01
for i in {1..20}
do
qsub -v chr=$i,alpha=$a GTEx_single_nested_EN.pbs
done# Concatenating all of the results files for each tissue
bash /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/rat-genomic-analysis/scripts/make_all_results.sh 'Whole_Blood' '/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/Results/all_Results_Whole_Blood_0.01' 0.01 '1KG_snps'Compare number of genes predicted in GTEx to those in Rats
Now we have predictability for all parameters of alpha. We can now iterate through all alphas and create the long data format. We also only select for genes that have an average R2 > 0.3 and subsample 20 genes.
filelist <- list.files("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/Results/chr_Results", pattern = "working_TW_", full.names = TRUE)
tempo <- read_tsv(filelist[1]) %>% filter(R2 <= 1) %>% select(c(gene, cor, R2)) %>% distinct(gene, .keep_all = TRUE)
tempo$cor = as.numeric(tempo$cor)
tempo$R2 = tempo$R2*sign(tempo$cor)
colnames(tempo)[2] = paste("cor", "0.01", sep="_")
colnames(tempo)[3] = paste("R2", "0.01", sep="_")
for (k in 2:(length(filelist))){
df <- read_tsv(filelist[k], col_names = TRUE) %>% distinct(gene, .keep_all = TRUE)
alpha <- substr(filelist[k], 135, str_length(filelist[k]) - 8)
df <- as.data.frame(df) %>% select(c(gene, cor, R2)) %>% filter(R2 <= 1)
df$cor = as.numeric(df$cor)
df$R2 = as.numeric(df$R2)*sign(df$cor)
colnames(df)[2] <- paste("cor", alpha, sep="_")
colnames(df)[3] <- paste("R2", alpha, sep="_")
tempo <- inner_join(tempo, df, by = c("gene"))
}
#tempo <- read_tsv("/Users/natashasanthanam/Downloads/GTEx_Whole_Blood_sparsity_all_parameters.txt", col_names = TRUE)
tempo_cor_df <- tempo %>% select(c(gene, starts_with("cor"))) %>% select(-c(cor_0.01))
tempo_R2_df <- tempo %>% select(c(gene, starts_with("R"))) %>% select(-c(R_0.01))
tempo_R2_df <- tempo_R2_df[sample(nrow(tempo_R2_df), 20), ]
tempo_cor_df <- tempo_cor_df[sample(nrow(tempo_cor_df), 20), ]Plot Sparsity for GTEx
data_long <- tempo_R2_df %>% pivot_longer(!gene, names_to = "value", values_to = "count")
data_long$value = as.numeric(substr(data_long$value, 4, str_length(data_long$value)))
p1 <- ggplot(data_long, aes(x = as.numeric(value), y = count)) + geom_smooth(method = "loess", span = 2, show_guide = FALSE, se=T, size = .5, col = "maroon1") + xlab(expression(paste("Elastic net mixing parameter (",alpha, ")"))) + ylab(expression(paste("10-fold cross-validated R2")))
p1
cor_long <- tempo_cor_df %>% pivot_longer(!gene, names_to = "value", values_to = "count")
cor_long$value = as.numeric(substr(cor_long$value, 5, str_length(cor_long$value)))
p2 <- ggplot(cor_long, aes(x = as.numeric(value), y = count)) + geom_smooth(show_guide = FALSE, se=T, size = .5, col = "maroon1") + xlab(expression(paste("Elastic net mixing parameter (",alpha, ")"))) + ylab(expression(paste("10-fold cross-validated cor")))Create Number of Genes for Rat Tissues
rat.dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Tyson_Results/"
filelist <- list.files(rat.dir, pattern = "all_results", full.names=TRUE)
rat_n_genes <- data.frame(tis= as.character(), n.genes = numeric(), species = as.character())
for(fila in filelist) {
i = match(fila, filelist)
tempo <- read_tsv(fila, col_names = TRUE)
tempo <- tempo %>% filter(R2 >= 0)
rat_n_genes[i,2] <- n_distinct(tempo$gene)
rat_n_genes[i,1] <- substr(fila, 75, 76)
rat_n_genes[i,3] <- "Rat"
}
rat_n_genes <- rat_n_genes %>% mutate(n.samples = c(78, 83, 83, 81, 82))
total_genes <- rbind(GWAS_n_genes, rat_n_genes)Create Plot Comparing Tissues
total_genes <- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/n_genes_comparison_GTEx_subset_genes.txt", col_names = TRUE)
ggplot(total_genes, aes(n.samples, n.genes)) + geom_point() +geom_smooth(data=subset(total_genes,species=="Human"),
aes(n.samples, n.genes), method=lm) + geom_label_repel( label = total_genes$tis, box.padding = 0.35, point.padding = 0.5) + xlab("Number of Individuals") + ylab("Number of Genes Predicted") + theme(legend.position = "None") + theme_bw()
ggsave("/Users/natashasanthanam/Downloads/GTEx_rats_n_genes.pdf")
No matching items