library(tidyverse)
library(devtools)
library(broom)
library(data.table)
library(RSQLite)
"%&%" = function(a,b) paste(a,b,sep="")
<- "/gpfs/data/im-lab/nas40t2/Data/GTEx/V8/GTEx_Analysis_v8_eQTL_expression_matrices/"
dir <- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/" geno.dir
GTEx_comparison_rat_n_genes
Compare number of genes predicted between Rat and GTEx prediction models
Generate GTEx prediction models using the same pipeline as in Rats
Gather Genotype, Gene epression data, snp annotation file and gtf (made with ensembl notation)
First convert genotypes to single coded format to be read by pipeline
plink --bfile /gpfs/data/im-lab/nas40t2/Data/GTEx/V8/genotype/plink_files/GTEx_maf_0.01 --geno 0.02 --mind 0.02 --maf 0.05 --make-bed --out GTEx_comp
plink --bfile GTEx_comp --recode A-transpose --out GTEx_single_code
<- fread(geno.dir %&% "genos/GTEx_single_code.traw")
geno <- read_tsv(dir %&% TISSUE %&% ".v8.normalized_expression.bed.gz") %>% select(-c(`#chr`, start, end)) gex
Genotype has to have SNP in first column
<- geno %>% select(c(SNP, CHR, colnames(geno)[4:ncol(geno)])) geno
transposing gene expression files and inverse normalize
<- gex %>% pivot_longer(!gene_id, names_to = "IID", values_to = "count") %>% pivot_wider(names_from = gene_id, values_from = count)
gex
= function(x) {
invnorm if(is.null(dim(x))) res = invnorm.vector(x) else
=apply(x,2,invnorm.vector)
res
res
}= function(x) {yy = rank(x)/(length(x)+1); qnorm(yy)}
invnorm.vector <- gex %>% select(-c(IID))
gex_transpose
= invnorm(gex_transpose)
gex_transpose rownames(gex_transpose) = gex$IID
write.table(gex_transpose, file = '/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/expression/gexWB.csv', sep = ",", col.names = TRUE, row.names = FALSE)
Format gene annotation
<- fread("/gpfs/data/im-lab/nas40t2/natasha/GTEX_Analysis/annotations_gencode_v26.tsv")
gtf
<- gtf %>% select(c(chromosome, gene_id, gene_name, start, end))
gene_annotation rownames(gene_annotation) = gtf$gene_id
Format snp annotation
<- read_tsv(geno.dir %&% "genos/GTEx_comp.bim", col_names = FALSE) %>% mutate(varID = X2) %>% select(c(X2, X1, X4, X5, X6, varID))
phyMap
$rsid = phyMap$varID
phyMapcolnames(phyMap) = c("snp", "chr", "pos", "refAllele", "effectAllele", "varID", "rsid")
rownames(phyMap) = phyMap$varID
# Splitting the snp annotation file by chromosome
<- setNames(split(phyMap, phyMap$chr), paste0("snp_annot.chr", unique(phyMap$chr)))
s list2env(s, globalenv())
#Splitting the genotype file by chromosome - run this from the rat_genomic_alaysis directory
python /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/rat-genomic-analysis/scripts/split_genotype_by_chr.py GTEx_genotype.txt /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/genos/geno_by_chr/'genotype'
# Now we are performing PEER factor analysis on each tissue choosing 7 factors
peertool -f expression/"gexWB.csv" -n 7 -o peer_GTEx --has_header
Read in Peer Factors
= read.csv(file = geno.dir %&% "peer_GTEx/X.csv", header = FALSE) peer_factors
# Manipulating the PEER factor files so we can append to covariate file
rownames(peer_factors) = gex$IID
colnames(peer_factors) = c('PF1', 'PF2', 'PF3', 'PF4', 'PF5', 'PF6', 'PF7')
#initializing matrices to be filled with t-stats, p_vals, and residuals of the regression of each gene vs. the covariates for each tissue. t-stat and p-val matrices are just for diagnostics
# t_stats = matrix(nrow = 13, ncol = length(colnames(gex)))
# p_vals = matrix(nrow = 13, ncol = length(colnames(gex)))
= gex_transpose expression
# Regressing out the covariates and saving the residuals as the new expression for each tissue
for (i in 1:length(colnames(gex_transpose))) {
= lm(gex_transpose[,i] ~ peer_factors$PF1 + peer_factors$PF2 + peer_factors$PF3 + peer_factors$PF4 + peer_factors$PF5 + peer_factors$PF6 + peer_factors$PF7)
fit <- fit$residuals
expression[,i] }
# Saving the expression RDS objects to be used as arguments in the script
saveRDS(as.matrix(expression), geno.dir %&% TISSUE %&% "_expression_transformed.RDS")
Find intersection of genes across all GTEx tissues and Rats
= list.files(geno.dir %&% "expression" , pattern = "expression_transformed.RDS", full.names = TRUE)
filelist <- data.frame(gene = readRDS(filelist[1]) %>% colnames())
names
for(fila in filelist[2:length(filelist)]) {
<- data.frame(gene = readRDS(fila) %>% colnames())
tempo <- inner_join(names, tempo, by = "gene")
names
}$gene = sapply(strsplit(names$gene, "\\."), `[`, 1)
names
#do the same for all 5 rat tissues
= list.files("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/expression/", pattern = "expression_transformed.RDS", full.names = TRUE)
filelist <- data.frame(gene = readRDS(filelist[1]) %>% colnames())
rat_names
for(fila in filelist[2:length(filelist)]) {
<- data.frame(gene = readRDS(fila) %>% colnames())
tempo <- inner_join(rat_names, tempo, by = "gene")
rat_names
}
#change gene id in rats to human notation
<- read_tsv("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/expression/ortholog_genes_rats_humans.tsv")
orth.rats <- rat_names %>% mutate(human_gene = orth.rats[match(rat_names$gene, orth.rats$rnorvegicus_homolog_ensembl_gene), 1]$ensembl_gene_id)
rat_names
= data.frame(human_gene = intersect(rat_names$human_gene, names$gene)) #12,230 overlap
overlap <- overlap %>% mutate(rat_gene = orth.rats[match(overlap$human_gene, orth.rats$ensembl_gene_id), 3]$rnorvegicus_homolog_ensembl_gene) overlap
Filter for only overlapped genes in expression files
= list.files(geno.dir %&% "expression", pattern = "expression_transformed.RDS", full.names = TRUE)
filelist for(fila in filelist) {
<- readRDS(fila)
tempo colnames(tempo) = sapply(strsplit(colnames(tempo), "\\."), `[`, 1)
<- tempo[, overlap$human_gene]
tempo = substr(fila, 73, str_length(fila) - 27)
tis saveRDS(tempo, geno.dir %&% txis %&% "_expression_abrv.RDS")
}
#set snp annotation rownames
rownames(snp_annot.chr1) = snp_annot.chr1$varID
rownames(snp_annot.chr2) = snp_annot.chr2$varID
rownames(snp_annot.chr3) = snp_annot.chr3$varID
rownames(snp_annot.chr4) = snp_annot.chr4$varID
rownames(snp_annot.chr5) = snp_annot.chr5$varID
rownames(snp_annot.chr6) = snp_annot.chr6$varID
rownames(snp_annot.chr7) = snp_annot.chr7$varID
rownames(snp_annot.chr8) = snp_annot.chr8$varID
rownames(snp_annot.chr9) = snp_annot.chr9$varID
rownames(snp_annot.chr10) = snp_annot.chr10$varID
rownames(snp_annot.chr11) = snp_annot.chr11$varID
rownames(snp_annot.chr12) = snp_annot.chr12$varID
rownames(snp_annot.chr13) = snp_annot.chr13$varID
rownames(snp_annot.chr14) = snp_annot.chr14$varID
rownames(snp_annot.chr15) = snp_annot.chr15$varID
rownames(snp_annot.chr16) = snp_annot.chr16$varID
rownames(snp_annot.chr17) = snp_annot.chr17$varID
rownames(snp_annot.chr18) = snp_annot.chr18$varID
rownames(snp_annot.chr19) = snp_annot.chr19$varID
rownames(snp_annot.chr20) = snp_annot.chr20$varID
# Saving the SNP annotation RDS objects to be used as arguments in the script - too lazy to write a for loop
saveRDS(snp_annot.chr1, geno.dir %&% "snp_annot/snp_annot.chr1.RDS")
saveRDS(snp_annot.chr2, geno.dir %&% "snp_annot/snp_annot.chr2.RDS")
saveRDS(snp_annot.chr3, geno.dir %&% "snp_annot/snp_annot.chr3.RDS")
saveRDS(snp_annot.chr4, geno.dir %&% "snp_annot/snp_annot.chr4.RDS")
saveRDS(snp_annot.chr5, geno.dir %&% "snp_annot/snp_annot.chr5.RDS")
saveRDS(snp_annot.chr6, geno.dir %&% "snp_annot/snp_annot.chr6.RDS")
saveRDS(snp_annot.chr7, geno.dir %&% "snp_annot/snp_annot.chr7.RDS")
saveRDS(snp_annot.chr8, geno.dir %&% "snp_annot/snp_annot.chr8.RDS")
saveRDS(snp_annot.chr9, geno.dir %&% "snp_annot/snp_annot.chr9.RDS")
saveRDS(snp_annot.chr10, geno.dir %&% "snp_annot/snp_annot.chr10.RDS")
saveRDS(snp_annot.chr11, geno.dir %&% "snp_annot/snp_annot.chr11.RDS")
saveRDS(snp_annot.chr12, geno.dir %&% "snp_annot/snp_annot.chr12.RDS")
saveRDS(snp_annot.chr13, geno.dir %&% "snp_annot/snp_annot.chr13.RDS")
saveRDS(snp_annot.chr14, geno.dir %&% "snp_annot/snp_annot.chr14.RDS")
saveRDS(snp_annot.chr15, geno.dir %&% "snp_annot/snp_annot.chr15.RDS")
saveRDS(snp_annot.chr16, geno.dir %&% "snp_annot/snp_annot.chr16.RDS")
saveRDS(snp_annot.chr17, geno.dir %&% "snp_annot/snp_annot.chr17.RDS")
saveRDS(snp_annot.chr18, geno.dir %&% "snp_annot/snp_annot.chr18.RDS")
saveRDS(snp_annot.chr19, geno.dir %&% "snp_annot/snp_annot.chr19.RDS")
saveRDS(snp_annot.chr20, geno.dir %&% "snp_annot/snp_annot.chr20.RDS")
# Creating the meta data file for each tissue
python /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/rat-genomic-analysis/scripts/create_meta_data.py --geno "genos/GTEx_genotype.txt" --expr "Ovary_expression_transformed.RDS" --snpset "1KG" --rsid_label 1 --window 1000000 --out_prefix "Results/allMetaData/GTEx_ovary"
# Running the model training script for each tissue/chromosome pair
Rscript --vanilla /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/GTEx_TW_CV_elasticNet.R $chr $alpha
a=0.01
for i in {1..20}
do
qsub -v chr=$i,alpha=$a GTEx_single_nested_EN.pbs
done
# Concatenating all of the results files for each tissue
bash /gpfs/data/im-lab/nas40t2/natasha/rat_genomics/rat-genomic-analysis/scripts/make_all_results.sh 'Whole_Blood' '/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/Results/all_Results_Whole_Blood_0.01' 0.01 '1KG_snps'
Compare number of genes predicted in GTEx to those in Rats
Now we have predictability for all parameters of alpha. We can now iterate through all alphas and create the long data format. We also only select for genes that have an average R2 > 0.3 and subsample 20 genes.
<- list.files("/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/comp_to_GTEx/Results/chr_Results", pattern = "working_TW_", full.names = TRUE)
filelist
<- read_tsv(filelist[1]) %>% filter(R2 <= 1) %>% select(c(gene, cor, R2)) %>% distinct(gene, .keep_all = TRUE)
tempo $cor = as.numeric(tempo$cor)
tempo$R2 = tempo$R2*sign(tempo$cor)
tempocolnames(tempo)[2] = paste("cor", "0.01", sep="_")
colnames(tempo)[3] = paste("R2", "0.01", sep="_")
for (k in 2:(length(filelist))){
<- read_tsv(filelist[k], col_names = TRUE) %>% distinct(gene, .keep_all = TRUE)
df <- substr(filelist[k], 135, str_length(filelist[k]) - 8)
alpha <- as.data.frame(df) %>% select(c(gene, cor, R2)) %>% filter(R2 <= 1)
df $cor = as.numeric(df$cor)
df$R2 = as.numeric(df$R2)*sign(df$cor)
dfcolnames(df)[2] <- paste("cor", alpha, sep="_")
colnames(df)[3] <- paste("R2", alpha, sep="_")
<- inner_join(tempo, df, by = c("gene"))
tempo
}
#tempo <- read_tsv("/Users/natashasanthanam/Downloads/GTEx_Whole_Blood_sparsity_all_parameters.txt", col_names = TRUE)
<- tempo %>% select(c(gene, starts_with("cor"))) %>% select(-c(cor_0.01))
tempo_cor_df <- tempo %>% select(c(gene, starts_with("R"))) %>% select(-c(R_0.01))
tempo_R2_df
<- tempo_R2_df[sample(nrow(tempo_R2_df), 20), ]
tempo_R2_df <- tempo_cor_df[sample(nrow(tempo_cor_df), 20), ] tempo_cor_df
Plot Sparsity for GTEx
<- tempo_R2_df %>% pivot_longer(!gene, names_to = "value", values_to = "count")
data_long $value = as.numeric(substr(data_long$value, 4, str_length(data_long$value)))
data_long
<- ggplot(data_long, aes(x = as.numeric(value), y = count)) + geom_smooth(method = "loess", span = 2, show_guide = FALSE, se=T, size = .5, col = "maroon1") + xlab(expression(paste("Elastic net mixing parameter (",alpha, ")"))) + ylab(expression(paste("10-fold cross-validated R2")))
p1
p1
<- tempo_cor_df %>% pivot_longer(!gene, names_to = "value", values_to = "count")
cor_long $value = as.numeric(substr(cor_long$value, 5, str_length(cor_long$value)))
cor_long
<- ggplot(cor_long, aes(x = as.numeric(value), y = count)) + geom_smooth(show_guide = FALSE, se=T, size = .5, col = "maroon1") + xlab(expression(paste("Elastic net mixing parameter (",alpha, ")"))) + ylab(expression(paste("10-fold cross-validated cor"))) p2
Create Number of Genes for Rat Tissues
<- "/gpfs/data/im-lab/nas40t2/natasha/rat_genomics/Tyson_Results/"
rat.dir <- list.files(rat.dir, pattern = "all_results", full.names=TRUE)
filelist <- data.frame(tis= as.character(), n.genes = numeric(), species = as.character())
rat_n_genes
for(fila in filelist) {
= match(fila, filelist)
i <- read_tsv(fila, col_names = TRUE)
tempo <- tempo %>% filter(R2 >= 0)
tempo 2] <- n_distinct(tempo$gene)
rat_n_genes[i,1] <- substr(fila, 75, 76)
rat_n_genes[i,3] <- "Rat"
rat_n_genes[i,
}
<- rat_n_genes %>% mutate(n.samples = c(78, 83, 83, 81, 82))
rat_n_genes <- rbind(GWAS_n_genes, rat_n_genes) total_genes
Create Plot Comparing Tissues
<- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/n_genes_comparison_GTEx_subset_genes.txt", col_names = TRUE)
total_genes
ggplot(total_genes, aes(n.samples, n.genes)) + geom_point() +geom_smooth(data=subset(total_genes,species=="Human"),
aes(n.samples, n.genes), method=lm) + geom_label_repel( label = total_genes$tis, box.padding = 0.35, point.padding = 0.5) + xlab("Number of Individuals") + ylab("Number of Genes Predicted") + theme(legend.position = "None") + theme_bw()
ggsave("/Users/natashasanthanam/Downloads/GTEx_rats_n_genes.pdf")
No matching items