overlap <- read_tsv(data.dir %&% "Box_files/overlap_rat_genes_GTEx.txt", col_names = TRUE) # genes that are present in both rat expression data and in GTEx
expr_Ac <- readRDS(data.dir %&% "expression/Ac_expression_transformed.RDS")
expr_Ac <- expr_Ac[, intersect(colnames(expr_Ac), overlap$rat_gene)]
saveRDS(expr_Ac, data.dir %&% "expression/Ac_expression_intesrect_GTEx.RDS")Generate Sparsity Figures
Subset genes used in prediction to those that have an analog in GTEx tissues
we will use this for comparison later
Generate R2 predictions for all elastic net parameters between 0 and 1
We run the same Prediction Model Pipeline only for Ac tissue. However this time, I didn’t break it down into chromosome. This takes longer but means you have less files, a file for each alpha parameter.
for i in $(seq 0 0.1 1.0)
do
qsub -v alpha=$i sparsity_rats_pipeline.pbs
doneNow we have predictability for all parameters of alpha. We can now iterate through all alphas and create the long data format. We also only select for genes that have an average cor > 0.3 and subsample 20 genes.
ldf <- list() # creates a list
listtsv <- list.files(path = dir, pattern = "working_TW_Ac_exp_10-foldCV_elasticNet_alpha", full.names = TRUE) # creates the list of all the tsv files in the directory
tempo <- read_tsv(listtsv[1], col_names = TRUE)
tempo <- tempo %>% select(c(gene, cor))
colnames(tempo)[2] = "0"
for (k in 2:length(listtsv)){
ldf[[k]] <- read_tsv(listtsv[k], col_names = TRUE)
alpha <- substr(listtsv[k], 107, str_length(listtsv[k]) - 13)
fila <- as.data.frame(ldf[[k]])
fila <- fila %>% select(c(gene, cor))
colnames(fila)[2] = alpha
tempo <- inner_join(tempo, fila, by = "gene")
}Plot results of r for all parameers of alpha
tempo <- read_tsv("/Users/natashasanthanam/Downloads/rat_elastic_net_all_parameters_GTEx_only_genes.txt", col_names = TRUE)
data_long <- tempo %>% pivot_longer(!gene, names_to = "value", values_to = "count")
p1 <- ggplot(data_long, aes(x = as.numeric(value), y = count)) + geom_smooth(show_guide = FALSE, se=T, size = .5, col = "dodgerblue2") + xlab(expression(paste("Elastic net mixing parameter (",alpha, ")"))) + ylab(expression(paste("10-fold cross-validated R")))
p2 = ggplot(tempo, aes(x = `0`, y = `0.5`)) + geom_hex(bins = 50) +
geom_abline(slope = 1, intercept = 0, color = "darkgrey", size = 0.8) +
ylab("cor for mixing paramter = 0.5" ) +
xlab("cor for mixing paramter = 0") + theme_bw(base_size = 16)
p1
No matching items