<- read_tsv(data.dir %&% "Box_files/overlap_rat_genes_GTEx.txt", col_names = TRUE) # genes that are present in both rat expression data and in GTEx
overlap
<- readRDS(data.dir %&% "expression/Ac_expression_transformed.RDS")
expr_Ac <- expr_Ac[, intersect(colnames(expr_Ac), overlap$rat_gene)]
expr_Ac saveRDS(expr_Ac, data.dir %&% "expression/Ac_expression_intesrect_GTEx.RDS")
Generate Sparsity Figures
Subset genes used in prediction to those that have an analog in GTEx tissues
we will use this for comparison later
Generate R2 predictions for all elastic net parameters between 0 and 1
We run the same Prediction Model Pipeline only for Ac tissue. However this time, I didn’t break it down into chromosome. This takes longer but means you have less files, a file for each alpha parameter.
for i in $(seq 0 0.1 1.0)
do
qsub -v alpha=$i sparsity_rats_pipeline.pbs
done
Now we have predictability for all parameters of alpha. We can now iterate through all alphas and create the long data format. We also only select for genes that have an average cor > 0.3 and subsample 20 genes.
<- list() # creates a list
ldf <- list.files(path = dir, pattern = "working_TW_Ac_exp_10-foldCV_elasticNet_alpha", full.names = TRUE) # creates the list of all the tsv files in the directory
listtsv <- read_tsv(listtsv[1], col_names = TRUE)
tempo <- tempo %>% select(c(gene, cor))
tempo colnames(tempo)[2] = "0"
for (k in 2:length(listtsv)){
<- read_tsv(listtsv[k], col_names = TRUE)
ldf[[k]] <- substr(listtsv[k], 107, str_length(listtsv[k]) - 13)
alpha <- as.data.frame(ldf[[k]])
fila <- fila %>% select(c(gene, cor))
fila colnames(fila)[2] = alpha
<- inner_join(tempo, fila, by = "gene")
tempo }
Plot results of r for all parameers of alpha
<- read_tsv("/Users/natashasanthanam/Downloads/rat_elastic_net_all_parameters_GTEx_only_genes.txt", col_names = TRUE)
tempo
<- tempo %>% pivot_longer(!gene, names_to = "value", values_to = "count")
data_long
<- ggplot(data_long, aes(x = as.numeric(value), y = count)) + geom_smooth(show_guide = FALSE, se=T, size = .5, col = "dodgerblue2") + xlab(expression(paste("Elastic net mixing parameter (",alpha, ")"))) + ylab(expression(paste("10-fold cross-validated R")))
p1
= ggplot(tempo, aes(x = `0`, y = `0.5`)) + geom_hex(bins = 50) +
p2 geom_abline(slope = 1, intercept = 0, color = "darkgrey", size = 0.8) +
ylab("cor for mixing paramter = 0.5" ) +
xlab("cor for mixing paramter = 0") + theme_bw(base_size = 16)
p1
No matching items