#options(error=recover)
#options(error=browser)
options(error=NULL)
## compare observed correlation with null correlation
suppressMessages(devtools::source_gist("a925fea01b365a8c605e")) ## load qqR fn https://gist.github.com/hakyim/a925fea01b365a8c605e
suppressMessages(devtools::source_gist("38431b74c6c0bf90c12f")) ## qqunif https://gist.github.com/hakyim/38431b74c6c0bf90c12f
suppressMessages(devtools::source_gist("115403f16bec0a0e871f3616d552ce9b")) ## source ratxcan functions https://gist.github.com/hakyim/115403f16bec0a0e871f3616d552ce9b
suppressMessages(library(tidyverse))
suppressMessages(library(glue))
suppressMessages(library(RSQLite))
#suppressMessages(library(expm))
#suppressMessages(library(readxl))
# install.packages("devtools")
# library("devtools")
# install_github("jdstorey/qvalue")
suppressMessages(library(qvalue))
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("biomaRt")
##suppressMessages(library(biomaRt))
##install.packages("ggrepel")
suppressMessages(library(ggrepel))
= c("bodylen","bmi")
traitlist = c("AC", "IL", "LH", "PL", "VO")
tissuelist = c("Standing height","Body mass index (BMI) (21001_raw)")
phenolist names(phenolist) = c("bodylen","bmi")
RatXcan analysis Figure 3 to 4
tutorial for running ratxcan given genotype, phenotype, and prediction weights as input
top
input data
- genotype
- phenotype
- prediction weights
load libraries and functions
download data for the tutorial
- download files from here The folder structure should look like the list below
(base) MBP-HKI-22 ratxcan-tutorial $ tree -L 3
.
├── data
│ ├── expression
│ │ └── gene_annotation.RDS
│ ├── genotype
│ │ ├── DELETE
│ │ ├── rat6k_autosome.bed
│ │ ├── rat6k_autosome.bim
│ │ └── rat6k_autosome.fam
│ ├── phenomexcan
│ │ └── phenomexcan_results.RDS
│ └── phenotype
│ └── pheno.fam
├── models
│ ├── Ac_best_prediXcan_db.db
│ ├── Br-hki.db
│ ├── Il_best_prediXcan_db.db
│ ├── Lh_best_prediXcan_db.db
│ ├── Pl_best_prediXcan_db.db
│ └── Vo_best_prediXcan_db.db
└── software
└── MetaXcan
├── CODE_OF_CONDUCT.md
├── DevNotes.Rmd
├── LICENSE
├── README.md
├── codemap
├── papers
└── software
define data and software paths for R
="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
WEBDATA
="/Users/haekyungim/bin/plink_mac_20231211/plink"
PLINK="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"
GCTA
<- glue("{WEBDATA}/ratxcan-tutorial") ## this has the input data
INPUT <- glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/scratch") ## this has the output data, intermediate results OUTPUT
define data and software for the terminal
WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"
GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"
INPUT=$WEBDATA/ratxcan-tutorial
OUTPUT=$WEBDATA/2023-11-27-ratxcan-tutorial/scratch
set up conda environment to run predict expression
conda create -n rat311 python=3.11
conda activate rat311
conda install pandas scipy numpy statsmodels
conda install h5py
## missing modules probably due to M2 processor
## NOT NEEDED conda install bgen_reader
pip install pyliftover
pip install cyvcf2 ## pip intall worked on my macbook M2
predict expression
#WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
#PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"
#GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"
#INPUT=$WEBDATA/ratxcan-tutorial
#OUTPUT=$WEBDATA/2023-11-27-ratxcan-tutorial/scratch
## convert plink file to vcf
#$PLINK --bfile $GENO_HEADER --recode vcf --out $GENO_HEADER
#gzip $GENO_HEADER.vcf ## to keep file small
conda activate rat311
METAXCAN=$INPUT/software/MetaXcan
GENO=$INPUT/data/genotype
GENO_HEADER=$GENO/rat6k_autosome
MODEL=$INPUT/models
## predict Ac
MODLEFT="VO-filtered"
MODEL_DB="$MODLEFT.db"
python $METAXCAN/software/Predict.py \
$MODEL/$MODEL_DB \
--model_db_path \
--model_db_snp_key rsid ${GENO_HEADER}.vcf.gz \
--vcf_genotypes \
--vcf_mode genotyped "{}_{}_{}_{}" \
--on_the_fly_mapping METADATA $OUTPUT/${MODLEFT}__predict.txt \
--prediction_output $OUTPUT/${MODLEFT}__summary.txt \
--prediction_summary_output --throw
compare with previous prediction of Ac
<- vroom::vroom(glue("{OUTPUT}/Ac-hki-large_geno__predict.txt")) %>%
oldexpr select(-FID) %>% # Remove the FID column
mutate(IID = str_split(IID, "_", simplify = TRUE)[, 1]) # Keep the first part of IID
for(tis in tissuelist)
{<- vroom::vroom(glue("{OUTPUT}/{tis}-filtered__predict.txt")) %>%
new_pred_expr select(-FID) %>% # Remove the FID column
mutate(IID = str_split(IID, "_", simplify = TRUE)[, 1]) # Keep the first part of IID
=calc_cor_matched_cols(oldexpr,new_pred_expr)
kkhist(kk$cor,main=paste(tis, "vs. old Ac-hki-large_geno") )
mtext(paste(sum(kk$cor<0), "genes with negative cor") )
}
read phenotype and check that ids are included in genotype fam file
## DO EVAL #| eval: FALSE
## check whether ids in the phenotype file are in the genotype file
= read_tsv(glue("{INPUT}/data/phenotype/pheno.fam"),col_names = FALSE)
pheno = read_table(glue("{INPUT}/data/genotype/rat6k_autosome.fam"),col_names = FALSE)
fam if( pheno %>% filter(X1 %in% fam$X1) %>% nrow != nrow(pheno) ) message("WARNING: missing pheno ids with missing genotypes")
names(pheno) = c("FID","IID","bodylen","bmi")
read gene annotation
<- readRDS(glue("{INPUT}/data/expression/gene_annotation.RDS")) gene_annotation
calculate GRM
## - [ ] calculate GRM from genotype
## GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64" ## defined at the top
$GCTA --bfile $GENO0/rat6k_autosome --make-grm-bin --out $OUTPUT/rat6k_autosome
#Analysis finished at 00:31:49 CST on Fri Dec 01 2023
#Overall computational time: 1 minute 15 sec
calculate h2
## 1 after --mpheno will use bodylen as phenotype
$GCTA --grm $OUTPUT/rat6k_autosome --reml --pheno $INPUT/data/phenotype/pheno.fam --mpheno 1 --out $OUTPUT/bodylen_h2
## 2 after --mpheno will use bmi as phenotype
$GCTA --grm $OUTPUT/rat6k_autosome --reml --pheno $INPUT/data/phenotype/pheno.fam --mpheno 2 --out $OUTPUT/bmi_h2
read h2
= read_tsv(glue("{OUTPUT}/bodylen_h2.hsq")) %>% filter(Source=="V(G)/Vp")
tempo = tempo %>% pull(Variance)
bodylen_h2 = tempo %>% pull(SE)
bodylen_se = read_tsv(glue("{OUTPUT}/bmi_h2.hsq")) %>% filter(Source=="V(G)/Vp")
tempo = tempo %>% pull(Variance)
bmi_h2 = tempo %>% pull(SE) bmi_se
read grm matrix
<- read_GRMBin(glue("{OUTPUT}/rat6k_autosome.grm")) grm_mat
read predicted expression
= function(filename)
read_pred_expr
{##usage: Br_pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))
<- vroom::vroom(filename) %>%
pred_expr select(-FID) %>% # Remove the FID column
mutate(IID = str_split(IID, "_", simplify = TRUE)[, 1]) # Keep the first part of IID
pred_expr }
show calibration of type I error REMOVE FROM TUTORIAL
simulate genetic effect sizes for nsnp
## on the terminal generate
## $PLINK --bfile $INPUT/data/genotype/rat6k_autosome --freq --out rat6k_autosome
## simulate unrelated Y
= 100
nsim ## to get nsip, read bim file
#tempo = read_tsv(glue("{INPUT}/data/genotype/rat6k_autosome.bim"),col_names = FALSE)
= read_table(glue("{INPUT}/data/genotype/rat6k_autosome.frq"),col_names = TRUE)
tempo = tempo %>% select(SNP,A1,A2,MAF)
tempo = nrow(df_freq)
nsnp ##set.seed(29444)
= 32240
semilla ##semilla = round(runif(1)*1e5)
set.seed(semilla)
= matrix(rnorm(nsnp*nsim),nsnp,nsim)
scoremat ## divide by maf
= sweep(scoremat,1, sqrt(2*tempo$MAF*(1-tempo$MAF)), "/" )
scoremat = cbind(tempo, scoremat)
tempo write_tsv(tempo, file = glue("{OUTPUT}/sim/sim_weights-{semilla}.txt"), col_names = FALSE)
use plink to calculate the sim phenotype $y_ = _k X_k $
An alternative way to simulate Y with related individuals would be to simulate unrelated normal rv. for each rat, then pre-multiply by the correlation matrix \(\Sigma^{0.5}\). This approach would make it more obvious that premultiplying by \(\Sigma^{-0.5}\) would yield a an uncorrelated trait across rats.
# Calculate PRS using plink
# https://www.cog-genomics.org/plink/1.9/score
# --score my.scores 3 2 1
# reads variant IDs from column 3, allele codes from column 2, and scores from column 1.
# > head(tempo)
# A tibble: 6 × 6
# X1 X2 X3 X4 X5 X6
# <dbl> <chr> <dbl> <dbl> <chr> <chr>
#1 1 1_1643610_C_T 0 1643610 T C
#2 1 1_1646409_T_G 0 1646409 T G
## BEFORE 1/30/2024 --score $OUTPUT/sim/$WEIGHTS 2 6 $((6+SIMID)) \
## new column order
#> headleft(tempo,6) tempo has df with snp info and weights N(0,1)/sqrt(2 p (1-p))
# SNP A1 A2 MAF 1 2
#1 1_1643610_C_T T C 0.4521 0.01446167 -0.1048587
#2 1_1646409_T_G T G 0.1253 0.88769963 -0.7607204
#3 1_1658435_A_G G A 0.4511 2.66557665 -0.7946988
# Define the range of SIMID values
NSIM=100 # Replace with your actual value
SEMILLA=32240
WEIGHTS="sim_weights-$SEMILLA.txt"
time (
for ((SIMID=1; SIMID<=$NSIM; SIMID++)); do
$PLINK --bfile $INPUT/data/genotype/rat6k_autosome \
--score $OUTPUT/sim/$WEIGHTS 1 3 $((4+SIMID)) \
--out $OUTPUT/sim/tempo/PRS_output_$SIMID-$SEMILLA
# Print SIMID every 10 iterations
if ((SIMID % 10 == 0)); then
echo "Processed $SIMID simulations"
fi
done
)
#Processed 100 simulations
#real 2m1.887s
#user 1m45.025s
#sys 0m13.337s
format simulated scores into matrix
# semilla = 20
# set.seed(semilla)
= bodylen_h2
h2 ="-32240"
tailo
if(F)
{= suppressMessages(read_table(glue("{OUTPUT}/sim/tempo/PRS_output_1{tailo}.profile")))
simy = 100
nsim = matrix(NA,nrow(simy),nsim)
phenomat rownames(phenomat) = simy$IID
1] = simy$SCORE
phenomat[,for(simid in 2:nsim)
{= suppressMessages(read_table(glue("{OUTPUT}/sim/tempo/PRS_output_{simid}{tailo}.profile")))
simy = simy$SCORE
phenomat[,simid]
}
## scale phenomat
= scale(phenomat)
phenomat ## add noise
#nx = nrow(phenomat)
#phenomat = sqrt(h2) * phenomat + sqrt(1 - h2) * scale(matrix(rnorm(nx*nsim),nx, nsim))
= cbind(simy[,1:2], as.data.frame(phenomat))
simpheno ## CHECK WHAT NAME TO USE
#saveRDS(simpheno,file=glue("{OUTPUT}/sim/tempo/simpheno_{h2}-{tailo}.RDS"))
saveRDS(simpheno,file=glue("{OUTPUT}/sim/tempo/simpheno.RDS")) ## no error component
else
}
{= readRDS(file=glue("{OUTPUT}/sim/tempo/simpheno.RDS")) ## no error component
simpheno #simpheno=readRDS(file=glue("{OUTPUT}/sim/tempo/simpheno_{h2}-{tailo}.RDS"))
= ncol(simpheno) - 2 ## subtract FID, and IID columns
nsim }
visualize raw and corrected pvalues
<- function(tempres, post_titulo="",semilla="") {
myplot # Create a data frame with specific columns
<- data.frame(
df p0.01_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.01)),
p0.01_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.01)),
p0.05_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.05)),
p0.05_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.05)),
p0.10_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.10)),
p0.10_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.10))
# ... [rest of your code for creating df] ...
)
# Pivot the data frame to long format, specifying the columns to keep
<- pivot_longer(df, cols = starts_with("p"))
df_long
<- df_long %>% separate(name,into = c("threshold","corrected"),sep="_") %>% rename(proportion=value)
df_long
# Rename the name column to replace p0.xx with p<0.xx
<- df_long %>%
df_long mutate(threshold = gsub("p0\\.", "p<0.", threshold))
# Create boxplots with mean
<- ggplot(df_long, aes(x = threshold, y = proportion, fill = corrected)) +
pp geom_boxplot(alpha = 0.6) +
stat_summary(fun = mean, geom = "point", shape = 3, size = 2, stroke = 2, color = "blue", #position = position_dodge(width = 0.8)) +
position = position_dodge(width = -0.1)) +
#stat_summary(fun = mean, geom = "crossbar", size = .5, color = "blue") +
#stat_summary(fun = mean, geom = "crossbar", size = .5, color = "darkgray") +
geom_hline(yintercept = c(0.01, 0.05, 0.10), linetype = "dashed", color = "gray") +
theme_minimal(base_size = 15) +
#ggtitle(glue("Type I Error Calibration {semilla} {post_titulo}")) +
xlab("significance") + ylab("false positive rate")
pp
}#myplot(tempres_sim,post_titulo = glue("nsam:", {nsam},"\n"),semilla)
define function lmm with GRM
## HERE WE USE THE FULL GRM MATRIX AND CALCULATE THE INVERSE OF THE SIGMA MATRIX
## define lmm association function
= function(pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=6:6,out=NULL)
lmmGRM
{## input pheno is a data frame with id column pheno_id_col=1 by default
## phenotype values are in pheno_value_cols, 6:6 by default (SCORE column location in plink output), it can have more than one phenotype
## but h2 has to be the same, this is useful when running simulations with different h2
## call lmmXcan(pheno %>% select(IID,SCORE))
## format pheno to matrix form
<- as.matrix(pheno[,pheno_value_cols])
phenomat rownames(phenomat) <- pheno[[pheno_id_col]]
## turn pred_expr into matrix with rownames =IID, keep only IIDs in ymat
= as.matrix(pred_expr %>% select(-IID))
exp_mat rownames(exp_mat) = pred_expr$IID
## align pheno and expr matrices
= intersect(rownames(phenomat), rownames(exp_mat))
idlist
= length(idlist)
nsam
## CALCULATE SIGMA
= diag(rep(1,nsam))
ID_mat
#testing_scale_grm = TRUE
#if(testing_scale_grm) grm_mat = sweep( sweep(grm_mat,2, 1/sqrt(diag(grm_mat)),"*"), 1, 1/sqrt(diag(grm_mat)),"*")
= grm_mat[idlist,idlist] * h2 + (1 - h2) * ID_mat
Sigma
= eigen(Sigma)
Sig_eigen rownames(Sig_eigen$vectors) = rownames(Sigma)
= Sig_eigen$vectors %*% diag( 1 / sqrt( Sig_eigen$values ) ) %*% t(Sig_eigen$vectors)
isighalf
## perform raw association
= matrix_lm(phenomat[idlist,, drop = FALSE], exp_mat[idlist,])
cormat_raw = cor2pval(cormat_raw,nsam)
pmat_raw colnames(pmat_raw) <- gsub("cor_", "pval_", colnames(pmat_raw))
## perform corrected association
= matrix_lm(isighalf%*% phenomat[idlist,, drop = FALSE], isighalf %*% exp_mat[idlist,])
cormat_correct = cor2pval(cormat_correct,nsam)
pmat_correct colnames(pmat_correct) <- gsub("cor_", "pval_", colnames(pmat_correct))
if(!is.null(out))
{saveRDS(cormat_correct,file = glue("{out}_cormat_correct.RDS"))
saveRDS(pmat_correct, file = glue("{out}_pmat_correct.RDS"))
saveRDS(cormat_raw, file = glue("{out}_cormat_raw.RDS"))
saveRDS(pmat_raw, file = glue("{out}_pmat_raw.RDS"))
}= list(
res cormat_correct=cormat_correct,
pmat_correct=pmat_correct,
cormat_raw=cormat_raw,
pmat_raw=pmat_raw)
res
}
run ratxcan null regression
##pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))
= function(simpheno,h2)
add_noise
{= as.matrix(simpheno %>% select(-IID,-FID))
phenomat = scale(phenomat)
phenomat ## add noise
= nrow(phenomat)
nx =ncol(phenomat)
nsim= sqrt(h2) * phenomat + sqrt(1 - h2) * scale(matrix(rnorm(nx*nsim),nx, nsim))
phenomat cbind(simpheno[,1:2], as.data.frame(phenomat))
}
=FALSE
recalculatefor(h2 in c(0.1,0.2,0.4,0.6,0.8))
{cat("---",h2,"---\n")
if(recalculate)
{ =Sys.time()
tic<- lmmGRM(add_noise(simpheno,h2),
tempres pheno_id_col=1,
grm_mat, h2,pred_expr,pheno_value_cols=2+(1:nsim))
=Sys.time()
tocprint(toc - tic)
saveRDS(tempres,file=glue("{OUTPUT}/tempres-null-h2-{h2}.RDS"))
else tempres = readRDS(glue("{OUTPUT}/tempres-null-h2-{h2}.RDS"))
} myplot(tempres_sim)
ggsave(glue("{OUTPUT}/calib-figure-h2-{h2}.png"),width=6,height=5)
png(glue("{OUTPUT}/hist-p-corrected-{h2}.png"))
hist(tempres$pmat_correct,main=glue("corrected p-values - {h2}"))
dev.off()
png(glue("{OUTPUT}/hist-p-raw-{h2}.png"))
hist(tempres$pmat_raw,main=glue("raw p-values - {h2}"))
dev.off()
png(glue("{OUTPUT}/qqunif-compare-raw-corrected-{h2}.png"))
qqunif.compare(tempres$pmat_raw,tempres$pmat_correct,BF=FALSE,BF2=FALSE,main="qqunif {h2}")
dev.off()
}##%HERE
simulate \(Y = Sigma^{1/2}\epsilon\) and run assoc with expr_mat
=nrow(grm_mat)
nsam=1:nsam
ind#ind=1:1000
= grm_mat[ind,ind]
test_mat =nrow(test_mat)
nsam= test_mat * h2 + (1 - h2) * diag(rep(1,nsam))
Sigma = eigen(Sigma)
Sig_eigen rownames(Sig_eigen$vectors) = rownames(Sigma)
##sighalf = Sig_eigen$vectors %*% diag( sqrt( Sig_eigen$values ) ) %*% t(Sig_eigen$vectors)
## make this multiplication more efficient using sweep
= Sig_eigen$vectors %*% sweep(t(Sig_eigen$vectors),1,sqrt( Sig_eigen$values ),"*")
sighalf
for(ii in 1:10)
{= sighalf %*% matrix(rnorm(nsam * nsim), nsam, nsim)
sim_sigma_pheno =cbind(FID=rownames(sim_sigma_pheno),IID=rownames(sim_sigma_pheno),as.data.frame(sim_sigma_pheno))
sim_sigma_pheno
=Sys.time()
tic<- lmmGRM(sim_sigma_pheno,grm_mat, h2,pred_expr,pheno_id_col=1, pheno_value_cols=2+(1:nsim))
tempres_sigma_pheno =Sys.time()
toc- tic
toc <-myplot(tempres_sigma_pheno,post_titulo = glue("sigma_pheno n= {nsam} - ii={ii}"))
ppcat(ii,"\n")
print(pp)
ggsave(glue("{OUTPUT}/calib-sim-sigma-n{nsam}-ii{ii}.png"))
}
simulate \(Y = GRM^{1/2} h2 + (1 - h2) \epsilon\) and run assoc with expr_mat
=nrow(grm_mat)
nsam=1:nsam
ind#ind=1:1000
= grm_mat[ind,ind]
test_mat =nrow(test_mat)
nsam
= eigen(test_mat)
test_eigen rownames(test_eigen$vectors) = rownames(test_mat)
## show smallest eigenvalues of GRM
cat(sort(test_eigen$values) %>% head, "these numbers should be non negative \n")
## force eigenvalues to be nonnegative
$values = pmax(test_eigen$values,0)
test_eigen= test_eigen$vectors %*% sweep(t(test_eigen$vectors), 1, sqrt(test_eigen$values),"*")
grmhalf
for(ii in 1:10)
{## simulate phenomat as rv with h2*grm as cov + indep noise term (1-h2)
= sqrt(h2) * grmhalf %*% matrix(rnorm(nsam * nsim), nsam, nsim) + sqrt(1-h2) * matrix(rnorm(nsam * nsim), nsam, nsim)
sim_grm_pheno =cbind(FID=rownames(sim_grm_pheno),IID=rownames(sim_grm_pheno),as.data.frame(sim_grm_pheno))
sim_grm_pheno
=Sys.time()
tic<- lmmGRM(sim_grm_pheno,grm_mat, h2,pred_expr,pheno_id_col=1, pheno_value_cols=2+(1:nsim))
tempres_grm_pheno =Sys.time()
toc- tic
toc <-myplot(tempres_grm_pheno,post_titulo = glue("sigma_pheno n= {nsam} - ii={ii}"))
ppcat(ii,"\n")
print(pp)
ggsave(glue("{OUTPUT}/calib-sim-grm-n{nsam}-ii{ii}.png"))
}
run bodylen regression with BR expr lmmGRM
= read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))
pred_expr
= FALSE
recalculate
= "bodylen"
trait = bodylen_h2
h2 = bodylen_h2+bodylen_se
h2se ##pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=6:6,out=NULL
if(recalculate)
{=Sys.time()
tic<- lmmGRM(pheno,grm_mat, h2, pred_expr,pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
tempres_h2 =Sys.time()
toc- tic
toc saveRDS(tempres_h2,glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))
else
} <- readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))
tempres_h2
png(glue("{OUTPUT}/{trait}-BR-lmmGRM.png"))
qqunif.compare(tempres_h2$pmat_raw,tempres_h2$pmat_correct,main=glue("ratxcan {trait}") )
dev.off()
if(recalculate)
{=Sys.time()
tic<- lmmGRM(pheno,grm_mat, h2se,pred_expr,pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
tempres_h2se =Sys.time()
toc- tic
toc saveRDS(tempres_h2se,glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))
else
} <- readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS")) tempres_h2se
run bmi regression with lmmGRM
= FALSE
recalculate
<- bmi_h2
h2 = bmi_h2 + bmi_se
h2se = "bmi"
trait
= read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))
pred_expr
if(recalculate)
{=Sys.time()
tic= bmi_h2 + bmi_se
h2 <- lmmGRM(pheno,grm_mat, h2, pred_expr, pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
tempres_h2 =Sys.time()
toc- tic
toc saveRDS(tempres_h2,glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))
else tempres_h2 = readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))
}
png(glue("{OUTPUT}/{trait}-BR-lmmGRM.png"))
qqunif.compare(tempres_h2$pmat_correct,tempres_h2$pmat_raw,main=glue("ratxcan {trait}") )
dev.off()
if(recalculate)
{=Sys.time()
tic<- lmmGRM(pheno,grm_mat, h2se,pred_expr,pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
tempres_h2se =Sys.time()
toc- tic
toc saveRDS(tempres_h2se,glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))
else tempres_h2se = readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))
}
png(glue("{OUTPUT}/{trait}-BR-lmmGRM_h2_plus_se.png"))
qqunif.compare(tempres_h2se$pmat_correct,tempres_h2se$pmat_raw,main=glue("ratxcan {trait}") )
dev.off()
bodylen and bmi vs 5 brain region expr association
= FALSE
recalculate #tissuelist = c("AC", "IL", "LH", "PL", "VO")
for(trait in c("bodylen","bmi"))
{if(trait == "bmi") h2 = bmi_h2+bmi_se else
if(trait == "bodylen") h2 = bodylen_h2 + bodylen_se else stop("unknown trait")
print(trait)
for(tis in tissuelist)
{print(tis)
if(recalculate)
{= read_pred_expr(glue("{OUTPUT}/{tis}-filtered__predict.txt"))
pred_expr
## run lmmGRM
=Sys.time()
tic<-
tempres_h2se lmmGRM(pheno,grm_mat,
h2,
pred_expr,pheno_id_col=1,
pheno_value_cols=which(colnames(pheno)==trait) )
=Sys.time()
tocprint(toc - tic)
## save results
saveRDS(tempres_h2se,glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS"))
else
} = readRDS(glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS"))
tempres_h2se
png(glue("{OUTPUT}/{trait}-{tis}-lmmGRM_h2_plus_se.png"))
qqunif.compare(tempres_h2se$pmat_raw,tempres_h2se$pmat_correct,main=glue("ratxcan {trait} {tis}") )
dev.off()
}
}
download rat gene annotation and human phenomexcan
#ensembl = biomaRt::useMart("ensembl", dataset = "rnorvegicus_gene_ensembl")
#annotation = biomaRt::getBM(attributes = c('ensembl_gene_id','external_gene_name', 'start_position', 'end_position', 'chromosome_name'),mart = ensembl)
=FALSE
new_ensembl_queryif(new_ensembl_query)
{= biomaRt::useEnsembl(biomart='ensembl', dataset="hsapiens_gene_ensembl", mirror = "useast")
human #attributes <- listAttributes(human)
= c("ensembl_gene_id", "external_gene_name", "rnorvegicus_homolog_ensembl_gene", "rnorvegicus_homolog_associated_gene_name")
attributes = biomaRt::getBM(attributes, filters="with_rnorvegicus_homolog",values=TRUE, mart = human, uniqueRows=TRUE)
orth.rats saveRDS(orth.rats,file=glue("{INPUT}/data/expression/orth.rats.RDS"))
else
} = readRDS(file=glue("{INPUT}/data/expression/orth.rats.RDS"))
orth.rats
= readRDS(glue("{INPUT}/data/phenomexcan/phenomexcan_results_signs.RDS")) phenomexcan_results
create ratxcan association table to save as csv files
= function(tempres,out=NULL,phenoname)
generate_results_table
{## phenoname: "Body mass index (BMI) (21001_raw)" or "Standing height"
= tempres$pmat_correct
pmat = data.frame(gene=rownames(pmat), p_correc=pmat, cor_correct=tempres$cormat_correct, p_raw=tempres$pmat_raw)
df names(df) = c("gene","p_correct","cor_correct","p_raw")
rownames(df) = NULL
## annotate
= df %>%
df left_join(gene_annotation %>%
select(genename=gene_name,
gene_id,
chr,by=c("gene"="gene_id")) %>%
start), arrange(p_correct)
## write csv file
if(!is.null(out)) write_csv(df,file = out)
df
}
= data.frame()
df_all
="BR"
tis="bodylen"
trait#tissuelist = c("AC", "IL", "LH", "PL", "VO")
for(tis in c("BR",tissuelist))
{= glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS")
RDSpath = readRDS(RDSpath)
tempres = generate_results_table(tempres, #out=glue("{OUTPUT}/{trait}-results.csv"),
df phenoname="Standing height")
$trait=trait
df$tissue = tis
df= rbind(df_all,df)
df_all
}
="bmi"
traitfor(tis in c("BR",tissuelist))
{= glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS")
RDSpath = readRDS(RDSpath)
tempres = generate_results_table(tempres,
df ##out=glue("{OUTPUT}/{trait}-results.csv"),
phenoname="Body mass index (BMI) (21001_raw)")
$trait=trait
df$tissue = tis
df= rbind(df_all,df)
df_all
}
#saveRDS(df_all,file = glue("{OUTPUT}/all-assoc-results.RDS"))
## pivot wider
<- df_all %>% select(gene,trait,tissue,p_correct) %>% pivot_wider(
df_wider names_from = tissue,
values_from = p_correct,
id_cols = c(gene, trait)
)
= function(pvec)
acat
{= pvec[!is.na(pvec)]
pvec = sum( tan( (0.5 - pvec) *pi ) )
TT 5 - atan(TT / length(pvec)) / pi
.
}
$p_acat_5 = apply(df_wider %>% select(all_of(tissuelist)) ,1,function(x) acat(x) )
df_wider
$p_acat_6 = apply(df_wider %>% select(all_of(c("BR",tissuelist)),) ,1,function(x) acat(x) )
df_wider
= df_wider %>% left_join(gene_annotation %>% select(gene_id,gene_name,chr,start),by=c("gene"="gene_id")) df_wider
collect signs
## pivot wider
<- df_all %>% select(gene,trait,tissue,cor_correct) %>% pivot_wider(
df_wider_cor names_from = tissue,
values_from = cor_correct,
id_cols = c(gene, trait)
)
$max_value_original_sign = apply(df_wider_cor[, -c(1, 2)], 1, function(x) {
df_wider_cor= which.max(abs(x))
idx_max_magnitude return(x[idx_max_magnitude])
})
$best_sign = sign(df_wider_cor$max_value_original_sign)
df_wider_cor
for(ctrait in traitlist)
= df_wider_cor %>%
{tempo filter(trait == ctrait) %>%
inner_join(df_wider %>%
select(gene,trait,p_acat_6), by=c("gene"="gene","trait"="trait")) %>%
left_join(phenomexcan_results %>%
filter(phenotype==phenolist[ctrait]) %>%
select(p_human=pvalue,
hugo_gene=external_gene_name,
gene_id,rnorvegicus_homolog_ensembl_gene,
dir_effect_most_signif),by=c("gene"="rnorvegicus_homolog_ensembl_gene"),relationship = "many-to-many")
=tempo$p_acat_6 < 1e-4;
indcat("---",ctrait,"---\n")
print(fisher.test(tempo$best_sign[ind], tempo$dir_effect_most_signif[ind]))
}
no concordance of signs
$qval = NA
df_wider
## calculate qvalues
for(ctrait in traitlist)
{= df_wider$trait == ctrait
ind = qvalue(df_wider$p_acat_6[ind])
qq $qval[ind] = qq$qvalues
df_wider }
enrichment of human gene analysis
## annotate with human genes
## add human pvalues
#ctrait = "bmi"
for(ctrait in traitlist)
{= df_wider %>% filter(trait == ctrait) %>%
tempo left_join(phenomexcan_results %>%
filter(phenotype==phenolist[ctrait]) %>%
select(p_human=pvalue,hugo_gene=external_gene_name,
gene_id,rnorvegicus_homolog_ensembl_gene, dir_effect_most_signif), by=c("gene"="rnorvegicus_homolog_ensembl_gene"),
relationship = "many-to-many")
= 17573
mhuman = nrow(tempo)
mrat
cat("fisher test",ctrait,"p_acat_6 \n")
for(thres in c(0.01,0.05,0.1))
{cat('---',thres,'---\n')
with(tempo, print(fisher.test(p_acat_6<thres/mrat,p_human<thres/mhuman)) )
with(tempo, print(table(p_acat_6<thres/mrat,p_human<thres/mhuman)) )
}
cat("fisher test",ctrait,"p_acat_5 \n")
for(thres in c(0.01,0.05,0.1))
{cat('---',thres,'---\n')
if(thres==0.05) cat('---\n--- UPDATE PAPER WITH THIS 0.05 ---\n---\n')
with(tempo, print(fisher.test(p_acat_5<thres/mrat,p_human<thres/mhuman)) )
with(tempo, print(table(p_acat_5<thres/mrat,p_human<thres/mhuman)) )
}
= tempo$p_acat_6 < 0.05/mrat
ind # ind = tempo$p_acat_6 < p_fdr05
print(ctrait)
qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,main=glue("human gene assoc for rat {ctrait} genes"),cex2=1.5)
png(glue("{OUTPUT}/qqunif-compare-{ctrait}.png"),width = 640,height = 480)
qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,col='maroon1',col2='dodgerblue',cex2=1.5)
dev.off()
cat("wilcoxon test")
print(wilcox.test(tempo$p_human[!ind],tempo$p_human[ind]))
= tempo$p_acat_5 < 0.05/(tempo %>% filter(!is.na(p_acat_5)) %>% nrow()) & !is.na(tempo$p_acat_5)
ind # ind = tempo$p_acat_6 < p_fdr05
print(ctrait)
qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,main=glue("human gene assoc for rat {ctrait} genes"),cex2=1.5)
png(glue("{OUTPUT}/qqunif-compare-{ctrait}-p_acat_05.png"),width = 640,height = 480)
qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,col='maroon1',col2='dodgerblue',cex2=1.5)
dev.off()
cat("wilcoxon test")
print(wilcox.test(tempo$p_human[!ind],tempo$p_human[ind]))
if(F) write_csv(tempo %>% arrange(p_acat_6) %>%
select(gene_name,p_acat_6,chr,start,p_human,hugo_gene,
trait,gene,gene_id,BR,AC,IL,LH,PL,VO,p_acat_5,qval),file = glue("{OUTPUT}/ratxcan-{ctrait}-results-qval.csv"))
}
enrichment of Crouse’s consensus genes
<- readxl::read_excel(glue("{INPUT}/data/expression/Table S10 Fat_consensus_alpha_0.01 PG.xlsx"),skip=1)
consensus_df
dim(consensus_df)
## enrichment of consensus genes among bmi associated genes in rats
= df_wider %>% filter(trait=="bmi") %>% left_join(consensus_df %>% select(ensembl_gene_id, description...3),by=c("gene"="ensembl_gene_id")) %>% select(gene,p_acat_6,description...3) %>% mutate(consensus = !is.na(description...3))
tempo
qqunif.compare(tempo$p_acat_6,tempo$p_acat_6[tempo$consensus], main="no enrich. of consensus genes in rat bmi genes")
## enrichment of consensus genes among bmi associated genes in humans
= phenomexcan_results %>% filter(phenotype=="Body mass index (BMI) (21001_raw)") %>% left_join(consensus_df %>% select(ensembl_gene_id, description...3),by=c("rnorvegicus_homolog_ensembl_gene"="ensembl_gene_id")) %>% mutate(consensus = !is.na(description...3)) %>% select(gene=rnorvegicus_homolog_ensembl_gene,p_human=pvalue,consensus)
tempo
qqunif.compare(tempo$p_human,tempo$p_human[tempo$consensus], main="no enrich. of consensus genes in human bmi genes")
Crouse et al’s consensus genes are not enriched among human nor rat BMI genes
summarize results for the paper
= 0.05/ (df_wider %>% filter(trait==ctrait) %>% filter(!is.na(p_acat_6)) %>% nrow())
BF_thres for(ctrait in traitlist)
{cat("BF threshold ", signif(BF_thres,3),"\n")
cat("---",ctrait,"---\n")
= df_wider %>%
res filter(trait==ctrait) %>%
filter(p_acat_6 < BF_thres) %>%
select(trait,p_acat_6,gene_name,chr,start,gene) %>%
left_join(phenomexcan_results %>% filter(phenotype==phenolist[ctrait]) %>% select(rnorvegicus_homolog_ensembl_gene,p_human=pvalue,hugo=gene_name) , by=c("gene"="rnorvegicus_homolog_ensembl_gene")) %>%
arrange(p_acat_6)
print(res)
= res %>% select(gene_name) %>% unique() %>% nrow()
res cat("---\n")
cat(res," BF significant genes\n")
}
= (df_wider %>% filter(trait==ctrait) %>% filter(!is.na(p_acat_5)) %>% nrow())
ntests = 0.05/ntests
BF_thres for(ctrait in traitlist)
{cat("BF threshold ", signif(BF_thres,3),"\n")
cat("---",ctrait,"--- p_acat_5 - UPDATE PAPER WITH THIS\n")
= df_wider %>%
res filter(trait==ctrait) %>%
filter(p_acat_5 < BF_thres) %>%
select(trait,p_acat_5,gene_name,chr,start,gene) %>%
left_join(phenomexcan_results %>% filter(phenotype==phenolist[ctrait]) %>% select(rnorvegicus_homolog_ensembl_gene,p_human=pvalue,hugo=gene_name) , by=c("gene"="rnorvegicus_homolog_ensembl_gene")) %>%
arrange(p_acat_5)
print(res)
= res %>% select(gene_name) %>% unique() %>% nrow()
res cat("---\n")
cat(res," BF significant genes p_acat_5 - UPDATE PAPER WITH THIS\n")
}
cat(BF_thres," BF threshold for p_acat_5 - UPDATE PAPER WITH THIS\n")
cat(ntests," number of tests p_acat_5 - UPDATE PAPER WITH THIS\n")
gene set enrichment
= "bmi"
ctrait = 1e-2
thres
= df_wider %>% filter(trait == ctrait) %>% filter(p_acat_6 < thres) %>% pull(gene)
top_genes_thres write(top_genes_thres,file="~/Downloads/top_genes_thres.txt")
= df_wider %>% filter(trait == ctrait) %>% arrange(p_acat_6) %>% pull(gene) %>% head(500)
top_genes_n write(top_genes_n,file="~/Downloads/top_genes_n.txt")
#top_genes_hugo_thres = df_wider %>% filter(trait == ctrait) %>% filter(p_acat_6 < thres) %>% inner_join(orth.rats,by=c("gene"="rnorvegicus_homolog_ensembl_gene")) %>% pull(external_gene_name)
#write(top_genes_hugo,file="~/Downloads/top_genes_hugo_thres.txt")
suppressMessages(library(gprofiler2))
<- gost(query = top_genes_thres,organism="rnorvegicus")
gostres #gostres <- gost(query = top_genes_thres,organism="hsapiens")
gostplot(gostres)
define ggplot based manhattan plotting function
## here
## qq_manhattan(tempo %>% rename(pvalue=p_acat_6))
<- function(df, titulo="",significance_threshold = 0.05) {
gg_manhattan ## USAGE: gg_manhattan(df,0.05)
## df has columns: pvalue, chr (numeric), and start (position)
## significance threshold gets divided by the number of tests
##
# Calculate cumulative base pair positions
<- df %>%
data_cum group_by(chr) %>%
summarise(max_bp = as.numeric(max(start)), .groups = 'drop') %>%
mutate(bp_add = lag(cumsum(max_bp), default = 0))
<- df %>%
gwas_data inner_join(data_cum, by = "chr") %>%
mutate(bp_cum = start + bp_add)
# Calculate axis labels
<- gwas_data %>%
axis_set group_by(chr) %>%
summarize(center = mean(bp_cum), .groups = 'drop')
# Determine the ylim based on the most significant p-value
<- gwas_data %>%
ylim filter(pvalue == min(pvalue)) %>%
summarise(ylim = abs(floor(log10(pvalue))) + 2) %>%
pull(ylim)
# Calculate the genome-wide significance level
<- significance_threshold / nrow(df)
sig
# Construct the Manhattan plot
<- ggplot(gwas_data, aes(x = bp_cum, y = -log10(pvalue), color = as.factor(chr), size = -log10(pvalue))) +
manhattan_plot geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
geom_hline(yintercept = -log10(0.0001), color = "red", linetype = "dashed") +
geom_point(alpha = 0.75, shape = 19) + # Simplified shape decision for clarity
geom_label_repel(aes(label = ifelse(pvalue <= sig, gene_name, "")), size = 3) +
ylim(c(0, ylim)) +
scale_x_continuous(labels = axis_set$chr, breaks = axis_set$center) +
scale_color_manual(values = rep(c("dodgerblue4", "midnightblue"), length(unique(axis_set$chr)))) +
scale_size_continuous(range = c(0.5, 3)) +
labs(x = NULL, y = expression(-log[10](italic(p)))) +
theme_minimal() +
theme(legend.position = "none",
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.text.x = element_text(angle = 90, size = 12),
axis.text.y = element_text(size = 12, vjust = 0),
axis.title = element_text(size = 20))
if(titulo !="") manhattan_plot = manhattan_plot + ggtitle(titulo)
return(manhattan_plot)
}
plot manhattan plots of bmi and body length
for(ctrait in traitlist)
{= df_wider %>% filter(trait==ctrait) %>% mutate(chr=as.numeric(chr))
tempo = gg_manhattan(tempo %>% rename(pvalue=p_acat_6), titulo=ctrait)
gg print(gg)
ggsave(glue("{OUTPUT}/{ctrait}-manhattan-p_acat_6.png"))
}
for(ctrait in traitlist)
{= df_wider %>% filter(trait==ctrait) %>% mutate(chr=as.numeric(chr))
tempo = gg_manhattan(tempo %>% rename(pvalue=p_acat_5) %>% filter(!is.na(pvalue)), titulo=ctrait)
gg print(gg)
ggsave(glue("{OUTPUT}/{ctrait}-manhattan-p_acat_5.png"))
}
double checking that the saved qvalues are correct
= read_tsv("/Users/haekyungim/Downloads/ratxcan-bmi-results - ratxcan-bmi-results(1).tsv")
kk5 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bodylen-results - ratxcan-bodylen-results(1).tsv")
kk6 all.equal(qvalue(kk5$p_acat_6)$qvalues,kk5$qval)
all.equal(qvalue(kk6$p_acat_6)$qvalues,kk6$qval)
plot(qvalue(kk5$p_acat_6)$qvalues,kk5$qval); abline(0,1)
plot(qvalue(kk6$p_acat_6)$qvalues,kk6$qval); abline(0,1)
# kk1 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bmi-results - ratxcan-bmi-results.tsv")
# kk2 = read_tsv("/Users/haekyungim/Downloads/with-qval-ratxcan-bmi-results - ratxcan-bmi-results.tsv")
# kk3 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bodylen-results - ratxcan-bodylen-results.tsv")
# kk4 = read_tsv("/Users/haekyungim/Downloads/with-qval-ratxcan-bodylen-results - ratxcan-bodylen-results.tsv")
# all.equal(kk1$p_acat_6,kk2$p_acat_6)
# all.equal(kk3$p_acat_6,kk4$p_acat_6)
#
# all.equal(qvalue(kk1$p_acat_6)$qvalues,kk2$qval)
# all.equal(qvalue(kk3$p_acat_6)$qvalues,kk4$qval)
## small difference between qvalues were calculated before joining with human data
plot(qvalue(kk1$p_acat_6)$qvalues,kk2$qval); abline(0,1)
plot(qvalue(kk3$p_acat_6)$qvalues,kk4$qval); abline(0,1)
No matching items