RatXcan analysis Figure 3 to 4

tutorial for running ratxcan given genotype, phenotype, and prediction weights as input
Author

Haky Im

Published

November 27, 2023

top

input data

  • genotype
  • phenotype
  • prediction weights

load libraries and functions

#options(error=recover)
#options(error=browser)
options(error=NULL)

## compare observed correlation with null correlation
suppressMessages(devtools::source_gist("a925fea01b365a8c605e")) ## load qqR fn https://gist.github.com/hakyim/a925fea01b365a8c605e
suppressMessages(devtools::source_gist("38431b74c6c0bf90c12f")) ## qqunif https://gist.github.com/hakyim/38431b74c6c0bf90c12f
suppressMessages(devtools::source_gist("115403f16bec0a0e871f3616d552ce9b")) ## source ratxcan functions https://gist.github.com/hakyim/115403f16bec0a0e871f3616d552ce9b 

suppressMessages(library(tidyverse))
suppressMessages(library(glue))
suppressMessages(library(RSQLite))
#suppressMessages(library(expm))
#suppressMessages(library(readxl))
# install.packages("devtools")
# library("devtools")
# install_github("jdstorey/qvalue")
suppressMessages(library(qvalue))
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("biomaRt")
##suppressMessages(library(biomaRt))
##install.packages("ggrepel")
suppressMessages(library(ggrepel))


traitlist = c("bodylen","bmi")
tissuelist = c("AC", "IL", "LH", "PL", "VO")
phenolist = c("Standing height","Body mass index (BMI) (21001_raw)")
names(phenolist) = c("bodylen","bmi")

download data for the tutorial

  • download files from here The folder structure should look like the list below
(base) MBP-HKI-22 ratxcan-tutorial $ tree -L 3
.
├── data
│  ├── expression
│   │   └── gene_annotation.RDS
│   ├── genotype
│   │   ├── DELETE
│   │   ├── rat6k_autosome.bed
│   │   ├── rat6k_autosome.bim
│   │   └── rat6k_autosome.fam
│   ├── phenomexcan
│   │   └── phenomexcan_results.RDS
│   └── phenotype
│       └── pheno.fam
├── models
│   ├── Ac_best_prediXcan_db.db
│   ├── Br-hki.db
│   ├── Il_best_prediXcan_db.db
│   ├── Lh_best_prediXcan_db.db
│   ├── Pl_best_prediXcan_db.db
│   └── Vo_best_prediXcan_db.db
└── software
    └── MetaXcan
        ├── CODE_OF_CONDUCT.md
        ├── DevNotes.Rmd
        ├── LICENSE
        ├── README.md
        ├── codemap
        ├── papers
        └── software

define data and software paths for R

WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"

PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"
GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"

INPUT <- glue("{WEBDATA}/ratxcan-tutorial") ## this has the input data 
OUTPUT <- glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/scratch") ## this has the output data, intermediate results

define data and software for the terminal

WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"

PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"
GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"
  
INPUT=$WEBDATA/ratxcan-tutorial
OUTPUT=$WEBDATA/2023-11-27-ratxcan-tutorial/scratch

set up conda environment to run predict expression

conda create -n rat311 python=3.11
conda activate rat311
conda install pandas scipy numpy statsmodels
conda install h5py
## missing modules probably due to M2 processor
## NOT NEEDED conda install bgen_reader
pip install pyliftover
pip install cyvcf2  ## pip intall worked on my macbook M2

predict expression

#WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
#PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"
#GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"
#INPUT=$WEBDATA/ratxcan-tutorial
#OUTPUT=$WEBDATA/2023-11-27-ratxcan-tutorial/scratch

## convert plink file to vcf
#$PLINK --bfile $GENO_HEADER --recode vcf --out $GENO_HEADER
#gzip $GENO_HEADER.vcf ## to keep file small

conda activate rat311

METAXCAN=$INPUT/software/MetaXcan
GENO=$INPUT/data/genotype
GENO_HEADER=$GENO/rat6k_autosome
MODEL=$INPUT/models

## predict Ac
MODLEFT="VO-filtered"
MODEL_DB="$MODLEFT.db"
python $METAXCAN/software/Predict.py \
--model_db_path $MODEL/$MODEL_DB \
--model_db_snp_key rsid \
--vcf_genotypes ${GENO_HEADER}.vcf.gz \
--vcf_mode genotyped \
--on_the_fly_mapping METADATA "{}_{}_{}_{}" \
--prediction_output $OUTPUT/${MODLEFT}__predict.txt  \
--prediction_summary_output $OUTPUT/${MODLEFT}__summary.txt \
--throw

compare with previous prediction of Ac

oldexpr <- vroom::vroom(glue("{OUTPUT}/Ac-hki-large_geno__predict.txt")) %>% 
  select(-FID) %>%  # Remove the FID column
  mutate(IID = str_split(IID, "_", simplify = TRUE)[, 1])  # Keep the first part of IID

for(tis in tissuelist)
{
  new_pred_expr <- vroom::vroom(glue("{OUTPUT}/{tis}-filtered__predict.txt")) %>% 
  select(-FID) %>%  # Remove the FID column
  mutate(IID = str_split(IID, "_", simplify = TRUE)[, 1])  # Keep the first part of IID

  kk=calc_cor_matched_cols(oldexpr,new_pred_expr)
  hist(kk$cor,main=paste(tis, "vs. old Ac-hki-large_geno") )
  mtext(paste(sum(kk$cor<0), "genes with negative cor") )

}

read phenotype and check that ids are included in genotype fam file

## DO EVAL #| eval: FALSE
## check whether ids in the phenotype file are in the genotype file
pheno = read_tsv(glue("{INPUT}/data/phenotype/pheno.fam"),col_names = FALSE)
fam = read_table(glue("{INPUT}/data/genotype/rat6k_autosome.fam"),col_names = FALSE)
if( pheno %>% filter(X1 %in% fam$X1) %>% nrow != nrow(pheno) ) message("WARNING: missing pheno ids with missing genotypes") 
names(pheno) = c("FID","IID","bodylen","bmi")

read gene annotation

gene_annotation <- readRDS(glue("{INPUT}/data/expression/gene_annotation.RDS"))

calculate GRM

## - [ ] calculate GRM from genotype
## GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64" ## defined at the top
$GCTA --bfile $GENO0/rat6k_autosome --make-grm-bin --out $OUTPUT/rat6k_autosome
#Analysis finished at 00:31:49 CST on Fri Dec 01 2023
#Overall computational time: 1 minute 15 sec

calculate h2

## 1 after --mpheno will use bodylen as phenotype
$GCTA --grm $OUTPUT/rat6k_autosome --reml --pheno $INPUT/data/phenotype/pheno.fam --mpheno 1 --out $OUTPUT/bodylen_h2
## 2 after --mpheno will use bmi as phenotype
$GCTA --grm $OUTPUT/rat6k_autosome --reml --pheno $INPUT/data/phenotype/pheno.fam --mpheno 2 --out $OUTPUT/bmi_h2

read h2

tempo = read_tsv(glue("{OUTPUT}/bodylen_h2.hsq")) %>% filter(Source=="V(G)/Vp") 
bodylen_h2 = tempo %>% pull(Variance)
bodylen_se = tempo %>% pull(SE)
tempo = read_tsv(glue("{OUTPUT}/bmi_h2.hsq")) %>% filter(Source=="V(G)/Vp") 
bmi_h2 = tempo %>% pull(Variance)
bmi_se = tempo %>% pull(SE)

read grm matrix

grm_mat <- read_GRMBin(glue("{OUTPUT}/rat6k_autosome.grm"))

read predicted expression

read_pred_expr = function(filename)
{
  ##usage: Br_pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))
  pred_expr <- vroom::vroom(filename) %>% 
  select(-FID) %>%  # Remove the FID column
  mutate(IID = str_split(IID, "_", simplify = TRUE)[, 1])  # Keep the first part of IID
  pred_expr
}

show calibration of type I error REMOVE FROM TUTORIAL

simulate genetic effect sizes for nsnp

## on the terminal generate
## $PLINK --bfile $INPUT/data/genotype/rat6k_autosome --freq --out rat6k_autosome 
## simulate unrelated Y
nsim = 100 
## to get nsip, read bim file
#tempo = read_tsv(glue("{INPUT}/data/genotype/rat6k_autosome.bim"),col_names = FALSE)

tempo = read_table(glue("{INPUT}/data/genotype/rat6k_autosome.frq"),col_names = TRUE)
tempo = tempo %>% select(SNP,A1,A2,MAF)
nsnp = nrow(df_freq)
##set.seed(29444)
semilla = 32240
##semilla = round(runif(1)*1e5)
set.seed(semilla)
scoremat = matrix(rnorm(nsnp*nsim),nsnp,nsim)
## divide by maf
scoremat = sweep(scoremat,1, sqrt(2*tempo$MAF*(1-tempo$MAF)), "/" )
tempo = cbind(tempo, scoremat)
write_tsv(tempo, file = glue("{OUTPUT}/sim/sim_weights-{semilla}.txt"), col_names = FALSE) 

format simulated scores into matrix

# semilla = 20
# set.seed(semilla) 
h2 = bodylen_h2
tailo="-32240"

if(F)
{
  simy = suppressMessages(read_table(glue("{OUTPUT}/sim/tempo/PRS_output_1{tailo}.profile")))
  nsim = 100
  phenomat = matrix(NA,nrow(simy),nsim)
  rownames(phenomat) = simy$IID
  phenomat[,1] = simy$SCORE
  for(simid in 2:nsim)
  {
    simy = suppressMessages(read_table(glue("{OUTPUT}/sim/tempo/PRS_output_{simid}{tailo}.profile")))
    phenomat[,simid] = simy$SCORE
  }
  
  ## scale phenomat
  phenomat = scale(phenomat)
  ## add noise
  #nx = nrow(phenomat)
  #phenomat = sqrt(h2) * phenomat + sqrt(1 - h2) * scale(matrix(rnorm(nx*nsim),nx, nsim))
  
  simpheno = cbind(simy[,1:2], as.data.frame(phenomat))
  ## CHECK WHAT NAME TO USE
  #saveRDS(simpheno,file=glue("{OUTPUT}/sim/tempo/simpheno_{h2}-{tailo}.RDS"))
  saveRDS(simpheno,file=glue("{OUTPUT}/sim/tempo/simpheno.RDS")) ## no error component
} else 
{
  simpheno = readRDS(file=glue("{OUTPUT}/sim/tempo/simpheno.RDS")) ## no error component
  #simpheno=readRDS(file=glue("{OUTPUT}/sim/tempo/simpheno_{h2}-{tailo}.RDS"))
  nsim = ncol(simpheno) - 2 ## subtract FID, and IID columns
}

visualize raw and corrected pvalues

myplot <- function(tempres, post_titulo="",semilla="") {
  # Create a data frame with specific columns
  df <- data.frame(
    p0.01_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.01)),
    p0.01_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.01)),
    p0.05_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.05)),
    p0.05_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.05)),
    p0.10_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.10)), 
    p0.10_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.10))
    # ... [rest of your code for creating df] ...
  )

# Pivot the data frame to long format, specifying the columns to keep
df_long <- pivot_longer(df, cols = starts_with("p"))

df_long <- df_long %>% separate(name,into = c("threshold","corrected"),sep="_") %>% rename(proportion=value)

# Rename the name column to replace p0.xx with p<0.xx
df_long <- df_long %>%
  mutate(threshold = gsub("p0\\.", "p<0.", threshold))

  # Create boxplots with mean
  pp <- ggplot(df_long, aes(x = threshold, y = proportion, fill = corrected)) +
    geom_boxplot(alpha = 0.6) +
    stat_summary(fun = mean, geom = "point", shape = 3, size = 2, stroke = 2, color = "blue",                  #position = position_dodge(width = 0.8)) +
                 position = position_dodge(width = -0.1)) +
    #stat_summary(fun = mean, geom = "crossbar",  size = .5, color = "blue") +
    #stat_summary(fun = mean, geom = "crossbar",  size = .5, color = "darkgray") +
    geom_hline(yintercept = c(0.01, 0.05, 0.10), linetype = "dashed", color = "gray") +
    theme_minimal(base_size = 15) +
    #ggtitle(glue("Type I Error Calibration {semilla} {post_titulo}")) +
    xlab("significance") + ylab("false positive rate")

  pp
}
#myplot(tempres_sim,post_titulo = glue("nsam:", {nsam},"\n"),semilla)

define function lmm with GRM

## HERE WE USE THE FULL GRM MATRIX AND CALCULATE THE INVERSE OF THE SIGMA MATRIX
## define lmm association function 
lmmGRM = function(pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=6:6,out=NULL)
{
  ## input pheno is a data frame with id column pheno_id_col=1 by default
  ## phenotype values are in pheno_value_cols, 6:6 by default (SCORE column location in plink output), it can have more than one phenotype
  ## but h2 has to be the same, this is useful when running simulations with different h2
  ## call lmmXcan(pheno %>% select(IID,SCORE))
  
  ## format pheno to matrix form
  phenomat <- as.matrix(pheno[,pheno_value_cols])
  rownames(phenomat) <- pheno[[pheno_id_col]]
  
  ## turn pred_expr into matrix with rownames =IID, keep only IIDs in ymat
  exp_mat = as.matrix(pred_expr %>% select(-IID))
  rownames(exp_mat) = pred_expr$IID

  ## align pheno and expr matrices
  idlist = intersect(rownames(phenomat), rownames(exp_mat))
  
  nsam = length(idlist)
  
  ## CALCULATE SIGMA
  ID_mat = diag(rep(1,nsam))
  
  #testing_scale_grm = TRUE
  #if(testing_scale_grm) grm_mat = sweep( sweep(grm_mat,2, 1/sqrt(diag(grm_mat)),"*"), 1, 1/sqrt(diag(grm_mat)),"*")    
  
  Sigma = grm_mat[idlist,idlist] * h2 + (1 - h2) * ID_mat
  
  Sig_eigen = eigen(Sigma)
  rownames(Sig_eigen$vectors) = rownames(Sigma)
  
  isighalf = Sig_eigen$vectors %*% diag( 1 / sqrt(  Sig_eigen$values  ) ) %*% t(Sig_eigen$vectors)
  
  ## perform raw association
  cormat_raw = matrix_lm(phenomat[idlist,, drop = FALSE], exp_mat[idlist,])
  pmat_raw = cor2pval(cormat_raw,nsam)
  colnames(pmat_raw) <- gsub("cor_", "pval_", colnames(pmat_raw))
  
  ## perform corrected association
  cormat_correct = matrix_lm(isighalf%*% phenomat[idlist,, drop = FALSE], isighalf %*% exp_mat[idlist,])
  pmat_correct = cor2pval(cormat_correct,nsam)
  colnames(pmat_correct) <- gsub("cor_", "pval_", colnames(pmat_correct))
  
  if(!is.null(out))
  {
    saveRDS(cormat_correct,file = glue("{out}_cormat_correct.RDS"))
    saveRDS(pmat_correct,  file = glue("{out}_pmat_correct.RDS"))
    saveRDS(cormat_raw,    file = glue("{out}_cormat_raw.RDS"))
    saveRDS(pmat_raw,      file = glue("{out}_pmat_raw.RDS"))
  }
  res = list(
    cormat_correct=cormat_correct, 
    pmat_correct=pmat_correct, 
    cormat_raw=cormat_raw, 
    pmat_raw=pmat_raw)
  res
  
}

run ratxcan null regression

##pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))

add_noise = function(simpheno,h2)
{
  phenomat = as.matrix(simpheno %>% select(-IID,-FID))
  phenomat = scale(phenomat)
  ## add noise
  nx = nrow(phenomat)
  nsim=ncol(phenomat)
  phenomat = sqrt(h2) * phenomat + sqrt(1 - h2) * scale(matrix(rnorm(nx*nsim),nx, nsim))
  cbind(simpheno[,1:2], as.data.frame(phenomat))
}

recalculate=FALSE
for(h2 in c(0.1,0.2,0.4,0.6,0.8))
{
  cat("---",h2,"---\n")
  if(recalculate)
  { 
    tic=Sys.time()
    tempres <- lmmGRM(add_noise(simpheno,h2),
                      grm_mat, h2,pred_expr,pheno_id_col=1, 
                      pheno_value_cols=2+(1:nsim))
    toc=Sys.time()
    print(toc - tic)
    saveRDS(tempres,file=glue("{OUTPUT}/tempres-null-h2-{h2}.RDS"))
  } else tempres = readRDS(glue("{OUTPUT}/tempres-null-h2-{h2}.RDS"))
  myplot(tempres_sim)
  ggsave(glue("{OUTPUT}/calib-figure-h2-{h2}.png"),width=6,height=5)
  png(glue("{OUTPUT}/hist-p-corrected-{h2}.png"))
  hist(tempres$pmat_correct,main=glue("corrected p-values - {h2}"))
  dev.off()
  png(glue("{OUTPUT}/hist-p-raw-{h2}.png"))
  hist(tempres$pmat_raw,main=glue("raw p-values - {h2}"))
  dev.off()
  png(glue("{OUTPUT}/qqunif-compare-raw-corrected-{h2}.png"))
  qqunif.compare(tempres$pmat_raw,tempres$pmat_correct,BF=FALSE,BF2=FALSE,main="qqunif {h2}")
  dev.off()
}
##%HERE

simulate \(Y = Sigma^{1/2}\epsilon\) and run assoc with expr_mat

nsam=nrow(grm_mat)
ind=1:nsam
#ind=1:1000
test_mat = grm_mat[ind,ind]
nsam=nrow(test_mat)
Sigma = test_mat * h2 + (1 - h2) * diag(rep(1,nsam))
Sig_eigen = eigen(Sigma)
rownames(Sig_eigen$vectors) = rownames(Sigma)
##sighalf = Sig_eigen$vectors %*% diag( sqrt(  Sig_eigen$values  ) ) %*% t(Sig_eigen$vectors)
## make this multiplication more efficient using sweep
sighalf = Sig_eigen$vectors %*% sweep(t(Sig_eigen$vectors),1,sqrt(  Sig_eigen$values ),"*")

for(ii in 1:10)
{
sim_sigma_pheno = sighalf %*% matrix(rnorm(nsam * nsim), nsam, nsim) 
sim_sigma_pheno=cbind(FID=rownames(sim_sigma_pheno),IID=rownames(sim_sigma_pheno),as.data.frame(sim_sigma_pheno))


tic=Sys.time()
tempres_sigma_pheno <- lmmGRM(sim_sigma_pheno,grm_mat, h2,pred_expr,pheno_id_col=1, pheno_value_cols=2+(1:nsim))
toc=Sys.time()
toc - tic
pp<-myplot(tempres_sigma_pheno,post_titulo = glue("sigma_pheno n= {nsam} - ii={ii}"))
cat(ii,"\n")
print(pp)
ggsave(glue("{OUTPUT}/calib-sim-sigma-n{nsam}-ii{ii}.png"))
}

simulate \(Y = GRM^{1/2} h2 + (1 - h2) \epsilon\) and run assoc with expr_mat

nsam=nrow(grm_mat)
ind=1:nsam
#ind=1:1000
test_mat = grm_mat[ind,ind]
nsam=nrow(test_mat)

test_eigen = eigen(test_mat)
rownames(test_eigen$vectors) = rownames(test_mat)
## show smallest eigenvalues of GRM
cat(sort(test_eigen$values) %>% head, "these numbers should be non negative \n")
## force eigenvalues to be nonnegative
test_eigen$values = pmax(test_eigen$values,0)
grmhalf = test_eigen$vectors %*% sweep(t(test_eigen$vectors), 1, sqrt(test_eigen$values),"*")

for(ii in 1:10)
{
  ## simulate phenomat as rv with h2*grm as cov + indep noise term (1-h2)
sim_grm_pheno = sqrt(h2) * grmhalf %*% matrix(rnorm(nsam * nsim), nsam, nsim) + sqrt(1-h2) * matrix(rnorm(nsam * nsim), nsam, nsim)
sim_grm_pheno=cbind(FID=rownames(sim_grm_pheno),IID=rownames(sim_grm_pheno),as.data.frame(sim_grm_pheno))

tic=Sys.time()
tempres_grm_pheno <- lmmGRM(sim_grm_pheno,grm_mat, h2,pred_expr,pheno_id_col=1, pheno_value_cols=2+(1:nsim))
toc=Sys.time()
toc - tic
pp<-myplot(tempres_grm_pheno,post_titulo = glue("sigma_pheno n= {nsam} - ii={ii}"))
cat(ii,"\n")
print(pp)
ggsave(glue("{OUTPUT}/calib-sim-grm-n{nsam}-ii{ii}.png"))
}

run bodylen regression with BR expr lmmGRM

pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))

recalculate = FALSE

trait = "bodylen"
h2 = bodylen_h2
h2se = bodylen_h2+bodylen_se
##pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=6:6,out=NULL
if(recalculate)
{
  tic=Sys.time()
  tempres_h2 <- lmmGRM(pheno,grm_mat, h2, pred_expr,pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
  toc=Sys.time()
  toc - tic
  saveRDS(tempres_h2,glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))
} else
tempres_h2 <- readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))

png(glue("{OUTPUT}/{trait}-BR-lmmGRM.png"))
qqunif.compare(tempres_h2$pmat_raw,tempres_h2$pmat_correct,main=glue("ratxcan {trait}") )
dev.off()

if(recalculate)
{
tic=Sys.time()
tempres_h2se <- lmmGRM(pheno,grm_mat, h2se,pred_expr,pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
toc=Sys.time()
toc - tic
saveRDS(tempres_h2se,glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))
} else
tempres_h2se <- readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))

run bmi regression with lmmGRM

recalculate = FALSE

h2 <- bmi_h2
h2se = bmi_h2 + bmi_se
trait = "bmi"

pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))

if(recalculate)
{
tic=Sys.time()
h2 = bmi_h2 + bmi_se
tempres_h2 <- lmmGRM(pheno,grm_mat, h2, pred_expr, pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
toc=Sys.time()
toc - tic
saveRDS(tempres_h2,glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))
} else tempres_h2 = readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))

png(glue("{OUTPUT}/{trait}-BR-lmmGRM.png"))
qqunif.compare(tempres_h2$pmat_correct,tempres_h2$pmat_raw,main=glue("ratxcan {trait}") )
dev.off()

if(recalculate)
{
tic=Sys.time()
tempres_h2se <- lmmGRM(pheno,grm_mat, h2se,pred_expr,pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
toc=Sys.time()
toc - tic
saveRDS(tempres_h2se,glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))
} else tempres_h2se = readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))
  
png(glue("{OUTPUT}/{trait}-BR-lmmGRM_h2_plus_se.png"))
qqunif.compare(tempres_h2se$pmat_correct,tempres_h2se$pmat_raw,main=glue("ratxcan {trait}") )
dev.off()

bodylen and bmi vs 5 brain region expr association

recalculate = FALSE
#tissuelist = c("AC", "IL", "LH", "PL", "VO")

for(trait in c("bodylen","bmi"))
{
  if(trait == "bmi") h2 = bmi_h2+bmi_se else
  if(trait == "bodylen") h2 = bodylen_h2 + bodylen_se else stop("unknown trait")
  
  print(trait)
  for(tis in tissuelist)
  {
    print(tis)
    if(recalculate)
    {
      pred_expr = read_pred_expr(glue("{OUTPUT}/{tis}-filtered__predict.txt"))
      
      ## run lmmGRM
      tic=Sys.time()
      tempres_h2se <- 
        lmmGRM(pheno,grm_mat,
               h2,
               pred_expr,
               pheno_id_col=1,
               pheno_value_cols=which(colnames(pheno)==trait) )
      toc=Sys.time()
      print(toc - tic)
      
      ## save results
      saveRDS(tempres_h2se,glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS"))
      
    } else
      tempres_h2se = readRDS(glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS"))
    
    png(glue("{OUTPUT}/{trait}-{tis}-lmmGRM_h2_plus_se.png"))
    qqunif.compare(tempres_h2se$pmat_raw,tempres_h2se$pmat_correct,main=glue("ratxcan {trait} {tis}") )
    dev.off()
  }

}

download rat gene annotation and human phenomexcan

#ensembl = biomaRt::useMart("ensembl", dataset = "rnorvegicus_gene_ensembl")
#annotation = biomaRt::getBM(attributes = c('ensembl_gene_id','external_gene_name', 'start_position', 'end_position', 'chromosome_name'),mart = ensembl)
new_ensembl_query=FALSE
if(new_ensembl_query)
{
  human = biomaRt::useEnsembl(biomart='ensembl', dataset="hsapiens_gene_ensembl", mirror = "useast")
#attributes <- listAttributes(human)
attributes = c("ensembl_gene_id", "external_gene_name", "rnorvegicus_homolog_ensembl_gene", "rnorvegicus_homolog_associated_gene_name")
orth.rats = biomaRt::getBM(attributes, filters="with_rnorvegicus_homolog",values=TRUE, mart = human, uniqueRows=TRUE)
saveRDS(orth.rats,file=glue("{INPUT}/data/expression/orth.rats.RDS"))
} else
  orth.rats = readRDS(file=glue("{INPUT}/data/expression/orth.rats.RDS"))

phenomexcan_results = readRDS(glue("{INPUT}/data/phenomexcan/phenomexcan_results_signs.RDS"))

create ratxcan association table to save as csv files

generate_results_table = function(tempres,out=NULL,phenoname)
{
  ## phenoname: "Body mass index (BMI) (21001_raw)" or "Standing height"
  pmat = tempres$pmat_correct
  df = data.frame(gene=rownames(pmat), p_correc=pmat, cor_correct=tempres$cormat_correct, p_raw=tempres$pmat_raw)
  names(df) = c("gene","p_correct","cor_correct","p_raw")
  rownames(df) = NULL
  
  ## annotate
  df = df %>% 
    left_join(gene_annotation %>% 
                select(genename=gene_name,
                       gene_id,
                       chr,
                       start), by=c("gene"="gene_id")) %>%
    arrange(p_correct)

  ## write csv file
  if(!is.null(out)) write_csv(df,file = out)
  
  df
}

df_all = data.frame()

tis="BR"
trait="bodylen"
#tissuelist = c("AC", "IL", "LH", "PL", "VO")

for(tis in c("BR",tissuelist))
{
  RDSpath = glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS")
  tempres = readRDS(RDSpath)
  df = generate_results_table(tempres, #out=glue("{OUTPUT}/{trait}-results.csv"),
                                    phenoname="Standing height")
  df$trait=trait
  df$tissue = tis
  df_all = rbind(df_all,df)
}

trait="bmi"
for(tis in c("BR",tissuelist))
{
  RDSpath = glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS")
  tempres = readRDS(RDSpath)
  df = generate_results_table(tempres, 
                                ##out=glue("{OUTPUT}/{trait}-results.csv"),
                                phenoname="Body mass index (BMI) (21001_raw)")
  df$trait=trait
  df$tissue = tis
  df_all = rbind(df_all,df)
}

#saveRDS(df_all,file = glue("{OUTPUT}/all-assoc-results.RDS"))

## pivot wider
df_wider <- df_all %>% select(gene,trait,tissue,p_correct)  %>% pivot_wider(
    names_from = tissue, 
    values_from = p_correct,
    id_cols = c(gene, trait)
  )


acat = function(pvec) 
{
  pvec = pvec[!is.na(pvec)]
  TT = sum( tan( (0.5 - pvec) *pi ) )
  .5 - atan(TT / length(pvec)) / pi
}

df_wider$p_acat_5 = apply(df_wider %>% select(all_of(tissuelist)) ,1,function(x) acat(x) ) 
 
df_wider$p_acat_6 = apply(df_wider %>% select(all_of(c("BR",tissuelist)),) ,1,function(x) acat(x) ) 

df_wider = df_wider %>% left_join(gene_annotation %>% select(gene_id,gene_name,chr,start),by=c("gene"="gene_id"))

collect signs

## pivot wider
df_wider_cor <- df_all %>% select(gene,trait,tissue,cor_correct)  %>% pivot_wider(
    names_from = tissue, 
    values_from = cor_correct,
    id_cols = c(gene, trait)
  )

df_wider_cor$max_value_original_sign = apply(df_wider_cor[, -c(1, 2)], 1, function(x) {
    idx_max_magnitude = which.max(abs(x))
    return(x[idx_max_magnitude])
})

df_wider_cor$best_sign = sign(df_wider_cor$max_value_original_sign)

for(ctrait in traitlist)
{tempo = df_wider_cor %>% 
  filter(trait == ctrait) %>% 
  inner_join(df_wider %>% 
               select(gene,trait,p_acat_6), by=c("gene"="gene","trait"="trait"))  %>%
  left_join(phenomexcan_results %>%
              filter(phenotype==phenolist[ctrait]) %>%
              select(p_human=pvalue,
                     hugo_gene=external_gene_name,
                     gene_id,rnorvegicus_homolog_ensembl_gene,
                     dir_effect_most_signif),
            by=c("gene"="rnorvegicus_homolog_ensembl_gene"),relationship = "many-to-many")

ind=tempo$p_acat_6 < 1e-4;
cat("---",ctrait,"---\n")
print(fisher.test(tempo$best_sign[ind], tempo$dir_effect_most_signif[ind]))
}

no concordance of signs

df_wider$qval = NA

## calculate qvalues
for(ctrait in traitlist)
{
  ind = df_wider$trait == ctrait
  qq = qvalue(df_wider$p_acat_6[ind])
  df_wider$qval[ind] = qq$qvalues
}

enrichment of human gene analysis

## annotate with human genes

## add human pvalues
#ctrait = "bmi"
for(ctrait in traitlist)
{
  tempo = df_wider %>% filter(trait == ctrait) %>%
  left_join(phenomexcan_results %>% 
              filter(phenotype==phenolist[ctrait]) %>%
              select(p_human=pvalue,hugo_gene=external_gene_name,
                     gene_id,rnorvegicus_homolog_ensembl_gene, dir_effect_most_signif), 
            by=c("gene"="rnorvegicus_homolog_ensembl_gene"),
            relationship = "many-to-many")
  
  mhuman = 17573
  mrat = nrow(tempo)
  
  cat("fisher test",ctrait,"p_acat_6 \n")
  for(thres in c(0.01,0.05,0.1))
  {
    cat('---',thres,'---\n')
    with(tempo, print(fisher.test(p_acat_6<thres/mrat,p_human<thres/mhuman)) )
    with(tempo, print(table(p_acat_6<thres/mrat,p_human<thres/mhuman)) )
  }
  
  cat("fisher test",ctrait,"p_acat_5 \n")
  for(thres in c(0.01,0.05,0.1))
  {
    cat('---',thres,'---\n')
    if(thres==0.05) cat('---\n--- UPDATE PAPER WITH THIS 0.05 ---\n---\n')
    with(tempo, print(fisher.test(p_acat_5<thres/mrat,p_human<thres/mhuman)) )
    with(tempo, print(table(p_acat_5<thres/mrat,p_human<thres/mhuman)) )
  }

  ind = tempo$p_acat_6 < 0.05/mrat 
  # ind = tempo$p_acat_6 < p_fdr05
  print(ctrait)
  qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,main=glue("human gene assoc for rat {ctrait} genes"),cex2=1.5)
  png(glue("{OUTPUT}/qqunif-compare-{ctrait}.png"),width = 640,height = 480)
  qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,col='maroon1',col2='dodgerblue',cex2=1.5)
  dev.off()
  cat("wilcoxon test")
  print(wilcox.test(tempo$p_human[!ind],tempo$p_human[ind]))
  

  ind = tempo$p_acat_5 < 0.05/(tempo %>% filter(!is.na(p_acat_5)) %>% nrow()) & !is.na(tempo$p_acat_5)
  # ind = tempo$p_acat_6 < p_fdr05
  print(ctrait)
  qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,main=glue("human gene assoc for rat {ctrait} genes"),cex2=1.5)
  png(glue("{OUTPUT}/qqunif-compare-{ctrait}-p_acat_05.png"),width = 640,height = 480)
  qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,col='maroon1',col2='dodgerblue',cex2=1.5)
  dev.off()
  cat("wilcoxon test")
  print(wilcox.test(tempo$p_human[!ind],tempo$p_human[ind]))
  
  
  
    if(F) write_csv(tempo %>% arrange(p_acat_6) %>%
              select(gene_name,p_acat_6,chr,start,p_human,hugo_gene,
                     trait,gene,gene_id,BR,AC,IL,LH,PL,VO,p_acat_5,qval),
            file = glue("{OUTPUT}/ratxcan-{ctrait}-results-qval.csv"))
  
}

enrichment of Crouse’s consensus genes

consensus_df <- readxl::read_excel(glue("{INPUT}/data/expression/Table S10 Fat_consensus_alpha_0.01 PG.xlsx"),skip=1)

dim(consensus_df)

## enrichment of consensus genes among bmi associated genes in rats

tempo = df_wider %>% filter(trait=="bmi") %>% left_join(consensus_df %>% select(ensembl_gene_id, description...3),by=c("gene"="ensembl_gene_id")) %>% select(gene,p_acat_6,description...3) %>% mutate(consensus = !is.na(description...3))

qqunif.compare(tempo$p_acat_6,tempo$p_acat_6[tempo$consensus], main="no enrich. of consensus genes in rat bmi genes")

## enrichment of consensus genes among bmi associated genes in humans

tempo = phenomexcan_results %>% filter(phenotype=="Body mass index (BMI) (21001_raw)") %>% left_join(consensus_df %>% select(ensembl_gene_id, description...3),by=c("rnorvegicus_homolog_ensembl_gene"="ensembl_gene_id")) %>% mutate(consensus = !is.na(description...3)) %>% select(gene=rnorvegicus_homolog_ensembl_gene,p_human=pvalue,consensus) 

qqunif.compare(tempo$p_human,tempo$p_human[tempo$consensus], main="no enrich. of consensus genes in human bmi genes")

Crouse et al’s consensus genes are not enriched among human nor rat BMI genes

summarize results for the paper

BF_thres = 0.05/ (df_wider %>% filter(trait==ctrait) %>% filter(!is.na(p_acat_6)) %>% nrow())
for(ctrait in traitlist)
{
cat("BF threshold ", signif(BF_thres,3),"\n")
cat("---",ctrait,"---\n")
res = df_wider %>% 
  filter(trait==ctrait) %>% 
  filter(p_acat_6 < BF_thres) %>%
  select(trait,p_acat_6,gene_name,chr,start,gene) %>%
   left_join(phenomexcan_results %>% filter(phenotype==phenolist[ctrait]) %>%              select(rnorvegicus_homolog_ensembl_gene,p_human=pvalue,hugo=gene_name) , by=c("gene"="rnorvegicus_homolog_ensembl_gene")) %>% 
  arrange(p_acat_6)
print(res)
res = res %>% select(gene_name) %>% unique() %>% nrow()
cat("---\n")
cat(res," BF significant genes\n")
}

ntests = (df_wider %>% filter(trait==ctrait) %>% filter(!is.na(p_acat_5)) %>% nrow()) 
BF_thres = 0.05/ntests
for(ctrait in traitlist)
{
cat("BF threshold ", signif(BF_thres,3),"\n")
cat("---",ctrait,"--- p_acat_5 - UPDATE PAPER WITH THIS\n")
res = df_wider %>% 
  filter(trait==ctrait) %>% 
  filter(p_acat_5 < BF_thres) %>%
  select(trait,p_acat_5,gene_name,chr,start,gene) %>%
   left_join(phenomexcan_results %>% filter(phenotype==phenolist[ctrait]) %>%              select(rnorvegicus_homolog_ensembl_gene,p_human=pvalue,hugo=gene_name) , by=c("gene"="rnorvegicus_homolog_ensembl_gene")) %>% 
  arrange(p_acat_5)
print(res)
res = res %>% select(gene_name) %>% unique() %>% nrow()
cat("---\n")
cat(res," BF significant genes  p_acat_5 - UPDATE PAPER WITH THIS\n")
}

cat(BF_thres," BF threshold for  p_acat_5 - UPDATE PAPER WITH THIS\n")
cat(ntests," number of tests  p_acat_5 - UPDATE PAPER WITH THIS\n")

gene set enrichment

ctrait = "bmi"
thres = 1e-2

top_genes_thres = df_wider %>% filter(trait == ctrait) %>% filter(p_acat_6 < thres) %>% pull(gene)
write(top_genes_thres,file="~/Downloads/top_genes_thres.txt")

top_genes_n = df_wider %>% filter(trait == ctrait) %>% arrange(p_acat_6) %>% pull(gene) %>% head(500)
write(top_genes_n,file="~/Downloads/top_genes_n.txt")

#top_genes_hugo_thres = df_wider %>% filter(trait == ctrait) %>% filter(p_acat_6 < thres) %>% inner_join(orth.rats,by=c("gene"="rnorvegicus_homolog_ensembl_gene")) %>% pull(external_gene_name)
#write(top_genes_hugo,file="~/Downloads/top_genes_hugo_thres.txt")

suppressMessages(library(gprofiler2))
gostres <- gost(query = top_genes_thres,organism="rnorvegicus")
#gostres <- gost(query = top_genes_thres,organism="hsapiens")
gostplot(gostres)

define ggplot based manhattan plotting function

from Natasha

## here
## qq_manhattan(tempo %>% rename(pvalue=p_acat_6))

gg_manhattan <- function(df, titulo="",significance_threshold = 0.05) {
  ## USAGE: gg_manhattan(df,0.05)
  ## df has columns: pvalue, chr (numeric), and start (position)
  ## significance threshold gets divided by the number of tests
  ## 
  # Calculate cumulative base pair positions
  data_cum <- df %>%
    group_by(chr) %>%
    summarise(max_bp = as.numeric(max(start)), .groups = 'drop') %>%
    mutate(bp_add = lag(cumsum(max_bp), default = 0))

  gwas_data <- df %>%
    inner_join(data_cum, by = "chr") %>%
    mutate(bp_cum = start + bp_add)

  # Calculate axis labels
  axis_set <- gwas_data %>%
    group_by(chr) %>%
    summarize(center = mean(bp_cum), .groups = 'drop')

  # Determine the ylim based on the most significant p-value
  ylim <- gwas_data %>%
    filter(pvalue == min(pvalue)) %>%
    summarise(ylim = abs(floor(log10(pvalue))) + 2) %>%
    pull(ylim)

  # Calculate the genome-wide significance level
  sig <- significance_threshold / nrow(df)

  # Construct the Manhattan plot
  manhattan_plot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(pvalue), color = as.factor(chr), size = -log10(pvalue))) +
    geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
    geom_hline(yintercept = -log10(0.0001), color = "red", linetype = "dashed") +
    geom_point(alpha = 0.75, shape = 19) + # Simplified shape decision for clarity
    geom_label_repel(aes(label = ifelse(pvalue <= sig, gene_name, "")), size = 3) +
    ylim(c(0, ylim)) +
    scale_x_continuous(labels = axis_set$chr, breaks = axis_set$center) +
    scale_color_manual(values = rep(c("dodgerblue4", "midnightblue"), length(unique(axis_set$chr)))) +
    scale_size_continuous(range = c(0.5, 3)) +
    labs(x = NULL, y = expression(-log[10](italic(p)))) +
    theme_minimal() +
    theme(legend.position = "none",
          panel.border = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          axis.text.x = element_text(angle = 90, size = 12),
          axis.text.y = element_text(size = 12, vjust = 0),
          axis.title = element_text(size = 20))

  if(titulo !="") manhattan_plot = manhattan_plot + ggtitle(titulo)
  return(manhattan_plot)
}

plot manhattan plots of bmi and body length

for(ctrait in traitlist)
{
  tempo = df_wider %>% filter(trait==ctrait) %>% mutate(chr=as.numeric(chr))
  gg = gg_manhattan(tempo %>% rename(pvalue=p_acat_6), titulo=ctrait)
  print(gg)
  ggsave(glue("{OUTPUT}/{ctrait}-manhattan-p_acat_6.png"))
}

for(ctrait in traitlist)
{
  tempo = df_wider %>% filter(trait==ctrait) %>% mutate(chr=as.numeric(chr))
  gg = gg_manhattan(tempo %>% rename(pvalue=p_acat_5) %>% filter(!is.na(pvalue)), titulo=ctrait)
  print(gg)
  ggsave(glue("{OUTPUT}/{ctrait}-manhattan-p_acat_5.png"))
}

double checking that the saved qvalues are correct

kk5 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bmi-results - ratxcan-bmi-results(1).tsv")
kk6 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bodylen-results - ratxcan-bodylen-results(1).tsv")
all.equal(qvalue(kk5$p_acat_6)$qvalues,kk5$qval)
all.equal(qvalue(kk6$p_acat_6)$qvalues,kk6$qval)
plot(qvalue(kk5$p_acat_6)$qvalues,kk5$qval); abline(0,1)
plot(qvalue(kk6$p_acat_6)$qvalues,kk6$qval); abline(0,1)

# kk1 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bmi-results - ratxcan-bmi-results.tsv")
# kk2 = read_tsv("/Users/haekyungim/Downloads/with-qval-ratxcan-bmi-results - ratxcan-bmi-results.tsv")
# kk3 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bodylen-results - ratxcan-bodylen-results.tsv")
# kk4 = read_tsv("/Users/haekyungim/Downloads/with-qval-ratxcan-bodylen-results - ratxcan-bodylen-results.tsv")

# all.equal(kk1$p_acat_6,kk2$p_acat_6)
# all.equal(kk3$p_acat_6,kk4$p_acat_6)
# 
# all.equal(qvalue(kk1$p_acat_6)$qvalues,kk2$qval)
# all.equal(qvalue(kk3$p_acat_6)$qvalues,kk4$qval)

## small difference between qvalues were calculated before joining with human data
plot(qvalue(kk1$p_acat_6)$qvalues,kk2$qval); abline(0,1)
plot(qvalue(kk3$p_acat_6)$qvalues,kk4$qval); abline(0,1)
No matching items