RatXcan analysis Figure 3 to 4

tutorial for running ratxcan given genotype, phenotype, and prediction weights as input

Author

Haky Im

Published

November 27, 2023

top

input data

genotype
phenotype
prediction weights

load libraries and functions

#options(error=recover)
#options(error=browser)
options(error=NULL)

## compare observed correlation with null correlation
suppressMessages(devtools::source_gist("a925fea01b365a8c605e")) ## load qqR fn https://gist.github.com/hakyim/a925fea01b365a8c605e
suppressMessages(devtools::source_gist("38431b74c6c0bf90c12f")) ## qqunif https://gist.github.com/hakyim/38431b74c6c0bf90c12f
suppressMessages(devtools::source_gist("115403f16bec0a0e871f3616d552ce9b")) ## source ratxcan functions https://gist.github.com/hakyim/115403f16bec0a0e871f3616d552ce9b 

suppressMessages(library(tidyverse))
suppressMessages(library(glue))
suppressMessages(library(RSQLite))
#suppressMessages(library(expm))
#suppressMessages(library(readxl))
# install.packages("devtools")
# library("devtools")
# install_github("jdstorey/qvalue")
suppressMessages(library(qvalue))
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("biomaRt")
##suppressMessages(library(biomaRt))
##install.packages("ggrepel")
suppressMessages(library(ggrepel))


traitlist = c("bodylen","bmi")
tissuelist = c("AC", "IL", "LH", "PL", "VO")
phenolist = c("Standing height","Body mass index (BMI) (21001_raw)")
names(phenolist) = c("bodylen","bmi")

download data for the tutorial

download files from here The folder structure should look like the list below

(base) MBP-HKI-22 ratxcan-tutorial $ tree -L 3
.
├── data
│  ├── expression
│   │   └── gene_annotation.RDS
│   ├── genotype
│   │   ├── DELETE
│   │   ├── rat6k_autosome.bed
│   │   ├── rat6k_autosome.bim
│   │   └── rat6k_autosome.fam
│   ├── phenomexcan
│   │   └── phenomexcan_results.RDS
│   └── phenotype
│       └── pheno.fam
├── models
│   ├── Ac_best_prediXcan_db.db
│   ├── Br-hki.db
│   ├── Il_best_prediXcan_db.db
│   ├── Lh_best_prediXcan_db.db
│   ├── Pl_best_prediXcan_db.db
│   └── Vo_best_prediXcan_db.db
└── software
    └── MetaXcan
        ├── CODE_OF_CONDUCT.md
        ├── DevNotes.Rmd
        ├── LICENSE
        ├── README.md
        ├── codemap
        ├── papers
        └── software

define data and software paths for R

WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"

PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"
GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"

INPUT <- glue("{WEBDATA}/ratxcan-tutorial") ## this has the input data 
OUTPUT <- glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/scratch") ## this has the output data, intermediate results

define data and software for the terminal

WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"

PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"
GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"
  
INPUT=$WEBDATA/ratxcan-tutorial
OUTPUT=$WEBDATA/2023-11-27-ratxcan-tutorial/scratch

set up conda environment to run predict expression

conda create -n rat311 python=3.11
conda activate rat311
conda install pandas scipy numpy statsmodels
conda install h5py
## missing modules probably due to M2 processor
## NOT NEEDED conda install bgen_reader
pip install pyliftover
pip install cyvcf2  ## pip intall worked on my macbook M2

predict expression

#WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
#PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"
#GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"
#INPUT=$WEBDATA/ratxcan-tutorial
#OUTPUT=$WEBDATA/2023-11-27-ratxcan-tutorial/scratch

## convert plink file to vcf
#$PLINK --bfile $GENO_HEADER --recode vcf --out $GENO_HEADER
#gzip $GENO_HEADER.vcf ## to keep file small

conda activate rat311

METAXCAN=$INPUT/software/MetaXcan
GENO=$INPUT/data/genotype
GENO_HEADER=$GENO/rat6k_autosome
MODEL=$INPUT/models

## predict Ac
MODLEFT="VO-filtered"
MODEL_DB="$MODLEFT.db"
python $METAXCAN/software/Predict.py \
--model_db_path $MODEL/$MODEL_DB \
--model_db_snp_key rsid \
--vcf_genotypes ${GENO_HEADER}.vcf.gz \
--vcf_mode genotyped \
--on_the_fly_mapping METADATA "{}_{}_{}_{}" \
--prediction_output $OUTPUT/${MODLEFT}__predict.txt  \
--prediction_summary_output $OUTPUT/${MODLEFT}__summary.txt \
--throw

compare with previous prediction of Ac

oldexpr <- vroom::vroom(glue("{OUTPUT}/Ac-hki-large_geno__predict.txt")) %>% 
  select(-FID) %>%  # Remove the FID column
  mutate(IID = str_split(IID, "_", simplify = TRUE)[, 1])  # Keep the first part of IID

for(tis in tissuelist)
{
  new_pred_expr <- vroom::vroom(glue("{OUTPUT}/{tis}-filtered__predict.txt")) %>% 
  select(-FID) %>%  # Remove the FID column
  mutate(IID = str_split(IID, "_", simplify = TRUE)[, 1])  # Keep the first part of IID

  kk=calc_cor_matched_cols(oldexpr,new_pred_expr)
  hist(kk$cor,main=paste(tis, "vs. old Ac-hki-large_geno") )
  mtext(paste(sum(kk$cor<0), "genes with negative cor") )

}

read phenotype and check that ids are included in genotype fam file

## DO EVAL #| eval: FALSE
## check whether ids in the phenotype file are in the genotype file
pheno = read_tsv(glue("{INPUT}/data/phenotype/pheno.fam"),col_names = FALSE)
fam = read_table(glue("{INPUT}/data/genotype/rat6k_autosome.fam"),col_names = FALSE)
if( pheno %>% filter(X1 %in% fam$X1) %>% nrow != nrow(pheno) ) message("WARNING: missing pheno ids with missing genotypes") 
names(pheno) = c("FID","IID","bodylen","bmi")

read gene annotation

gene_annotation <- readRDS(glue("{INPUT}/data/expression/gene_annotation.RDS"))

calculate GRM

## - [ ] calculate GRM from genotype
## GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64" ## defined at the top
$GCTA --bfile $GENO0/rat6k_autosome --make-grm-bin --out $OUTPUT/rat6k_autosome
#Analysis finished at 00:31:49 CST on Fri Dec 01 2023
#Overall computational time: 1 minute 15 sec

calculate h2

## 1 after --mpheno will use bodylen as phenotype
$GCTA --grm $OUTPUT/rat6k_autosome --reml --pheno $INPUT/data/phenotype/pheno.fam --mpheno 1 --out $OUTPUT/bodylen_h2
## 2 after --mpheno will use bmi as phenotype
$GCTA --grm $OUTPUT/rat6k_autosome --reml --pheno $INPUT/data/phenotype/pheno.fam --mpheno 2 --out $OUTPUT/bmi_h2

read h2

tempo = read_tsv(glue("{OUTPUT}/bodylen_h2.hsq")) %>% filter(Source=="V(G)/Vp") 
bodylen_h2 = tempo %>% pull(Variance)
bodylen_se = tempo %>% pull(SE)
tempo = read_tsv(glue("{OUTPUT}/bmi_h2.hsq")) %>% filter(Source=="V(G)/Vp") 
bmi_h2 = tempo %>% pull(Variance)
bmi_se = tempo %>% pull(SE)

read grm matrix

grm_mat <- read_GRMBin(glue("{OUTPUT}/rat6k_autosome.grm"))

read predicted expression

read_pred_expr = function(filename)
{
  ##usage: Br_pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))
  pred_expr <- vroom::vroom(filename) %>% 
  select(-FID) %>%  # Remove the FID column
  mutate(IID = str_split(IID, "_", simplify = TRUE)[, 1])  # Keep the first part of IID
  pred_expr
}

show calibration of type I error REMOVE FROM TUTORIAL

simulate genetic effect sizes for nsnp

## on the terminal generate
## $PLINK --bfile $INPUT/data/genotype/rat6k_autosome --freq --out rat6k_autosome 
## simulate unrelated Y
nsim = 100 
## to get nsip, read bim file
#tempo = read_tsv(glue("{INPUT}/data/genotype/rat6k_autosome.bim"),col_names = FALSE)

tempo = read_table(glue("{INPUT}/data/genotype/rat6k_autosome.frq"),col_names = TRUE)
tempo = tempo %>% select(SNP,A1,A2,MAF)
nsnp = nrow(df_freq)
##set.seed(29444)
semilla = 32240
##semilla = round(runif(1)*1e5)
set.seed(semilla)
scoremat = matrix(rnorm(nsnp*nsim),nsnp,nsim)
## divide by maf
scoremat = sweep(scoremat,1, sqrt(2*tempo$MAF*(1-tempo$MAF)), "/" )
tempo = cbind(tempo, scoremat)
write_tsv(tempo, file = glue("{OUTPUT}/sim/sim_weights-{semilla}.txt"), col_names = FALSE)

use plink to calculate the sim phenotype $y_ = _k X_k $

An alternative way to simulate Y with related individuals would be to simulate unrelated normal rv. for each rat, then pre-multiply by the correlation matrix $\Sigma^{0.5}$. This approach would make it more obvious that premultiplying by $\Sigma^{-0.5}$ would yield a an uncorrelated trait across rats.

# Calculate PRS using plink
# https://www.cog-genomics.org/plink/1.9/score
# --score my.scores 3 2 1
# reads variant IDs from column 3, allele codes from column 2, and scores from column 1.
# > head(tempo)
# A tibble: 6 × 6
#     X1 X2               X3      X4 X5    X6   
#  <dbl> <chr>         <dbl>   <dbl> <chr> <chr>
#1     1 1_1643610_C_T     0 1643610 T     C    
#2     1 1_1646409_T_G     0 1646409 T     G    


## BEFORE 1/30/2024 --score $OUTPUT/sim/$WEIGHTS 2 6 $((6+SIMID)) \
## new column order
#> headleft(tempo,6) tempo has df with snp info and weights N(0,1)/sqrt(2 p (1-p))
#            SNP A1 A2    MAF           1          2
#1 1_1643610_C_T  T  C 0.4521  0.01446167 -0.1048587
#2 1_1646409_T_G  T  G 0.1253  0.88769963 -0.7607204
#3 1_1658435_A_G  G  A 0.4511  2.66557665 -0.7946988

# Define the range of SIMID values
NSIM=100  # Replace with your actual value
SEMILLA=32240
WEIGHTS="sim_weights-$SEMILLA.txt"
time (
  for ((SIMID=1; SIMID<=$NSIM; SIMID++)); do
    $PLINK --bfile $INPUT/data/genotype/rat6k_autosome \
          --score $OUTPUT/sim/$WEIGHTS 1 3 $((4+SIMID)) \
          --out $OUTPUT/sim/tempo/PRS_output_$SIMID-$SEMILLA
    
    # Print SIMID every 10 iterations
    if ((SIMID % 10 == 0)); then
      echo "Processed $SIMID simulations"
    fi
  done
)

#Processed 100 simulations
#real    2m1.887s
#user    1m45.025s
#sys     0m13.337s

format simulated scores into matrix

# semilla = 20
# set.seed(semilla) 
h2 = bodylen_h2
tailo="-32240"

if(F)
{
  simy = suppressMessages(read_table(glue("{OUTPUT}/sim/tempo/PRS_output_1{tailo}.profile")))
  nsim = 100
  phenomat = matrix(NA,nrow(simy),nsim)
  rownames(phenomat) = simy$IID
  phenomat[,1] = simy$SCORE
  for(simid in 2:nsim)
  {
    simy = suppressMessages(read_table(glue("{OUTPUT}/sim/tempo/PRS_output_{simid}{tailo}.profile")))
    phenomat[,simid] = simy$SCORE
  }
  
  ## scale phenomat
  phenomat = scale(phenomat)
  ## add noise
  #nx = nrow(phenomat)
  #phenomat = sqrt(h2) * phenomat + sqrt(1 - h2) * scale(matrix(rnorm(nx*nsim),nx, nsim))
  
  simpheno = cbind(simy[,1:2], as.data.frame(phenomat))
  ## CHECK WHAT NAME TO USE
  #saveRDS(simpheno,file=glue("{OUTPUT}/sim/tempo/simpheno_{h2}-{tailo}.RDS"))
  saveRDS(simpheno,file=glue("{OUTPUT}/sim/tempo/simpheno.RDS")) ## no error component
} else 
{
  simpheno = readRDS(file=glue("{OUTPUT}/sim/tempo/simpheno.RDS")) ## no error component
  #simpheno=readRDS(file=glue("{OUTPUT}/sim/tempo/simpheno_{h2}-{tailo}.RDS"))
  nsim = ncol(simpheno) - 2 ## subtract FID, and IID columns
}

visualize raw and corrected pvalues

myplot <- function(tempres, post_titulo="",semilla="") {
  # Create a data frame with specific columns
  df <- data.frame(
    p0.01_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.01)),
    p0.01_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.01)),
    p0.05_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.05)),
    p0.05_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.05)),
    p0.10_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.10)), 
    p0.10_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.10))
    # ... [rest of your code for creating df] ...
  )

# Pivot the data frame to long format, specifying the columns to keep
df_long <- pivot_longer(df, cols = starts_with("p"))

df_long <- df_long %>% separate(name,into = c("threshold","corrected"),sep="_") %>% rename(proportion=value)

# Rename the name column to replace p0.xx with p<0.xx
df_long <- df_long %>%
  mutate(threshold = gsub("p0\\.", "p<0.", threshold))

  # Create boxplots with mean
  pp <- ggplot(df_long, aes(x = threshold, y = proportion, fill = corrected)) +
    geom_boxplot(alpha = 0.6) +
    stat_summary(fun = mean, geom = "point", shape = 3, size = 2, stroke = 2, color = "blue",                  #position = position_dodge(width = 0.8)) +
                 position = position_dodge(width = -0.1)) +
    #stat_summary(fun = mean, geom = "crossbar",  size = .5, color = "blue") +
    #stat_summary(fun = mean, geom = "crossbar",  size = .5, color = "darkgray") +
    geom_hline(yintercept = c(0.01, 0.05, 0.10), linetype = "dashed", color = "gray") +
    theme_minimal(base_size = 15) +
    #ggtitle(glue("Type I Error Calibration {semilla} {post_titulo}")) +
    xlab("significance") + ylab("false positive rate")

  pp
}
#myplot(tempres_sim,post_titulo = glue("nsam:", {nsam},"\n"),semilla)

define function lmm with GRM

## HERE WE USE THE FULL GRM MATRIX AND CALCULATE THE INVERSE OF THE SIGMA MATRIX
## define lmm association function 
lmmGRM = function(pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=6:6,out=NULL)
{
  ## input pheno is a data frame with id column pheno_id_col=1 by default
  ## phenotype values are in pheno_value_cols, 6:6 by default (SCORE column location in plink output), it can have more than one phenotype
  ## but h2 has to be the same, this is useful when running simulations with different h2
  ## call lmmXcan(pheno %>% select(IID,SCORE))
  
  ## format pheno to matrix form
  phenomat <- as.matrix(pheno[,pheno_value_cols])
  rownames(phenomat) <- pheno[[pheno_id_col]]
  
  ## turn pred_expr into matrix with rownames =IID, keep only IIDs in ymat
  exp_mat = as.matrix(pred_expr %>% select(-IID))
  rownames(exp_mat) = pred_expr$IID

  ## align pheno and expr matrices
  idlist = intersect(rownames(phenomat), rownames(exp_mat))
  
  nsam = length(idlist)
  
  ## CALCULATE SIGMA
  ID_mat = diag(rep(1,nsam))
  
  #testing_scale_grm = TRUE
  #if(testing_scale_grm) grm_mat = sweep( sweep(grm_mat,2, 1/sqrt(diag(grm_mat)),"*"), 1, 1/sqrt(diag(grm_mat)),"*")    
  
  Sigma = grm_mat[idlist,idlist] * h2 + (1 - h2) * ID_mat
  
  Sig_eigen = eigen(Sigma)
  rownames(Sig_eigen$vectors) = rownames(Sigma)
  
  isighalf = Sig_eigen$vectors %*% diag( 1 / sqrt(  Sig_eigen$values  ) ) %*% t(Sig_eigen$vectors)
  
  ## perform raw association
  cormat_raw = matrix_lm(phenomat[idlist,, drop = FALSE], exp_mat[idlist,])
  pmat_raw = cor2pval(cormat_raw,nsam)
  colnames(pmat_raw) <- gsub("cor_", "pval_", colnames(pmat_raw))
  
  ## perform corrected association
  cormat_correct = matrix_lm(isighalf%*% phenomat[idlist,, drop = FALSE], isighalf %*% exp_mat[idlist,])
  pmat_correct = cor2pval(cormat_correct,nsam)
  colnames(pmat_correct) <- gsub("cor_", "pval_", colnames(pmat_correct))
  
  if(!is.null(out))
  {
    saveRDS(cormat_correct,file = glue("{out}_cormat_correct.RDS"))
    saveRDS(pmat_correct,  file = glue("{out}_pmat_correct.RDS"))
    saveRDS(cormat_raw,    file = glue("{out}_cormat_raw.RDS"))
    saveRDS(pmat_raw,      file = glue("{out}_pmat_raw.RDS"))
  }
  res = list(
    cormat_correct=cormat_correct, 
    pmat_correct=pmat_correct, 
    cormat_raw=cormat_raw, 
    pmat_raw=pmat_raw)
  res
  
}

run ratxcan null regression

##pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))

add_noise = function(simpheno,h2)
{
  phenomat = as.matrix(simpheno %>% select(-IID,-FID))
  phenomat = scale(phenomat)
  ## add noise
  nx = nrow(phenomat)
  nsim=ncol(phenomat)
  phenomat = sqrt(h2) * phenomat + sqrt(1 - h2) * scale(matrix(rnorm(nx*nsim),nx, nsim))
  cbind(simpheno[,1:2], as.data.frame(phenomat))
}

recalculate=FALSE
for(h2 in c(0.1,0.2,0.4,0.6,0.8))
{
  cat("---",h2,"---\n")
  if(recalculate)
  { 
    tic=Sys.time()
    tempres <- lmmGRM(add_noise(simpheno,h2),
                      grm_mat, h2,pred_expr,pheno_id_col=1, 
                      pheno_value_cols=2+(1:nsim))
    toc=Sys.time()
    print(toc - tic)
    saveRDS(tempres,file=glue("{OUTPUT}/tempres-null-h2-{h2}.RDS"))
  } else tempres = readRDS(glue("{OUTPUT}/tempres-null-h2-{h2}.RDS"))
  myplot(tempres_sim)
  ggsave(glue("{OUTPUT}/calib-figure-h2-{h2}.png"),width=6,height=5)
  png(glue("{OUTPUT}/hist-p-corrected-{h2}.png"))
  hist(tempres$pmat_correct,main=glue("corrected p-values - {h2}"))
  dev.off()
  png(glue("{OUTPUT}/hist-p-raw-{h2}.png"))
  hist(tempres$pmat_raw,main=glue("raw p-values - {h2}"))
  dev.off()
  png(glue("{OUTPUT}/qqunif-compare-raw-corrected-{h2}.png"))
  qqunif.compare(tempres$pmat_raw,tempres$pmat_correct,BF=FALSE,BF2=FALSE,main="qqunif {h2}")
  dev.off()
}
##%HERE

simulate $Y = Sigma^{1/2}\epsilon$ and run assoc with expr_mat

nsam=nrow(grm_mat)
ind=1:nsam
#ind=1:1000
test_mat = grm_mat[ind,ind]
nsam=nrow(test_mat)
Sigma = test_mat * h2 + (1 - h2) * diag(rep(1,nsam))
Sig_eigen = eigen(Sigma)
rownames(Sig_eigen$vectors) = rownames(Sigma)
##sighalf = Sig_eigen$vectors %*% diag( sqrt(  Sig_eigen$values  ) ) %*% t(Sig_eigen$vectors)
## make this multiplication more efficient using sweep
sighalf = Sig_eigen$vectors %*% sweep(t(Sig_eigen$vectors),1,sqrt(  Sig_eigen$values ),"*")

for(ii in 1:10)
{
sim_sigma_pheno = sighalf %*% matrix(rnorm(nsam * nsim), nsam, nsim) 
sim_sigma_pheno=cbind(FID=rownames(sim_sigma_pheno),IID=rownames(sim_sigma_pheno),as.data.frame(sim_sigma_pheno))


tic=Sys.time()
tempres_sigma_pheno <- lmmGRM(sim_sigma_pheno,grm_mat, h2,pred_expr,pheno_id_col=1, pheno_value_cols=2+(1:nsim))
toc=Sys.time()
toc - tic
pp<-myplot(tempres_sigma_pheno,post_titulo = glue("sigma_pheno n= {nsam} - ii={ii}"))
cat(ii,"\n")
print(pp)
ggsave(glue("{OUTPUT}/calib-sim-sigma-n{nsam}-ii{ii}.png"))
}

simulate $Y = GRM^{1/2} h2 + (1 - h2) \epsilon$ and run assoc with expr_mat

nsam=nrow(grm_mat)
ind=1:nsam
#ind=1:1000
test_mat = grm_mat[ind,ind]
nsam=nrow(test_mat)

test_eigen = eigen(test_mat)
rownames(test_eigen$vectors) = rownames(test_mat)
## show smallest eigenvalues of GRM
cat(sort(test_eigen$values) %>% head, "these numbers should be non negative \n")
## force eigenvalues to be nonnegative
test_eigen$values = pmax(test_eigen$values,0)
grmhalf = test_eigen$vectors %*% sweep(t(test_eigen$vectors), 1, sqrt(test_eigen$values),"*")

for(ii in 1:10)
{
  ## simulate phenomat as rv with h2*grm as cov + indep noise term (1-h2)
sim_grm_pheno = sqrt(h2) * grmhalf %*% matrix(rnorm(nsam * nsim), nsam, nsim) + sqrt(1-h2) * matrix(rnorm(nsam * nsim), nsam, nsim)
sim_grm_pheno=cbind(FID=rownames(sim_grm_pheno),IID=rownames(sim_grm_pheno),as.data.frame(sim_grm_pheno))

tic=Sys.time()
tempres_grm_pheno <- lmmGRM(sim_grm_pheno,grm_mat, h2,pred_expr,pheno_id_col=1, pheno_value_cols=2+(1:nsim))
toc=Sys.time()
toc - tic
pp<-myplot(tempres_grm_pheno,post_titulo = glue("sigma_pheno n= {nsam} - ii={ii}"))
cat(ii,"\n")
print(pp)
ggsave(glue("{OUTPUT}/calib-sim-grm-n{nsam}-ii{ii}.png"))
}

run bodylen regression with BR expr lmmGRM

pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))

recalculate = FALSE

trait = "bodylen"
h2 = bodylen_h2
h2se = bodylen_h2+bodylen_se
##pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=6:6,out=NULL
if(recalculate)
{
  tic=Sys.time()
  tempres_h2 <- lmmGRM(pheno,grm_mat, h2, pred_expr,pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
  toc=Sys.time()
  toc - tic
  saveRDS(tempres_h2,glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))
} else
tempres_h2 <- readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))

png(glue("{OUTPUT}/{trait}-BR-lmmGRM.png"))
qqunif.compare(tempres_h2$pmat_raw,tempres_h2$pmat_correct,main=glue("ratxcan {trait}") )
dev.off()

if(recalculate)
{
tic=Sys.time()
tempres_h2se <- lmmGRM(pheno,grm_mat, h2se,pred_expr,pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
toc=Sys.time()
toc - tic
saveRDS(tempres_h2se,glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))
} else
tempres_h2se <- readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))

run bmi regression with lmmGRM

recalculate = FALSE

h2 <- bmi_h2
h2se = bmi_h2 + bmi_se
trait = "bmi"

pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))

if(recalculate)
{
tic=Sys.time()
h2 = bmi_h2 + bmi_se
tempres_h2 <- lmmGRM(pheno,grm_mat, h2, pred_expr, pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
toc=Sys.time()
toc - tic
saveRDS(tempres_h2,glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))
} else tempres_h2 = readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2.RDS"))

png(glue("{OUTPUT}/{trait}-BR-lmmGRM.png"))
qqunif.compare(tempres_h2$pmat_correct,tempres_h2$pmat_raw,main=glue("ratxcan {trait}") )
dev.off()

if(recalculate)
{
tic=Sys.time()
tempres_h2se <- lmmGRM(pheno,grm_mat, h2se,pred_expr,pheno_id_col=1, pheno_value_cols=which(colnames(pheno)==trait))
toc=Sys.time()
toc - tic
saveRDS(tempres_h2se,glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))
} else tempres_h2se = readRDS(glue("{OUTPUT}/{trait}-BR-tempres_h2se.RDS"))
  
png(glue("{OUTPUT}/{trait}-BR-lmmGRM_h2_plus_se.png"))
qqunif.compare(tempres_h2se$pmat_correct,tempres_h2se$pmat_raw,main=glue("ratxcan {trait}") )
dev.off()

bodylen and bmi vs 5 brain region expr association

recalculate = FALSE
#tissuelist = c("AC", "IL", "LH", "PL", "VO")

for(trait in c("bodylen","bmi"))
{
  if(trait == "bmi") h2 = bmi_h2+bmi_se else
  if(trait == "bodylen") h2 = bodylen_h2 + bodylen_se else stop("unknown trait")
  
  print(trait)
  for(tis in tissuelist)
  {
    print(tis)
    if(recalculate)
    {
      pred_expr = read_pred_expr(glue("{OUTPUT}/{tis}-filtered__predict.txt"))
      
      ## run lmmGRM
      tic=Sys.time()
      tempres_h2se <- 
        lmmGRM(pheno,grm_mat,
               h2,
               pred_expr,
               pheno_id_col=1,
               pheno_value_cols=which(colnames(pheno)==trait) )
      toc=Sys.time()
      print(toc - tic)
      
      ## save results
      saveRDS(tempres_h2se,glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS"))
      
    } else
      tempres_h2se = readRDS(glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS"))
    
    png(glue("{OUTPUT}/{trait}-{tis}-lmmGRM_h2_plus_se.png"))
    qqunif.compare(tempres_h2se$pmat_raw,tempres_h2se$pmat_correct,main=glue("ratxcan {trait} {tis}") )
    dev.off()
  }

}

download rat gene annotation and human phenomexcan

#ensembl = biomaRt::useMart("ensembl", dataset = "rnorvegicus_gene_ensembl")
#annotation = biomaRt::getBM(attributes = c('ensembl_gene_id','external_gene_name', 'start_position', 'end_position', 'chromosome_name'),mart = ensembl)
new_ensembl_query=FALSE
if(new_ensembl_query)
{
  human = biomaRt::useEnsembl(biomart='ensembl', dataset="hsapiens_gene_ensembl", mirror = "useast")
#attributes <- listAttributes(human)
attributes = c("ensembl_gene_id", "external_gene_name", "rnorvegicus_homolog_ensembl_gene", "rnorvegicus_homolog_associated_gene_name")
orth.rats = biomaRt::getBM(attributes, filters="with_rnorvegicus_homolog",values=TRUE, mart = human, uniqueRows=TRUE)
saveRDS(orth.rats,file=glue("{INPUT}/data/expression/orth.rats.RDS"))
} else
  orth.rats = readRDS(file=glue("{INPUT}/data/expression/orth.rats.RDS"))

phenomexcan_results = readRDS(glue("{INPUT}/data/phenomexcan/phenomexcan_results_signs.RDS"))

create ratxcan association table to save as csv files

generate_results_table = function(tempres,out=NULL,phenoname)
{
  ## phenoname: "Body mass index (BMI) (21001_raw)" or "Standing height"
  pmat = tempres$pmat_correct
  df = data.frame(gene=rownames(pmat), p_correc=pmat, cor_correct=tempres$cormat_correct, p_raw=tempres$pmat_raw)
  names(df) = c("gene","p_correct","cor_correct","p_raw")
  rownames(df) = NULL
  
  ## annotate
  df = df %>% 
    left_join(gene_annotation %>% 
                select(genename=gene_name,
                       gene_id,
                       chr,
                       start), by=c("gene"="gene_id")) %>%
    arrange(p_correct)

  ## write csv file
  if(!is.null(out)) write_csv(df,file = out)
  
  df
}

df_all = data.frame()

tis="BR"
trait="bodylen"
#tissuelist = c("AC", "IL", "LH", "PL", "VO")

for(tis in c("BR",tissuelist))
{
  RDSpath = glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS")
  tempres = readRDS(RDSpath)
  df = generate_results_table(tempres, #out=glue("{OUTPUT}/{trait}-results.csv"),
                                    phenoname="Standing height")
  df$trait=trait
  df$tissue = tis
  df_all = rbind(df_all,df)
}

trait="bmi"
for(tis in c("BR",tissuelist))
{
  RDSpath = glue("{OUTPUT}/{trait}-{tis}-tempres_h2se.RDS")
  tempres = readRDS(RDSpath)
  df = generate_results_table(tempres, 
                                ##out=glue("{OUTPUT}/{trait}-results.csv"),
                                phenoname="Body mass index (BMI) (21001_raw)")
  df$trait=trait
  df$tissue = tis
  df_all = rbind(df_all,df)
}

#saveRDS(df_all,file = glue("{OUTPUT}/all-assoc-results.RDS"))

## pivot wider
df_wider <- df_all %>% select(gene,trait,tissue,p_correct)  %>% pivot_wider(
    names_from = tissue, 
    values_from = p_correct,
    id_cols = c(gene, trait)
  )


acat = function(pvec) 
{
  pvec = pvec[!is.na(pvec)]
  TT = sum( tan( (0.5 - pvec) *pi ) )
  .5 - atan(TT / length(pvec)) / pi
}

df_wider$p_acat_5 = apply(df_wider %>% select(all_of(tissuelist)) ,1,function(x) acat(x) ) 
 
df_wider$p_acat_6 = apply(df_wider %>% select(all_of(c("BR",tissuelist)),) ,1,function(x) acat(x) ) 

df_wider = df_wider %>% left_join(gene_annotation %>% select(gene_id,gene_name,chr,start),by=c("gene"="gene_id"))

collect signs

## pivot wider
df_wider_cor <- df_all %>% select(gene,trait,tissue,cor_correct)  %>% pivot_wider(
    names_from = tissue, 
    values_from = cor_correct,
    id_cols = c(gene, trait)
  )

df_wider_cor$max_value_original_sign = apply(df_wider_cor[, -c(1, 2)], 1, function(x) {
    idx_max_magnitude = which.max(abs(x))
    return(x[idx_max_magnitude])
})

df_wider_cor$best_sign = sign(df_wider_cor$max_value_original_sign)

for(ctrait in traitlist)
{tempo = df_wider_cor %>% 
  filter(trait == ctrait) %>% 
  inner_join(df_wider %>% 
               select(gene,trait,p_acat_6), by=c("gene"="gene","trait"="trait"))  %>%
  left_join(phenomexcan_results %>%
              filter(phenotype==phenolist[ctrait]) %>%
              select(p_human=pvalue,
                     hugo_gene=external_gene_name,
                     gene_id,rnorvegicus_homolog_ensembl_gene,
                     dir_effect_most_signif),
            by=c("gene"="rnorvegicus_homolog_ensembl_gene"),relationship = "many-to-many")

ind=tempo$p_acat_6 < 1e-4;
cat("---",ctrait,"---\n")
print(fisher.test(tempo$best_sign[ind], tempo$dir_effect_most_signif[ind]))
}

no concordance of signs

df_wider$qval = NA

## calculate qvalues
for(ctrait in traitlist)
{
  ind = df_wider$trait == ctrait
  qq = qvalue(df_wider$p_acat_6[ind])
  df_wider$qval[ind] = qq$qvalues
}

enrichment of human gene analysis

## annotate with human genes

## add human pvalues
#ctrait = "bmi"
for(ctrait in traitlist)
{
  tempo = df_wider %>% filter(trait == ctrait) %>%
  left_join(phenomexcan_results %>% 
              filter(phenotype==phenolist[ctrait]) %>%
              select(p_human=pvalue,hugo_gene=external_gene_name,
                     gene_id,rnorvegicus_homolog_ensembl_gene, dir_effect_most_signif), 
            by=c("gene"="rnorvegicus_homolog_ensembl_gene"),
            relationship = "many-to-many")
  
  mhuman = 17573
  mrat = nrow(tempo)
  
  cat("fisher test",ctrait,"p_acat_6 \n")
  for(thres in c(0.01,0.05,0.1))
  {
    cat('---',thres,'---\n')
    with(tempo, print(fisher.test(p_acat_6<thres/mrat,p_human<thres/mhuman)) )
    with(tempo, print(table(p_acat_6<thres/mrat,p_human<thres/mhuman)) )
  }
  
  cat("fisher test",ctrait,"p_acat_5 \n")
  for(thres in c(0.01,0.05,0.1))
  {
    cat('---',thres,'---\n')
    if(thres==0.05) cat('---\n--- UPDATE PAPER WITH THIS 0.05 ---\n---\n')
    with(tempo, print(fisher.test(p_acat_5<thres/mrat,p_human<thres/mhuman)) )
    with(tempo, print(table(p_acat_5<thres/mrat,p_human<thres/mhuman)) )
  }

  ind = tempo$p_acat_6 < 0.05/mrat 
  # ind = tempo$p_acat_6 < p_fdr05
  print(ctrait)
  qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,main=glue("human gene assoc for rat {ctrait} genes"),cex2=1.5)
  png(glue("{OUTPUT}/qqunif-compare-{ctrait}.png"),width = 640,height = 480)
  qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,col='maroon1',col2='dodgerblue',cex2=1.5)
  dev.off()
  cat("wilcoxon test")
  print(wilcox.test(tempo$p_human[!ind],tempo$p_human[ind]))
  

  ind = tempo$p_acat_5 < 0.05/(tempo %>% filter(!is.na(p_acat_5)) %>% nrow()) & !is.na(tempo$p_acat_5)
  # ind = tempo$p_acat_6 < p_fdr05
  print(ctrait)
  qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,main=glue("human gene assoc for rat {ctrait} genes"),cex2=1.5)
  png(glue("{OUTPUT}/qqunif-compare-{ctrait}-p_acat_05.png"),width = 640,height = 480)
  qqunif.compare(tempo$p_human,tempo$p_human[ind],BH=FALSE,BF2=FALSE,col='maroon1',col2='dodgerblue',cex2=1.5)
  dev.off()
  cat("wilcoxon test")
  print(wilcox.test(tempo$p_human[!ind],tempo$p_human[ind]))
  
  
  
    if(F) write_csv(tempo %>% arrange(p_acat_6) %>%
              select(gene_name,p_acat_6,chr,start,p_human,hugo_gene,
                     trait,gene,gene_id,BR,AC,IL,LH,PL,VO,p_acat_5,qval),
            file = glue("{OUTPUT}/ratxcan-{ctrait}-results-qval.csv"))
  
}

enrichment of Crouse’s consensus genes

consensus_df <- readxl::read_excel(glue("{INPUT}/data/expression/Table S10 Fat_consensus_alpha_0.01 PG.xlsx"),skip=1)

dim(consensus_df)

## enrichment of consensus genes among bmi associated genes in rats

tempo = df_wider %>% filter(trait=="bmi") %>% left_join(consensus_df %>% select(ensembl_gene_id, description...3),by=c("gene"="ensembl_gene_id")) %>% select(gene,p_acat_6,description...3) %>% mutate(consensus = !is.na(description...3))

qqunif.compare(tempo$p_acat_6,tempo$p_acat_6[tempo$consensus], main="no enrich. of consensus genes in rat bmi genes")

## enrichment of consensus genes among bmi associated genes in humans

tempo = phenomexcan_results %>% filter(phenotype=="Body mass index (BMI) (21001_raw)") %>% left_join(consensus_df %>% select(ensembl_gene_id, description...3),by=c("rnorvegicus_homolog_ensembl_gene"="ensembl_gene_id")) %>% mutate(consensus = !is.na(description...3)) %>% select(gene=rnorvegicus_homolog_ensembl_gene,p_human=pvalue,consensus) 

qqunif.compare(tempo$p_human,tempo$p_human[tempo$consensus], main="no enrich. of consensus genes in human bmi genes")

Crouse et al’s consensus genes are not enriched among human nor rat BMI genes

summarize results for the paper

BF_thres = 0.05/ (df_wider %>% filter(trait==ctrait) %>% filter(!is.na(p_acat_6)) %>% nrow())
for(ctrait in traitlist)
{
cat("BF threshold ", signif(BF_thres,3),"\n")
cat("---",ctrait,"---\n")
res = df_wider %>% 
  filter(trait==ctrait) %>% 
  filter(p_acat_6 < BF_thres) %>%
  select(trait,p_acat_6,gene_name,chr,start,gene) %>%
   left_join(phenomexcan_results %>% filter(phenotype==phenolist[ctrait]) %>%              select(rnorvegicus_homolog_ensembl_gene,p_human=pvalue,hugo=gene_name) , by=c("gene"="rnorvegicus_homolog_ensembl_gene")) %>% 
  arrange(p_acat_6)
print(res)
res = res %>% select(gene_name) %>% unique() %>% nrow()
cat("---\n")
cat(res," BF significant genes\n")
}

ntests = (df_wider %>% filter(trait==ctrait) %>% filter(!is.na(p_acat_5)) %>% nrow()) 
BF_thres = 0.05/ntests
for(ctrait in traitlist)
{
cat("BF threshold ", signif(BF_thres,3),"\n")
cat("---",ctrait,"--- p_acat_5 - UPDATE PAPER WITH THIS\n")
res = df_wider %>% 
  filter(trait==ctrait) %>% 
  filter(p_acat_5 < BF_thres) %>%
  select(trait,p_acat_5,gene_name,chr,start,gene) %>%
   left_join(phenomexcan_results %>% filter(phenotype==phenolist[ctrait]) %>%              select(rnorvegicus_homolog_ensembl_gene,p_human=pvalue,hugo=gene_name) , by=c("gene"="rnorvegicus_homolog_ensembl_gene")) %>% 
  arrange(p_acat_5)
print(res)
res = res %>% select(gene_name) %>% unique() %>% nrow()
cat("---\n")
cat(res," BF significant genes  p_acat_5 - UPDATE PAPER WITH THIS\n")
}

cat(BF_thres," BF threshold for  p_acat_5 - UPDATE PAPER WITH THIS\n")
cat(ntests," number of tests  p_acat_5 - UPDATE PAPER WITH THIS\n")

gene set enrichment

ctrait = "bmi"
thres = 1e-2

top_genes_thres = df_wider %>% filter(trait == ctrait) %>% filter(p_acat_6 < thres) %>% pull(gene)
write(top_genes_thres,file="~/Downloads/top_genes_thres.txt")

top_genes_n = df_wider %>% filter(trait == ctrait) %>% arrange(p_acat_6) %>% pull(gene) %>% head(500)
write(top_genes_n,file="~/Downloads/top_genes_n.txt")

#top_genes_hugo_thres = df_wider %>% filter(trait == ctrait) %>% filter(p_acat_6 < thres) %>% inner_join(orth.rats,by=c("gene"="rnorvegicus_homolog_ensembl_gene")) %>% pull(external_gene_name)
#write(top_genes_hugo,file="~/Downloads/top_genes_hugo_thres.txt")

suppressMessages(library(gprofiler2))
gostres <- gost(query = top_genes_thres,organism="rnorvegicus")
#gostres <- gost(query = top_genes_thres,organism="hsapiens")
gostplot(gostres)

define ggplot based manhattan plotting function

from Natasha

## here
## qq_manhattan(tempo %>% rename(pvalue=p_acat_6))

gg_manhattan <- function(df, titulo="",significance_threshold = 0.05) {
  ## USAGE: gg_manhattan(df,0.05)
  ## df has columns: pvalue, chr (numeric), and start (position)
  ## significance threshold gets divided by the number of tests
  ## 
  # Calculate cumulative base pair positions
  data_cum <- df %>%
    group_by(chr) %>%
    summarise(max_bp = as.numeric(max(start)), .groups = 'drop') %>%
    mutate(bp_add = lag(cumsum(max_bp), default = 0))

  gwas_data <- df %>%
    inner_join(data_cum, by = "chr") %>%
    mutate(bp_cum = start + bp_add)

  # Calculate axis labels
  axis_set <- gwas_data %>%
    group_by(chr) %>%
    summarize(center = mean(bp_cum), .groups = 'drop')

  # Determine the ylim based on the most significant p-value
  ylim <- gwas_data %>%
    filter(pvalue == min(pvalue)) %>%
    summarise(ylim = abs(floor(log10(pvalue))) + 2) %>%
    pull(ylim)

  # Calculate the genome-wide significance level
  sig <- significance_threshold / nrow(df)

  # Construct the Manhattan plot
  manhattan_plot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(pvalue), color = as.factor(chr), size = -log10(pvalue))) +
    geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
    geom_hline(yintercept = -log10(0.0001), color = "red", linetype = "dashed") +
    geom_point(alpha = 0.75, shape = 19) + # Simplified shape decision for clarity
    geom_label_repel(aes(label = ifelse(pvalue <= sig, gene_name, "")), size = 3) +
    ylim(c(0, ylim)) +
    scale_x_continuous(labels = axis_set$chr, breaks = axis_set$center) +
    scale_color_manual(values = rep(c("dodgerblue4", "midnightblue"), length(unique(axis_set$chr)))) +
    scale_size_continuous(range = c(0.5, 3)) +
    labs(x = NULL, y = expression(-log[10](italic(p)))) +
    theme_minimal() +
    theme(legend.position = "none",
          panel.border = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          axis.text.x = element_text(angle = 90, size = 12),
          axis.text.y = element_text(size = 12, vjust = 0),
          axis.title = element_text(size = 20))

  if(titulo !="") manhattan_plot = manhattan_plot + ggtitle(titulo)
  return(manhattan_plot)
}

plot manhattan plots of bmi and body length

for(ctrait in traitlist)
{
  tempo = df_wider %>% filter(trait==ctrait) %>% mutate(chr=as.numeric(chr))
  gg = gg_manhattan(tempo %>% rename(pvalue=p_acat_6), titulo=ctrait)
  print(gg)
  ggsave(glue("{OUTPUT}/{ctrait}-manhattan-p_acat_6.png"))
}

for(ctrait in traitlist)
{
  tempo = df_wider %>% filter(trait==ctrait) %>% mutate(chr=as.numeric(chr))
  gg = gg_manhattan(tempo %>% rename(pvalue=p_acat_5) %>% filter(!is.na(pvalue)), titulo=ctrait)
  print(gg)
  ggsave(glue("{OUTPUT}/{ctrait}-manhattan-p_acat_5.png"))
}

double checking that the saved qvalues are correct

kk5 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bmi-results - ratxcan-bmi-results(1).tsv")
kk6 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bodylen-results - ratxcan-bodylen-results(1).tsv")
all.equal(qvalue(kk5$p_acat_6)$qvalues,kk5$qval)
all.equal(qvalue(kk6$p_acat_6)$qvalues,kk6$qval)
plot(qvalue(kk5$p_acat_6)$qvalues,kk5$qval); abline(0,1)
plot(qvalue(kk6$p_acat_6)$qvalues,kk6$qval); abline(0,1)

# kk1 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bmi-results - ratxcan-bmi-results.tsv")
# kk2 = read_tsv("/Users/haekyungim/Downloads/with-qval-ratxcan-bmi-results - ratxcan-bmi-results.tsv")
# kk3 = read_tsv("/Users/haekyungim/Downloads/ratxcan-bodylen-results - ratxcan-bodylen-results.tsv")
# kk4 = read_tsv("/Users/haekyungim/Downloads/with-qval-ratxcan-bodylen-results - ratxcan-bodylen-results.tsv")

# all.equal(kk1$p_acat_6,kk2$p_acat_6)
# all.equal(kk3$p_acat_6,kk4$p_acat_6)
# 
# all.equal(qvalue(kk1$p_acat_6)$qvalues,kk2$qval)
# all.equal(qvalue(kk3$p_acat_6)$qvalues,kk4$qval)

## small difference between qvalues were calculated before joining with human data
plot(qvalue(kk1$p_acat_6)$qvalues,kk2$qval); abline(0,1)
plot(qvalue(kk3$p_acat_6)$qvalues,kk4$qval); abline(0,1)

RatXcan analysis Figure 3 to 4

top

input data

load libraries and functions

download data for the tutorial

define data and software paths for R

define data and software for the terminal

set up conda environment to run predict expression

predict expression

compare with previous prediction of Ac

read phenotype and check that ids are included in genotype fam file

read gene annotation

calculate GRM

calculate h2

read h2

read grm matrix

read predicted expression

show calibration of type I error REMOVE FROM TUTORIAL

simulate genetic effect sizes for nsnp

use plink to calculate the sim phenotype $y_ = _k X_k $

format simulated scores into matrix

visualize raw and corrected pvalues

define function lmm with GRM

run ratxcan null regression

simulate \(Y = Sigma^{1/2}\epsilon\) and run assoc with expr_mat

simulate \(Y = GRM^{1/2} h2 + (1 - h2) \epsilon\) and run assoc with expr_mat

run bodylen regression with BR expr lmmGRM

run bmi regression with lmmGRM

bodylen and bmi vs 5 brain region expr association

download rat gene annotation and human phenomexcan

create ratxcan association table to save as csv files

collect signs

enrichment of human gene analysis

enrichment of Crouse’s consensus genes

summarize results for the paper

gene set enrichment

define ggplot based manhattan plotting function

plot manhattan plots of bmi and body length

double checking that the saved qvalues are correct