Load packages and functions

# Installed the following packages by runing:
# > conda env create -f env.yml
# > conda activate HMGCR-Causality

suppressMessages(library(dplyr))
suppressMessages(library(vroom))
suppressMessages(library(rtracklayer))
suppressMessages(library(reticulate))
suppressMessages(library(ggplot2))
suppressMessages(library(reshape2))
suppressMessages(library(curl))
suppressMessages(library(MendelianRandomization))
suppressMessages(library(coloc))
suppressMessages(library(susieR))

# devtools::install_github("ash-res/prop-coloc")
suppressMessages(library(prop.coloc))
# install.packages("colocPropTest", dependencies = T)
suppressMessages(library(colocPropTest))
# devtools::install_github("kassambara/easyGgplot2")
suppressMessages(library(easyGgplot2))

# devtools::install_github("lapsumchan/MVMR-cML-SuSiE")
suppressMessages(library(MVMRcMLSuSiE))
# devtools::install_github("xue-hr/MRcML")
suppressMessages(library(MRcML))
# install.packages("conflicted")
suppressMessages(library(conflicted))

source("PCA-LIML-function.R")
source("summary_mvMR_BF.R")
source("summary_mvMR_SSS.R")
source("utils.R")

suppressMessages(conflict_prefer("mr_cML", "MRcML"))
suppressMessages(conflicts_prefer(base::setdiff))
suppressMessages(conflicts_prefer(base::intersect))
suppressMessages(conflicts_prefer(combinat::combn))
suppressMessages(conflicts_prefer(dplyr::filter))

Download data

# Download LD data using [AWS](https://registry.opendata.aws/ukbb-ld/) by running the following command and save the downloaded data in the data/UKBB_LD/ directory.
# > aws s3 cp s3://broad-alkesgroup-ukbb-ld/ . --recursive --no-sign-request --exclude "*" --include "UKBB_LD/chr5_74000001_77000001*"
filepath_ld_mat <- "data/UKBB_LD/chr5_74000001_77000001.npz"
filepath_ld_meta <- "data/UKBB_LD/chr5_74000001_77000001.gz"

# Go to https://diagram-consortium.org/downloads.html
# Download Ancestry specific GWAS meta-analysis summary statistics: European
# Save the downloaded data in data/T2D/ directory.
# This was published in Mahajan et al (2022)
filepath_T2D <- "data/T2D/DIAMANTE-EUR.sumstat.txt"

Load summary data

gene_of_interest <- "HMGCR"
window_size <- 10000

filepath_gwas_processed <- paste0("data/RData/", gene_of_interest, "_window_", window_size, "_gwas_processed.RDS")

if (!file.exists(filepath_gwas_processed)) {
  lst_data_full <- list()

  lst_data_full[["T2D"]] <- load_T2D(filepath_T2D, gene_of_interest, window_size)
  lst_data_full[["BMI"]] <- load_BMI(gene_of_interest, window_size)

  lst_pheno_GWAS <- c("Acute Insulin response", "Fasting Insulin", "Fasting Glucose", "LDL-C", "HDL-C", "Triglyceride", "Leptin", "Sterol", "Cortisol", "Estradiol", "Vitamin D", "Bile acid", "Aldosterone", "Ubiquinone", "CAD", "Testosterone")
  for (pheno in lst_pheno_GWAS) {
    lst_data_full[[pheno]] <- load_GWAS(pheno, gene_of_interest, window_size)
  }

  check_dir("data/RData/")
  saveRDS(lst_data_full, file = filepath_gwas_processed)
} else {
  lst_data_full <- readRDS(filepath_gwas_processed)
}
sapply(lst_data_full, nrow)
##                    T2D                    BMI Acute Insulin response 
##                    134                    275                     98 
##        Fasting Insulin        Fasting Glucose                  LDL-C 
##                     30                     30                    661 
##                  HDL-C           Triglyceride                 Leptin 
##                    660                    664                      5 
##                 Sterol               Cortisol              Estradiol 
##                     11                    185                     94 
##              Vitamin D              Bile acid            Aldosterone 
##                     78                     11                     84 
##             Ubiquinone                    CAD           Testosterone 
##                    178                    137                     95

Run Proportional colocalization analysis for all possible pairs

input_risk_factors1 <- as.data.frame(t(combn(x = names(lst_data_full), m = 2, simplify = T)))
dir_results <- file.path("results", paste0("window_", window_size), gene_of_interest)

for (idx in 1:nrow(input_risk_factors1)) {
  names_risk_factor <- sort(as.character(input_risk_factors1[idx, ]))

  lst_data <- list(
    risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
    outcome = NULL
  )
  res <- harmonize(
    gene_of_interest, window_size,
    lst_data,
    filepath_ld_mat = filepath_ld_mat,
    filepath_ld_meta = filepath_ld_meta,
    dir_output = dir_results
  )

  # Run Proportional colocalization analysis
  run_colocProp(res, dir_results)
}

Identify phenotypic heterogeneity

df_propcoloc <- get_propcoloc_res(dir_results, list_factors = names(lst_data_full))
list_pairs <- df_propcoloc$filtered$group
list_of_outcomes <- c("CAD", "T2D")
list_factors <- setdiff(sort(unique(unlist(strsplit(list_pairs, "_")))), list_of_outcomes)

for (idx1 in 1:length(list_factors)) {
  for (idx2 in 1:idx1) {
    if (idx1 == idx2) next
    RF1 <- list_factors[idx1]
    RF2 <- list_factors[idx2]
    names_risk_factor <- sort(c(RF1, RF2))

    lst_data <- list(
      risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
      outcome = NULL
    )
    res <- harmonize(
      gene_of_interest, window_size,
      lst_data,
      filepath_ld_mat = filepath_ld_mat,
      filepath_ld_meta = filepath_ld_meta,
      dir_output = dir_results
    )
    run_coloc(res, dir_results)
    run_susie(res, dir_results)
    run_propcoloc_Wallace(res, dir_results)
  }
}
pairs_phenotypic_heterogeneity <- select_phenotypic_heterogeneity(dir_results, list_factors)

Run MVMR for the phenotypic heterogeneity pairs

for (name_outcome in list_of_outcomes) {
  for (idx in 1:length(pairs_phenotypic_heterogeneity)) {
    names_risk_factor <- sort(unlist(strsplit(pairs_phenotypic_heterogeneity[idx], "_")))

    lst_data <- list(
      risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
      outcome = sapply(name_outcome, function(x) lst_data_full[[x]], simplify = F)
    )
    res <- harmonize(
      gene_of_interest, window_size,
      lst_data,
      filepath_ld_mat = filepath_ld_mat,
      filepath_ld_meta = filepath_ld_meta,
      dir_output = dir_results
    )

    run_PCA_liml(res, dir_results)
  }
}

Plot

dir_figs <- file.path(dir_results, "figures")
check_dir(dir_figs)
plot_propcoloc_and_susie_barplots_pairwise(
  gene_of_interest,
  list_factors,
  dir_results,
  fig_name = file.path(dir_figs, "Figure2_heatmap_of_colocalization_results_from_propcoloc_and_susie.pdf")
)
## Using GROUP, RF1, RF2 as id variables
## Using GROUP, RF1, RF2 as id variables
## Using GROUP, RF1, RF2 as id variables
## Warning in geom_bar(stat = stat, position = position, width = pms$width, :
## Ignoring unknown parameters: `binwidth`

plot_propcoloc_and_wallace_barplots_pairwise(
  gene_of_interest,
  list_factors,
  dir_results,
  fig_name = file.path(dir_figs, "SupFigure1_heatmap_of_colocalization_results_from_propcoloc_and_colocPropTest.pdf")
)
## Using GROUP, RF1, RF2 as id variables
## Using GROUP, RF1, RF2 as id variables
## Warning in geom_bar(stat = stat, position = position, width = pms$width, :
## Ignoring unknown parameters: `binwidth`

forestplot_OR(
  list_of_outcomes,
  dir_results,
  fig_name = file.path(dir_figs, "Figure3_MVMR.pdf")
)

Table 1

lst1 <- c("LDL-C", "BMI")
lst2 <- c("CAD", "T2D")
for (RF1 in lst1) {
  for (RF2 in lst2) {
    names_risk_factor <- sort(c(RF1, RF2))

    lst_data <- list(
      risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
      outcome = NULL
    )
    res <- harmonize(
      gene_of_interest, window_size,
      lst_data,
      filepath_ld_mat = filepath_ld_mat,
      filepath_ld_meta = filepath_ld_meta,
      dir_output = dir_results
    )
    run_colocProp(res, dir_results)
    run_coloc(res, dir_results)
    run_susie(res, dir_results)
    run_propcoloc_Wallace(res, dir_results)
  }
}
lst <- sort(c(lst1, lst2))
df_propcoloc <- get_propcoloc_res(dir_results, lst)$full %>% select("group", "p_cond", "LM_cond")
df_susie <- get_susie_res(dir_results, lst)$full %>% select("group", "H3", "H4")
df_coloc <- get_coloc_res(dir_results, lst)$full %>% select("group", "H3", "H4")
df_wallace <- get_colocPropTest_res(dir_results, lst)$full %>% select("group", "min_fdr")

merged <- Reduce(function(x, y) merge(x, y, by = "group", all = T), list(df_susie, df_coloc, df_propcoloc, df_wallace))
rownames(merged) <- merged$group
print(merged[c("CAD_LDL-C", "LDL-C_T2D", "BMI_CAD", "BMI_T2D"), ])
##               group         H3.x      H4.x         H3.y      H4.y p_cond
## CAD_LDL-C CAD_LDL-C 0.0003498526 0.9996447 0.0002420084 0.9997193   0.44
## LDL-C_T2D LDL-C_T2D 0.9182634185 0.0775393 0.8665002870 0.1202444   0.59
## BMI_CAD     BMI_CAD 0.1814497277 0.8181700 0.0875704716 0.9096733   0.04
## BMI_T2D     BMI_T2D           NA        NA 0.0289479815 0.9694342   0.97
##                LM_cond    min_fdr
## CAD_LDL-C 1.842154e-05 1.00000000
## LDL-C_T2D 1.320231e-03 0.67997497
## BMI_CAD   3.341441e-05 0.01049868
## BMI_T2D   1.921592e-19 1.00000000

Table 2

for (name_outcome in list_of_outcomes) {
  names_risk_factor <- list_factors

  lst_data <- list(
    risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
    outcome = sapply(name_outcome, function(x) lst_data_full[[x]], simplify = F)
  )
  res <- harmonize(
    gene_of_interest, window_size,
    lst_data,
    filepath_ld_mat = filepath_ld_mat,
    filepath_ld_meta = filepath_ld_meta,
    dir_output = dir_results
  )

  print(run_BMA(res, dir_results))
}
## NULL
## NULL

Supplementary Table3

names_risk_factor <- sort(c("BMI", "LDL-C"))
n_PCs_thres <- df_Table3 <- list()
for (name_outcome in list_of_outcomes) {
  n_PCs_thres[[name_outcome]] <- list()

  lst_data <- list(
    risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
    outcome = sapply(name_outcome, function(x) lst_data_full[[x]], simplify = F)
  )
  res <- harmonize(
    gene_of_interest, window_size,
    lst_data,
    filepath_ld_mat = filepath_ld_mat,
    filepath_ld_meta = filepath_ld_meta,
    dir_output = dir_results
  )

  n_PCs_thres[[name_outcome]] <- list()
  for (thres in c(0.99, 0.999, 0.9999)) {
    n_PCs_thres[[name_outcome]]$pca.no <- rbind(n_PCs_thres[[name_outcome]]$pca.no, data.frame(thres, n_PCs = pca.no(res, thres)))
  }
  n_PCs_range <- with(n_PCs_thres[[name_outcome]]$pca.no, min(n_PCs):max(n_PCs))
  for (n_PCs in n_PCs_range) {
    out_PCA_liml <- run_PCA_liml(res, dir_results, n_PCs)
    n_PCs_thres[[name_outcome]]$tab <- rbind(n_PCs_thres[[name_outcome]]$tab, out_PCA_liml)
  }
  print(n_PCs_thres[[name_outcome]]$tab)
}
## NULL
## NULL

Supplementary Table4

names_risk_factor <- sort(c("LDL-C", "T2D"))

lst_data <- list(
  risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
  outcome = NULL
)
res <- harmonize(
  gene_of_interest, window_size,
  lst_data,
  filepath_ld_mat = filepath_ld_mat,
  filepath_ld_meta = filepath_ld_meta,
  dir_output = dir_results
)
print(run_susie(res, dir_results))
## NULL

Implements the MVMR-cML-SuSiE pipeline

This script is based on the tutorial provided at: https://github.com/lapsumchan/MVMR-cML-SuSiE This implements the MVMR-cML-SuSiE pipeline for multivariable Mendelian randomization analysis, including harmonization, LD pruning, and estimation steps (step 2 and step 3).

dir_results <- file.path("results", "MRcML", gene_of_interest)

name_outcome <- list_of_outcomes[1]
names_risk_factor <- list_factors
filepath_gwas_processed <- paste0("data/RData/", gene_of_interest, "_window_", window_size, "_gwas_processed.RDS")
lst_data_full <- readRDS(filepath_gwas_processed)
# Remove rows with missing SNPs
lst_data_full <- lapply(lst_data_full, function(x) x[!is.na(x$snp), ])
# Remove duplicated SNP entries
lst_data_full <- lapply(lst_data_full, function(x) x[!duplicated(x$snp), ])

lst_data <- list(
  risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
  outcome = sapply(name_outcome, function(x) lst_data_full[[x]], simplify = F)
)
res <- harmonize(
  gene_of_interest, window_size,
  lst_data,
  filepath_ld_mat = filepath_ld_mat,
  filepath_ld_meta = filepath_ld_meta,
  dir_output = dir_results
)

# Set LD threshold for pruning
threshold <- 0.9
ld_matrix <- res$ld

# Get SNP IDs and initialize selection flags
snp_ids <- colnames(ld_matrix)
n_snps <- length(snp_ids)
to_keep <- rep(TRUE, n_snps)

# LD pruning: exclude SNPs highly correlated (r² > threshold) with previous ones
for (i in seq_len(n_snps)) {
  if (!to_keep[i]) next

  if (i < n_snps) {
    high_ld <- ld_matrix[i, (i + 1):n_snps] > threshold
    to_keep[(i + 1):n_snps][high_ld] <- FALSE
  }
}

# Compute average sample size for each risk factor and outcome
sample.sizes <- sapply(res[names_risk_factor], function(x) mean(x$nsample))
sample.sizes <- c(sample.sizes, mean(res[[name_outcome]]$nsample))

# Extract effect sizes (beta) and standard errors (se) for valid SNPs
beta.exposure.ls <- lapply(res[names_risk_factor], function(x) x$beta[to_keep])
se.exposure.ls <- lapply(res[names_risk_factor], function(x) x$se[to_keep])
beta.outcome.ls <- rep(list(res[[name_outcome]]$beta[to_keep]), length(beta.exposure.ls))
se.outcome.ls <- rep(list(res[[name_outcome]]$se[to_keep]), length(beta.exposure.ls))

# Convert lists to matrices/vectors for modeling
beta.exposure.mat <- as.matrix(as.data.frame(beta.exposure.ls))
se.exposure.mat <- as.matrix(as.data.frame(se.exposure.ls))
beta.outcome.vec <- beta.outcome.ls[[1]]
se.outcome.vec <- se.outcome.ls[[1]]
pval.exposure.mat <- as.matrix(as.data.frame(lapply(res[names_risk_factor], function(x) x$pval[to_keep])))

# Step 2: Run MVMR (multivariable Mendelian randomization) estimation using cML (constrained maximum likelihood)
suppressWarnings(
  step2.res <- mvmr.cml.susie.step2(
    sample.sizes.subset = sample.sizes,
    beta.exposure.mat = beta.exposure.mat,
    se.exposure.mat = se.exposure.mat,
    beta.outcome.vec = beta.outcome.vec,
    se.outcome.vec = se.outcome.vec,
    pval.exposure.mat = pval.exposure.mat,
    use.openGWAS = FALSE,
    cutoff = 1
  )
)
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
# Prepare correlation matrix (rho) for risk factors and outcome
names <- setdiff(names(res), c("ld", "names"))
rho.mat <- matrix(0, length(list_factors) + 1, length(list_factors) + 1)
colnames(rho.mat) <- rownames(rho.mat) <- names
for (RF1 in names) {
  for (RF2 in names) {
    rho.mat[RF1, RF2] <- cor(res[[RF1]]$beta, res[[RF2]]$beta)
  }
}

# Step 3: Final MVMR estimation with adjustment for invalid instruments and correlation
step3.res <- mvmr.cml.susie.step3(step2.res$mvdat, step2.res$invalid.idx, step2.res$theta.vec, rho.mat)
step3.res$alpha
##      Acute.Insulin.response         BMI     Cortisol        HDL.C     LDL.C
## [1,]           6.259638e-09 0.001155587 1.334981e-08 4.271625e-07 0.9988349
## [2,]           1.505234e-01 0.155030092 1.726508e-01 2.008739e-01 0.1472448
## [3,]           1.515507e-01 0.155642360 1.723577e-01 1.985329e-01 0.1484457
## [4,]           1.556348e-01 0.158276998 1.710340e-01 1.895390e-01 0.1532693
## [5,]           1.594127e-01 0.160930374 1.696594e-01 1.815018e-01 0.1578017
## [6,]           1.621931e-01 0.163024383 1.685642e-01 1.757333e-01 0.1611766
##        Ubiquinone
## [1,] 9.090055e-06
## [2,] 1.736769e-01
## [3,] 1.734705e-01
## [4,] 1.722459e-01
## [5,] 1.706940e-01
## [6,] 1.693084e-01