# Installed the following packages by runing:
# > conda env create -f env.yml
# > conda activate HMGCR-Causality
suppressMessages(library(dplyr))
suppressMessages(library(vroom))
suppressMessages(library(rtracklayer))
suppressMessages(library(reticulate))
suppressMessages(library(ggplot2))
suppressMessages(library(reshape2))
suppressMessages(library(curl))
suppressMessages(library(MendelianRandomization))
suppressMessages(library(coloc))
suppressMessages(library(susieR))
# devtools::install_github("ash-res/prop-coloc")
suppressMessages(library(prop.coloc))
# install.packages("colocPropTest", dependencies = T)
suppressMessages(library(colocPropTest))
# devtools::install_github("kassambara/easyGgplot2")
suppressMessages(library(easyGgplot2))
# devtools::install_github("lapsumchan/MVMR-cML-SuSiE")
suppressMessages(library(MVMRcMLSuSiE))
# devtools::install_github("xue-hr/MRcML")
suppressMessages(library(MRcML))
# install.packages("conflicted")
suppressMessages(library(conflicted))
source("PCA-LIML-function.R")
source("summary_mvMR_BF.R")
source("summary_mvMR_SSS.R")
source("utils.R")
suppressMessages(conflict_prefer("mr_cML", "MRcML"))
suppressMessages(conflicts_prefer(base::setdiff))
suppressMessages(conflicts_prefer(base::intersect))
suppressMessages(conflicts_prefer(combinat::combn))
suppressMessages(conflicts_prefer(dplyr::filter))
# Download LD data using [AWS](https://registry.opendata.aws/ukbb-ld/) by running the following command and save the downloaded data in the data/UKBB_LD/ directory.
# > aws s3 cp s3://broad-alkesgroup-ukbb-ld/ . --recursive --no-sign-request --exclude "*" --include "UKBB_LD/chr5_74000001_77000001*"
filepath_ld_mat <- "data/UKBB_LD/chr5_74000001_77000001.npz"
filepath_ld_meta <- "data/UKBB_LD/chr5_74000001_77000001.gz"
# Go to https://diagram-consortium.org/downloads.html
# Download Ancestry specific GWAS meta-analysis summary statistics: European
# Save the downloaded data in data/T2D/ directory.
# This was published in Mahajan et al (2022)
filepath_T2D <- "data/T2D/DIAMANTE-EUR.sumstat.txt"
gene_of_interest <- "HMGCR"
window_size <- 10000
filepath_gwas_processed <- paste0("data/RData/", gene_of_interest, "_window_", window_size, "_gwas_processed.RDS")
if (!file.exists(filepath_gwas_processed)) {
lst_data_full <- list()
lst_data_full[["T2D"]] <- load_T2D(filepath_T2D, gene_of_interest, window_size)
lst_data_full[["BMI"]] <- load_BMI(gene_of_interest, window_size)
lst_pheno_GWAS <- c("Acute Insulin response", "Fasting Insulin", "Fasting Glucose", "LDL-C", "HDL-C", "Triglyceride", "Leptin", "Sterol", "Cortisol", "Estradiol", "Vitamin D", "Bile acid", "Aldosterone", "Ubiquinone", "CAD", "Testosterone")
for (pheno in lst_pheno_GWAS) {
lst_data_full[[pheno]] <- load_GWAS(pheno, gene_of_interest, window_size)
}
check_dir("data/RData/")
saveRDS(lst_data_full, file = filepath_gwas_processed)
} else {
lst_data_full <- readRDS(filepath_gwas_processed)
}
sapply(lst_data_full, nrow)
## T2D BMI Acute Insulin response
## 134 275 98
## Fasting Insulin Fasting Glucose LDL-C
## 30 30 661
## HDL-C Triglyceride Leptin
## 660 664 5
## Sterol Cortisol Estradiol
## 11 185 94
## Vitamin D Bile acid Aldosterone
## 78 11 84
## Ubiquinone CAD Testosterone
## 178 137 95
input_risk_factors1 <- as.data.frame(t(combn(x = names(lst_data_full), m = 2, simplify = T)))
dir_results <- file.path("results", paste0("window_", window_size), gene_of_interest)
for (idx in 1:nrow(input_risk_factors1)) {
names_risk_factor <- sort(as.character(input_risk_factors1[idx, ]))
lst_data <- list(
risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
outcome = NULL
)
res <- harmonize(
gene_of_interest, window_size,
lst_data,
filepath_ld_mat = filepath_ld_mat,
filepath_ld_meta = filepath_ld_meta,
dir_output = dir_results
)
# Run Proportional colocalization analysis
run_colocProp(res, dir_results)
}
df_propcoloc <- get_propcoloc_res(dir_results, list_factors = names(lst_data_full))
list_pairs <- df_propcoloc$filtered$group
list_of_outcomes <- c("CAD", "T2D")
list_factors <- setdiff(sort(unique(unlist(strsplit(list_pairs, "_")))), list_of_outcomes)
for (idx1 in 1:length(list_factors)) {
for (idx2 in 1:idx1) {
if (idx1 == idx2) next
RF1 <- list_factors[idx1]
RF2 <- list_factors[idx2]
names_risk_factor <- sort(c(RF1, RF2))
lst_data <- list(
risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
outcome = NULL
)
res <- harmonize(
gene_of_interest, window_size,
lst_data,
filepath_ld_mat = filepath_ld_mat,
filepath_ld_meta = filepath_ld_meta,
dir_output = dir_results
)
run_coloc(res, dir_results)
run_susie(res, dir_results)
run_propcoloc_Wallace(res, dir_results)
}
}
pairs_phenotypic_heterogeneity <- select_phenotypic_heterogeneity(dir_results, list_factors)
for (name_outcome in list_of_outcomes) {
for (idx in 1:length(pairs_phenotypic_heterogeneity)) {
names_risk_factor <- sort(unlist(strsplit(pairs_phenotypic_heterogeneity[idx], "_")))
lst_data <- list(
risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
outcome = sapply(name_outcome, function(x) lst_data_full[[x]], simplify = F)
)
res <- harmonize(
gene_of_interest, window_size,
lst_data,
filepath_ld_mat = filepath_ld_mat,
filepath_ld_meta = filepath_ld_meta,
dir_output = dir_results
)
run_PCA_liml(res, dir_results)
}
}
dir_figs <- file.path(dir_results, "figures")
check_dir(dir_figs)
plot_propcoloc_and_susie_barplots_pairwise(
gene_of_interest,
list_factors,
dir_results,
fig_name = file.path(dir_figs, "Figure2_heatmap_of_colocalization_results_from_propcoloc_and_susie.pdf")
)
## Using GROUP, RF1, RF2 as id variables
## Using GROUP, RF1, RF2 as id variables
## Using GROUP, RF1, RF2 as id variables
## Warning in geom_bar(stat = stat, position = position, width = pms$width, :
## Ignoring unknown parameters: `binwidth`
plot_propcoloc_and_wallace_barplots_pairwise(
gene_of_interest,
list_factors,
dir_results,
fig_name = file.path(dir_figs, "SupFigure1_heatmap_of_colocalization_results_from_propcoloc_and_colocPropTest.pdf")
)
## Using GROUP, RF1, RF2 as id variables
## Using GROUP, RF1, RF2 as id variables
## Warning in geom_bar(stat = stat, position = position, width = pms$width, :
## Ignoring unknown parameters: `binwidth`
forestplot_OR(
list_of_outcomes,
dir_results,
fig_name = file.path(dir_figs, "Figure3_MVMR.pdf")
)
lst1 <- c("LDL-C", "BMI")
lst2 <- c("CAD", "T2D")
for (RF1 in lst1) {
for (RF2 in lst2) {
names_risk_factor <- sort(c(RF1, RF2))
lst_data <- list(
risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
outcome = NULL
)
res <- harmonize(
gene_of_interest, window_size,
lst_data,
filepath_ld_mat = filepath_ld_mat,
filepath_ld_meta = filepath_ld_meta,
dir_output = dir_results
)
run_colocProp(res, dir_results)
run_coloc(res, dir_results)
run_susie(res, dir_results)
run_propcoloc_Wallace(res, dir_results)
}
}
lst <- sort(c(lst1, lst2))
df_propcoloc <- get_propcoloc_res(dir_results, lst)$full %>% select("group", "p_cond", "LM_cond")
df_susie <- get_susie_res(dir_results, lst)$full %>% select("group", "H3", "H4")
df_coloc <- get_coloc_res(dir_results, lst)$full %>% select("group", "H3", "H4")
df_wallace <- get_colocPropTest_res(dir_results, lst)$full %>% select("group", "min_fdr")
merged <- Reduce(function(x, y) merge(x, y, by = "group", all = T), list(df_susie, df_coloc, df_propcoloc, df_wallace))
rownames(merged) <- merged$group
print(merged[c("CAD_LDL-C", "LDL-C_T2D", "BMI_CAD", "BMI_T2D"), ])
## group H3.x H4.x H3.y H4.y p_cond
## CAD_LDL-C CAD_LDL-C 0.0003498526 0.9996447 0.0002420084 0.9997193 0.44
## LDL-C_T2D LDL-C_T2D 0.9182634185 0.0775393 0.8665002870 0.1202444 0.59
## BMI_CAD BMI_CAD 0.1814497277 0.8181700 0.0875704716 0.9096733 0.04
## BMI_T2D BMI_T2D NA NA 0.0289479815 0.9694342 0.97
## LM_cond min_fdr
## CAD_LDL-C 1.842154e-05 1.00000000
## LDL-C_T2D 1.320231e-03 0.67997497
## BMI_CAD 3.341441e-05 0.01049868
## BMI_T2D 1.921592e-19 1.00000000
for (name_outcome in list_of_outcomes) {
names_risk_factor <- list_factors
lst_data <- list(
risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
outcome = sapply(name_outcome, function(x) lst_data_full[[x]], simplify = F)
)
res <- harmonize(
gene_of_interest, window_size,
lst_data,
filepath_ld_mat = filepath_ld_mat,
filepath_ld_meta = filepath_ld_meta,
dir_output = dir_results
)
print(run_BMA(res, dir_results))
}
## NULL
## NULL
names_risk_factor <- sort(c("BMI", "LDL-C"))
n_PCs_thres <- df_Table3 <- list()
for (name_outcome in list_of_outcomes) {
n_PCs_thres[[name_outcome]] <- list()
lst_data <- list(
risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
outcome = sapply(name_outcome, function(x) lst_data_full[[x]], simplify = F)
)
res <- harmonize(
gene_of_interest, window_size,
lst_data,
filepath_ld_mat = filepath_ld_mat,
filepath_ld_meta = filepath_ld_meta,
dir_output = dir_results
)
n_PCs_thres[[name_outcome]] <- list()
for (thres in c(0.99, 0.999, 0.9999)) {
n_PCs_thres[[name_outcome]]$pca.no <- rbind(n_PCs_thres[[name_outcome]]$pca.no, data.frame(thres, n_PCs = pca.no(res, thres)))
}
n_PCs_range <- with(n_PCs_thres[[name_outcome]]$pca.no, min(n_PCs):max(n_PCs))
for (n_PCs in n_PCs_range) {
out_PCA_liml <- run_PCA_liml(res, dir_results, n_PCs)
n_PCs_thres[[name_outcome]]$tab <- rbind(n_PCs_thres[[name_outcome]]$tab, out_PCA_liml)
}
print(n_PCs_thres[[name_outcome]]$tab)
}
## NULL
## NULL
names_risk_factor <- sort(c("LDL-C", "T2D"))
lst_data <- list(
risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
outcome = NULL
)
res <- harmonize(
gene_of_interest, window_size,
lst_data,
filepath_ld_mat = filepath_ld_mat,
filepath_ld_meta = filepath_ld_meta,
dir_output = dir_results
)
print(run_susie(res, dir_results))
## NULL
This script is based on the tutorial provided at: https://github.com/lapsumchan/MVMR-cML-SuSiE This implements the MVMR-cML-SuSiE pipeline for multivariable Mendelian randomization analysis, including harmonization, LD pruning, and estimation steps (step 2 and step 3).
dir_results <- file.path("results", "MRcML", gene_of_interest)
name_outcome <- list_of_outcomes[1]
names_risk_factor <- list_factors
filepath_gwas_processed <- paste0("data/RData/", gene_of_interest, "_window_", window_size, "_gwas_processed.RDS")
lst_data_full <- readRDS(filepath_gwas_processed)
# Remove rows with missing SNPs
lst_data_full <- lapply(lst_data_full, function(x) x[!is.na(x$snp), ])
# Remove duplicated SNP entries
lst_data_full <- lapply(lst_data_full, function(x) x[!duplicated(x$snp), ])
lst_data <- list(
risk_factors = sapply(names_risk_factor, function(x) lst_data_full[[x]], simplify = F),
outcome = sapply(name_outcome, function(x) lst_data_full[[x]], simplify = F)
)
res <- harmonize(
gene_of_interest, window_size,
lst_data,
filepath_ld_mat = filepath_ld_mat,
filepath_ld_meta = filepath_ld_meta,
dir_output = dir_results
)
# Set LD threshold for pruning
threshold <- 0.9
ld_matrix <- res$ld
# Get SNP IDs and initialize selection flags
snp_ids <- colnames(ld_matrix)
n_snps <- length(snp_ids)
to_keep <- rep(TRUE, n_snps)
# LD pruning: exclude SNPs highly correlated (r² > threshold) with previous ones
for (i in seq_len(n_snps)) {
if (!to_keep[i]) next
if (i < n_snps) {
high_ld <- ld_matrix[i, (i + 1):n_snps] > threshold
to_keep[(i + 1):n_snps][high_ld] <- FALSE
}
}
# Compute average sample size for each risk factor and outcome
sample.sizes <- sapply(res[names_risk_factor], function(x) mean(x$nsample))
sample.sizes <- c(sample.sizes, mean(res[[name_outcome]]$nsample))
# Extract effect sizes (beta) and standard errors (se) for valid SNPs
beta.exposure.ls <- lapply(res[names_risk_factor], function(x) x$beta[to_keep])
se.exposure.ls <- lapply(res[names_risk_factor], function(x) x$se[to_keep])
beta.outcome.ls <- rep(list(res[[name_outcome]]$beta[to_keep]), length(beta.exposure.ls))
se.outcome.ls <- rep(list(res[[name_outcome]]$se[to_keep]), length(beta.exposure.ls))
# Convert lists to matrices/vectors for modeling
beta.exposure.mat <- as.matrix(as.data.frame(beta.exposure.ls))
se.exposure.mat <- as.matrix(as.data.frame(se.exposure.ls))
beta.outcome.vec <- beta.outcome.ls[[1]]
se.outcome.vec <- se.outcome.ls[[1]]
pval.exposure.mat <- as.matrix(as.data.frame(lapply(res[names_risk_factor], function(x) x$pval[to_keep])))
# Step 2: Run MVMR (multivariable Mendelian randomization) estimation using cML (constrained maximum likelihood)
suppressWarnings(
step2.res <- mvmr.cml.susie.step2(
sample.sizes.subset = sample.sizes,
beta.exposure.mat = beta.exposure.mat,
se.exposure.mat = se.exposure.mat,
beta.outcome.vec = beta.outcome.vec,
se.outcome.vec = se.outcome.vec,
pval.exposure.mat = pval.exposure.mat,
use.openGWAS = FALSE,
cutoff = 1
)
)
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
# Prepare correlation matrix (rho) for risk factors and outcome
names <- setdiff(names(res), c("ld", "names"))
rho.mat <- matrix(0, length(list_factors) + 1, length(list_factors) + 1)
colnames(rho.mat) <- rownames(rho.mat) <- names
for (RF1 in names) {
for (RF2 in names) {
rho.mat[RF1, RF2] <- cor(res[[RF1]]$beta, res[[RF2]]$beta)
}
}
# Step 3: Final MVMR estimation with adjustment for invalid instruments and correlation
step3.res <- mvmr.cml.susie.step3(step2.res$mvdat, step2.res$invalid.idx, step2.res$theta.vec, rho.mat)
step3.res$alpha
## Acute.Insulin.response BMI Cortisol HDL.C LDL.C
## [1,] 6.259638e-09 0.001155587 1.334981e-08 4.271625e-07 0.9988349
## [2,] 1.505234e-01 0.155030092 1.726508e-01 2.008739e-01 0.1472448
## [3,] 1.515507e-01 0.155642360 1.723577e-01 1.985329e-01 0.1484457
## [4,] 1.556348e-01 0.158276998 1.710340e-01 1.895390e-01 0.1532693
## [5,] 1.594127e-01 0.160930374 1.696594e-01 1.815018e-01 0.1578017
## [6,] 1.621931e-01 0.163024383 1.685642e-01 1.757333e-01 0.1611766
## Ubiquinone
## [1,] 9.090055e-06
## [2,] 1.736769e-01
## [3,] 1.734705e-01
## [4,] 1.722459e-01
## [5,] 1.706940e-01
## [6,] 1.693084e-01