To install the required packages, run the following in your terminal:
conda env create -f env.yml
conda activate pQTL_comparison
If synapser fails to install, fix the Python path
with:
export RETICULATE_PYTHON=~/miniconda3/envs/pQTL_comparison/bin/python
# Load project-specific functions
source("utils.R")
# devtools::install_github("ash-res/prop-coloc@cd00a469fc1149709db8960e0d722d9df6eb9e03", force = TRUE)
# install.packages("colocPropTest", dependencies = T)
suppressMessages(library(colocPropTest))
# devtools::install_github("kassambara/easyGgplot2")
suppressMessages(library(easyGgplot2))
# remotes::install_github("daynefiler/gnomadR")
suppressMessages(library(gnomadR))
suppressMessages(library(reticulate))
# install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org"))
suppressMessages(library(synapser))
# install.packages("synapserutils", repos = c("http://ran.synapse.org", "https://cloud.r-project.org"))
suppressMessages(library(synapserutils))
# Constants
BASE_DATA_DIR <- "data/RData"
WINDOW_SIZE <- 50000
The pQTL summary statistics used in this study were obtained from the FinnGen Data Freeze 10 release. To download the summary statistics, refer to the official FinnGen portal: https://www.finngen.fi/en/access_results. Specifically, follow the instructions under the section: “How to access the summary statistics”
# wget https://storage.googleapis.com/finngen-public-data-r10/omics/proteomics/release_2023_03_02/data/Olink/probe_map.tsv -O data/FinnGen/pQTL/Olink/probe_map.tsv
# wget https://storage.googleapis.com/finngen-public-data-r10/omics/proteomics/release_2023_03_02/data/Somascan/probe_map.tsv -O data/FinnGen/pQTL/Somascan/probe_map.tsv
Finn_olink <- read.delim("data/FinnGen/pQTL/Olink/probe_map.tsv")
Finn_somascan <- read.delim("data/FinnGen/pQTL/Somascan/probe_map.tsv")
Finn_merged <- merge(Finn_olink, Finn_somascan, by = c("geneName", "chr", "start", "end", "chr2"))
Finn_merged <- Finn_merged[!duplicated(Finn_merged$geneName), ]
genes_FinnGen <- Finn_merged$geneName
for (gene_of_interest in genes_FinnGen) {
filepath_gwas_processed <- file.path(BASE_DATA_DIR, "FinnGen", paste0(gene_of_interest, "_processed.RDS"))
if (!file.exists(filepath_gwas_processed)) {
# message("Processing ", gene_of_interest)
data_FinnGen <- list()
for (platform in c("Somascan", "Olink")) {
data_FinnGen[[platform]] <- load_pQTL_Finngen(gene_of_interest, platform, WINDOW_SIZE)
}
ensure_dir(dirname(filepath_gwas_processed))
saveRDS(data_FinnGen, file = filepath_gwas_processed)
}
}
Synapse_login()
## NULL
UKBB_info <- Synapse_download_data()
for (idx in 1:nrow(UKBB_info)) {
gene_of_interest <- UKBB_info[idx, "genename"]
filepath_gwas_processed <- file.path(BASE_DATA_DIR, "UKBB", paste0(gene_of_interest, "_processed.RDS"))
if (!file.exists(filepath_gwas_processed)) {
# message("Processing ", gene_of_interest)
target_id <- UKBB_info[idx, "id"]
filename_tar <- UKBB_info[idx, "name"]
data_UKBB <- list()
data_UKBB[["Olink"]] <- load_pQTL_UKBB(target_id, filename_tar, gene_of_interest, WINDOW_SIZE)
ensure_dir(dirname(filepath_gwas_processed))
saveRDS(data_UKBB, file = filepath_gwas_processed)
}
}
# This file was obtained from the following URL: https://www.ebi.ac.uk/gwas/publications/29875488.
EGA_info <- read.delim("data/EGA/PMID29875488_studies_export.tsv")
# Example reportedTrait: "Protein levels (XYZ.P12345)" → genename = "XYZ"
EGA_info <- EGA_info %>%
mutate(genename = sub(".*\\(([^.]+)\\..*\\)", "\\1", reportedTrait))
# Subset to genes present in both FinnGen and UKBB datasets
EGA_subset <- EGA_info %>%
filter(genename %in% intersect(genes_FinnGen, UKBB_info$genename)) %>%
filter(!duplicated(genename))
for (idx in 1:nrow(EGA_subset)) {
gene_of_interest <- EGA_subset[idx, "genename"]
filepath_gwas_processed <- file.path(BASE_DATA_DIR, "EGA", paste0(gene_of_interest, "_processed.RDS"))
if (!file.exists(filepath_gwas_processed)) {
# message("Processing ", gene_of_interest)
# Extract accession ID and summary statistics download URL
accessionId <- EGA_subset[idx, "accessionId"]
addr <- EGA_subset[idx, "summaryStatistics"]
filepath <- paste0(addr, "/harmonised/", accessionId, ".h.tsv.gz")
data_EGA <- list()
data_EGA[["Somascan"]] <- load_pQTL_EGA(gene_of_interest, filepath, WINDOW_SIZE)
ensure_dir(dirname(filepath_gwas_processed))
saveRDS(data_EGA, file = filepath_gwas_processed)
}
}
# Step 1: Identify genes that are common across all three datasets.
# Only genes present in all datasets are considered for downstream analysis.
genes_common <- Reduce(intersect, list(UKBB_info$genename, EGA_info$genename, genes_FinnGen))
# Define coordinates of the MHC (Major Histocompatibility Complex) region on chromosome 6.
# Genes in this region are excluded due to complex LD structure that confounds colocalization.
# Source: https://www.ncbi.nlm.nih.gov/grc/human/regions/MHC
MHC <- list(CHR = 6, start = 28510120, end = 33480577)
gene_regions <- do.call(rbind, lapply(genes_common, function(g) as.data.frame(get_gene_region_GRCh38_UKBB(g, WINDOW_SIZE))))
# Identify genes that fall within the MHC region (to be excluded).
genes_in_MHC <- genes_common[gene_regions$CHR == MHC$CHR & gene_regions$locus_lower > MHC$start & gene_regions$locus_upper < MHC$end]
# Identify genes located on the X chromosome (to be excluded).
# Reason: UKBB LD reference is not available for chromosome X.
genes_Xchr <- genes_common[gene_regions$CHR == "X"]
# Step 2: Exclude MHC and X chromosome genes from the common gene list.
filtered_genes <- setdiff(genes_common, genes_in_MHC) # Remove MHC genes (1132 remaining)
filtered_genes <- setdiff(filtered_genes, genes_Xchr) # Remove X chr genes (1100 remaining)
# Step 3: Further filter to include only genes with at least one significant SNP (p < 1e-8)
genes_final <- NULL
for (gene_of_interest in filtered_genes) {
data_FinnGen <- readRDS(file.path(BASE_DATA_DIR, "FinnGen", paste0(gene_of_interest, "_processed.RDS")))
data_UKBB <- readRDS(file.path(BASE_DATA_DIR, "UKBB", paste0(gene_of_interest, "_processed.RDS")))
data_EGA <- readRDS(file.path(BASE_DATA_DIR, "EGA", paste0(gene_of_interest, "_processed.RDS")))
# check_significant_hits() returns TRUE if there is at least one SNP with p-value < pval_cutoff in the specified dataset and platform
if (check_significant_hits(data_FinnGen, gene_of_interest, "Olink") &&
check_significant_hits(data_FinnGen, gene_of_interest, "Somascan") &&
check_significant_hits(data_UKBB, gene_of_interest, "Olink") &&
check_significant_hits(data_EGA, gene_of_interest, "Somascan")) {
genes_final <- c(genes_final, gene_of_interest)
}
}
print(length(genes_final))
## [1] 153
for (gene_of_interest in genes_final) {
# cat("Processing gene:", gene_of_interest, "\n")
# Load pre-processed pQTL summary statistics for each gene from multiple sources:
# - FinnGen (Finnish ancestry)
# - UK Biobank (British/Irish ancestry)
# - EGA (European ancestry summary-level data)
data_FinnGen <- readRDS(file.path(BASE_DATA_DIR, "FinnGen", paste0(gene_of_interest, "_processed.RDS")))
data_UKBB <- readRDS(file.path(BASE_DATA_DIR, "UKBB", paste0(gene_of_interest, "_processed.RDS")))
data_EGA <- readRDS(file.path(BASE_DATA_DIR, "EGA", paste0(gene_of_interest, "_processed.RDS")))
# CASE 1: Same ancestry (British/Irish), same platform
# Baseline consistency check using data from UK Biobank (Olink vs Olink within-UKBB)
data_CASE1 <- readRDS(file.path(BASE_DATA_DIR, "CASE1", paste0(gene_of_interest, "_processed.RDS")))
# CASE 2: Same ancestry, different platforms
# - data_CASE2F: Finnish individuals measured with one platform (e.g., FinnGen)
# - data_CASE2B: British/Irish individuals measured with both Olink and SomaScan
data_CASE2F <- data_FinnGen
data_CASE2B <- list(
Olink = data_UKBB$Olink,
Somascan = data_EGA$Somascan
)
# CASE 3: Different ancestries, same platform (Olink)
# Comparing Finnish (FinnGen) and British/Irish (UKBB) Olink-based pQTLs
data_CASE3 <- list(
FinnGen = data_FinnGen$Olink,
UKBB = data_UKBB$Olink
)
# CASE 4: Different ancestries, same platform (SomaScan)
# Comparing Finnish (FinnGen) and European (EGA) SomaScan pQTLs
data_CASE4 <- list(
FinnGen = data_FinnGen$Somascan,
EGA = data_EGA$Somascan
)
# Run colocalization analysis for each case using the appropriate LD reference
# - LD_type: either a single LD source (e.g., "UKBB") or a pair (e.g., c("UKBB", "FinnGen"))
run_colocalization_analysis("CASE1", gene_of_interest, WINDOW_SIZE, data_CASE1, LD_type = "UKBB") # CASE 1: Same ancestry, same platform
run_colocalization_analysis("CASE2F", gene_of_interest, WINDOW_SIZE, data_CASE2F, LD_type = "FinnGen") # CASE 2: Finnish data (same ancestry, different platform)
run_colocalization_analysis("CASE2B", gene_of_interest, WINDOW_SIZE, data_CASE2B, LD_type = "UKBB") # CASE 2: British/Irish data (same ancestry, different platform)
run_colocalization_analysis("CASE3", gene_of_interest, WINDOW_SIZE, data_CASE3, LD_type = c("FinnGen", "UKBB")) # CASE 3: Different ancestry, same platform (Olink)
run_colocalization_analysis("CASE4", gene_of_interest, WINDOW_SIZE, data_CASE4, LD_type = c("FinnGen", "UKBB")) # CASE 4: Different ancestry, same platform (SomaScan)
}
library(knitr)
# Combine colocalization results from all comparison scenarios (CASE1–4),
# including separate LD reference runs (e.g., CASE3F_LD = Finnish LD, CASE3B_LD = British LD)
runIDs <- c("CASE1", "CASE2B", "CASE2F", "CASE3", "CASE3F_LD", "CASE3B_LD", "CASE4", "CASE4F_LD", "CASE4B_LD")
runIDs_rename <- c("CASE 1", "CASE 2-B", "CASE 2-F", "CASE 3-OM", "CASE 3-OF", "CASE 3-OB", "CASE 4-SM", "CASE 4-SF", "CASE 4-SB")
combined <- bind_rows(lapply(runIDs, get_all_results))
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
rownames(combined) <- runIDs_rename
combined$runID <- NULL
kable(combined)
| coloc-C | coloc-NC | coloc-IS | susie-C | susie-NC | susie-IS | sharepro-C | sharepro-NC | sharepro-IS | propcoloc-C | propcoloc-NC | propcoloc-IS | colocPropTest-C | colocPropTest-NC | colocPropTest-IS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CASE 1 | 0.95 | 0.05 | 0.01 | 1 | 0 | 0 | 1 | 0 | 0 | 0.87 | 0.13 | 0.00 | 0.97 | 0.03 | 0.00 |
| CASE 2-B | 0.63 | 0.31 | 0.07 | 0.83 | 0.10 | 0.07 | 0.98 | 0.02 | 0.00 | 0.51 | 0.43 | 0.07 | 0.45 | 0.55 | 0.00 |
| CASE 2-F | 0.76 | 0.18 | 0.06 | 0.71 | 0.03 | 0.26 | 0.94 | 0.06 | 0.00 | 0.50 | 0.09 | 0.41 | 0.86 | 0.14 | 0.00 |
| CASE 3-OM | 0.79 | 0.11 | 0.10 | 0.56 | 0.17 | 0.27 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CASE 3-OF | 0 | 0 | 1 | 0.71 | 0.02 | 0.27 | 0.98 | 0.02 | 0.00 | 0.32 | 0.25 | 0.43 | 0.65 | 0.35 | 0.00 |
| CASE 3-OB | 0 | 0 | 1 | 0.76 | 0.14 | 0.10 | 0.98 | 0.02 | 0.00 | 0.40 | 0.31 | 0.29 | 0.43 | 0.57 | 0.00 |
| CASE 4-SM | 0.75 | 0.17 | 0.08 | 0.64 | 0.20 | 0.16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CASE 4-SF | 0 | 0 | 1 | 0.80 | 0.01 | 0.18 | 0.97 | 0.03 | 0.00 | 0.49 | 0.17 | 0.34 | 0.61 | 0.38 | 0.01 |
| CASE 4-SB | 0 | 0 | 1 | 0.82 | 0.14 | 0.04 | 0.97 | 0.03 | 0.00 | 0.42 | 0.39 | 0.19 | 0.48 | 0.52 | 0.01 |
write.table(combined, "results/tab_combined_prop_test.txt", quote = F, row.names = T, col.names = T, sep = "\t")
library(ComplexUpset)
library(patchwork)
LD_types <- c("FinnGen", "UKBB")
runIDs_filtered <- grep("_LD", runIDs, invert = TRUE, value = TRUE)
encoded <- lapply(LD_types, function(this_LD) {
do.call(rbind, lapply(runIDs_filtered, function(id) {
get_encoded(id, LD_type = this_LD)
}))
})
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
## Warning in max(share): no non-missing arguments to max; returning -Inf
names(encoded) <- LD_types
input_expanded1 <- create_input_expanded(encoded[["FinnGen"]]) %>% filter(runID %in% c("CASE 1", "CASE 2-B", "CASE 2-F"))
input_expanded2 <- create_input_expanded(encoded[["FinnGen"]]) %>% filter(runID %in% c("CASE 3-OF", "CASE 4-SF"))
input_expanded3 <- create_input_expanded(encoded[["UKBB"]]) %>% filter(runID %in% c("CASE 3-OB", "CASE 4-SB"))
# pdf("results/fig_upset.pdf", width = 12, height = 12)
p <- patchwork::wrap_plots(c(
plot_upset(input_expanded1),
plot_upset(input_expanded2),
list(ggplot()),
plot_upset(input_expanded3)
), ncol = 3, nrow = 3)
plot(p)
# dev.off()