To install the required packages, run the following in your terminal:
conda env create -f env.yml
conda activate pQTL_comparison
If synapser
fails to install, fix the Python path
with:
export RETICULATE_PYTHON=~/miniconda3/envs/pQTL_comparison/bin/python
# Load project-specific functions
source("utils.R")
# devtools::install_github("ash-res/prop-coloc")
suppressMessages(library(prop.coloc))
# install.packages("colocPropTest", dependencies = T)
suppressMessages(library(colocPropTest))
# devtools::install_github("kassambara/easyGgplot2")
suppressMessages(library(easyGgplot2))
# remotes::install_github("daynefiler/gnomadR")
suppressMessages(library(gnomadR))
suppressMessages(library(reticulate))
# install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org"))
suppressMessages(library(synapser))
# install.packages("synapserutils", repos = c("http://ran.synapse.org", "https://cloud.r-project.org"))
suppressMessages(library(synapserutils))
# Constants
BASE_DATA_DIR <- "data/RData"
WINDOW_SIZE <- 50000
The pQTL summary statistics used in this study were obtained from the FinnGen Data Freeze 10 release. To download the summary statistics, refer to the official FinnGen portal: https://www.finngen.fi/en/access_results. Specifically, follow the instructions under the section: “How to access the summary statistics”
# wget https://storage.googleapis.com/finngen-public-data-r10/omics/proteomics/release_2023_03_02/data/Olink/probe_map.tsv -O data/FinnGen/pQTL/Olink/probe_map.tsv
# wget https://storage.googleapis.com/finngen-public-data-r10/omics/proteomics/release_2023_03_02/data/Somascan/probe_map.tsv -O data/FinnGen/pQTL/Somascan/probe_map.tsv
Finn_olink <- read.delim("data/FinnGen/pQTL/Olink/probe_map.tsv")
Finn_somascan <- read.delim("data/FinnGen/pQTL/Somascan/probe_map.tsv")
Finn_merged <- merge(Finn_olink, Finn_somascan, by = c("geneName", "chr", "start", "end", "chr2"))
Finn_merged <- Finn_merged[!duplicated(Finn_merged$geneName), ]
genes_FinnGen <- Finn_merged$geneName
for (gene_of_interest in genes_FinnGen) {
filepath_gwas_processed <- file.path(BASE_DATA_DIR, "FinnGen", paste0(gene_of_interest, "_processed.RDS"))
if (!file.exists(filepath_gwas_processed)) {
# message("Processing ", gene_of_interest)
data_FinnGen <- list()
for (platform in c("Somascan", "Olink")) {
data_FinnGen[[platform]] <- load_pQTL_Finngen(gene_of_interest, platform, WINDOW_SIZE)
}
ensure_dir(dirname(filepath_gwas_processed))
saveRDS(data_FinnGen, file = filepath_gwas_processed)
}
}
Synapse_login()
## NULL
UKBB_info <- Synapse_download_data()
for (idx in 1:nrow(UKBB_info)) {
gene_of_interest <- UKBB_info[idx, "genename"]
filepath_gwas_processed <- file.path(BASE_DATA_DIR, "UKBB", paste0(gene_of_interest, "_processed.RDS"))
if (!file.exists(filepath_gwas_processed)) {
# message("Processing ", gene_of_interest)
target_id <- UKBB_info[idx, "id"]
filename_tar <- UKBB_info[idx, "name"]
data_UKBB <- list()
data_UKBB[["Olink"]] <- load_pQTL_UKBB(target_id, filename_tar, gene_of_interest, WINDOW_SIZE)
ensure_dir(dirname(filepath_gwas_processed))
saveRDS(data_UKBB, file = filepath_gwas_processed)
}
}
# This file was obtained from the following URL: https://www.ebi.ac.uk/gwas/publications/29875488.
EGA_info <- read.delim("data/EGA/PMID29875488_studies_export.tsv")
# Example reportedTrait: "Protein levels (XYZ.P12345)" → genename = "XYZ"
EGA_info <- EGA_info %>%
mutate(genename = sub(".*\\(([^.]+)\\..*\\)", "\\1", reportedTrait))
# Subset to genes present in both FinnGen and UKBB datasets
EGA_subset <- EGA_info %>%
filter(genename %in% intersect(genes_FinnGen, UKBB_info$genename)) %>%
filter(!duplicated(genename))
for (idx in 1:nrow(EGA_subset)) {
gene_of_interest <- EGA_subset[idx, "genename"]
filepath_gwas_processed <- file.path(BASE_DATA_DIR, "EGA", paste0(gene_of_interest, "_processed.RDS"))
if (!file.exists(filepath_gwas_processed)) {
# message("Processing ", gene_of_interest)
# Extract accession ID and summary statistics download URL
accessionId <- EGA_subset[idx, "accessionId"]
addr <- EGA_subset[idx, "summaryStatistics"]
filepath <- paste0(addr, "/harmonised/", accessionId, ".h.tsv.gz")
data_EGA <- list()
data_EGA[["Somascan"]] <- load_pQTL_EGA(gene_of_interest, filepath, WINDOW_SIZE)
ensure_dir(dirname(filepath_gwas_processed))
saveRDS(data_EGA, file = filepath_gwas_processed)
}
}
# Step 1: Identify genes that are common across all three datasets.
# Only genes present in all datasets are considered for downstream analysis.
genes_common <- Reduce(intersect, list(UKBB_info$genename, EGA_info$genename, genes_FinnGen))
# Define coordinates of the MHC (Major Histocompatibility Complex) region on chromosome 6.
# Genes in this region are excluded due to complex LD structure that confounds colocalization.
# Source: https://www.ncbi.nlm.nih.gov/grc/human/regions/MHC
MHC <- list(CHR = 6, start = 28510120, end = 33480577)
gene_regions <- do.call(rbind, lapply(genes_common, function(g) as.data.frame(get_gene_region_GRCh38_UKBB(g, WINDOW_SIZE))))
# Identify genes that fall within the MHC region (to be excluded).
genes_in_MHC <- genes_common[gene_regions$CHR == MHC$CHR & gene_regions$locus_lower > MHC$start & gene_regions$locus_upper < MHC$end]
# Identify genes located on the X chromosome (to be excluded).
# Reason: UKBB LD reference is not available for chromosome X.
genes_Xchr <- genes_common[gene_regions$CHR == "X"]
# Step 2: Exclude MHC and X chromosome genes from the common gene list.
filtered_genes <- setdiff(genes_common, genes_in_MHC) # Remove MHC genes (1132 remaining)
filtered_genes <- setdiff(filtered_genes, genes_Xchr) # Remove X chr genes (1100 remaining)
# Step 3: Further filter to include only genes with at least one significant SNP (p < 1e-8)
genes_final <- NULL
for (gene_of_interest in filtered_genes) {
data_FinnGen <- readRDS(file.path(BASE_DATA_DIR, "FinnGen", paste0(gene_of_interest, "_processed.RDS")))
data_UKBB <- readRDS(file.path(BASE_DATA_DIR, "UKBB", paste0(gene_of_interest, "_processed.RDS")))
data_EGA <- readRDS(file.path(BASE_DATA_DIR, "EGA", paste0(gene_of_interest, "_processed.RDS")))
# check_significant_hits() returns TRUE if there is at least one SNP with p-value < pval_cutoff in the specified dataset and platform
if (check_significant_hits(data_FinnGen, gene_of_interest, "Olink") &&
check_significant_hits(data_FinnGen, gene_of_interest, "Somascan") &&
check_significant_hits(data_UKBB, gene_of_interest, "Olink") &&
check_significant_hits(data_EGA, gene_of_interest, "Somascan")) {
genes_final <- c(genes_final, gene_of_interest)
}
}
print(length(genes_final))
## [1] 153
for (gene_of_interest in genes_final) {
# cat("Processing gene:", gene_of_interest, "\n")
# Load pre-processed pQTL summary statistics for each gene from multiple sources:
# - FinnGen (Finnish ancestry)
# - UK Biobank (British/Irish ancestry)
# - EGA (European ancestry summary-level data)
data_FinnGen <- readRDS(file.path(BASE_DATA_DIR, "FinnGen", paste0(gene_of_interest, "_processed.RDS")))
data_UKBB <- readRDS(file.path(BASE_DATA_DIR, "UKBB", paste0(gene_of_interest, "_processed.RDS")))
data_EGA <- readRDS(file.path(BASE_DATA_DIR, "EGA", paste0(gene_of_interest, "_processed.RDS")))
# CASE 1: Same ancestry (British/Irish), same platform
# Baseline consistency check using data from UK Biobank (Olink vs Olink within-UKBB)
data_CASE1 <- readRDS(file.path(BASE_DATA_DIR, "CASE1", paste0(gene_of_interest, "_processed.RDS")))
# CASE 2: Same ancestry, different platforms
# - data_CASE2F: Finnish individuals measured with one platform (e.g., FinnGen)
# - data_CASE2B: British/Irish individuals measured with both Olink and SomaScan
data_CASE2F <- data_FinnGen
data_CASE2B <- list(
Olink = data_UKBB$Olink,
Somascan = data_EGA$Somascan
)
# CASE 3: Different ancestries, same platform (Olink)
# Comparing Finnish (FinnGen) and British/Irish (UKBB) Olink-based pQTLs
data_CASE3 <- list(
FinnGen = data_FinnGen$Olink,
UKBB = data_UKBB$Olink
)
# CASE 4: Different ancestries, same platform (SomaScan)
# Comparing Finnish (FinnGen) and European (EGA) SomaScan pQTLs
data_CASE4 <- list(
FinnGen = data_FinnGen$Somascan,
EGA = data_EGA$Somascan
)
# Run colocalization analysis for each case using the appropriate LD reference
# - LD_type: either a single LD source (e.g., "UKBB") or a pair (e.g., c("UKBB", "FinnGen"))
run_colocalization_analysis("CASE1", gene_of_interest, WINDOW_SIZE, data_CASE1, LD_type = "UKBB") # CASE 1: Same ancestry, same platform
run_colocalization_analysis("CASE2F", gene_of_interest, WINDOW_SIZE, data_CASE2F, LD_type = "FinnGen") # CASE 2: Finnish data (same ancestry, different platform)
run_colocalization_analysis("CASE2B", gene_of_interest, WINDOW_SIZE, data_CASE2B, LD_type = "UKBB") # CASE 2: British/Irish data (same ancestry, different platform)
run_colocalization_analysis("CASE3", gene_of_interest, WINDOW_SIZE, data_CASE3, LD_type = c("FinnGen", "UKBB")) # CASE 3: Different ancestry, same platform (Olink)
run_colocalization_analysis("CASE4", gene_of_interest, WINDOW_SIZE, data_CASE4, LD_type = c("FinnGen", "UKBB")) # CASE 4: Different ancestry, same platform (SomaScan)
}
# Combine colocalization results from all comparison scenarios (CASE1–4),
# including separate LD reference runs (e.g., CASE3F_LD = Finnish LD, CASE3B_LD = British LD)
combined <- bind_rows(
get_all_results("CASE1"), # Same ancestry, same platform
get_all_results("CASE2B"), # Same ancestry, different platform (British/Irish)
get_all_results("CASE2F"), # Same ancestry, different platform (Finnish)
get_all_results("CASE3"), # Different ancestry (British vs Finnish), Olink
get_all_results("CASE3F_LD"), # Same as CASE3, using Finnish LD
get_all_results("CASE3B_LD"), # Same as CASE3, using British LD
get_all_results("CASE4"), # Different ancestry (European vs Finnish), SomaScan
get_all_results("CASE4F_LD"), # Same as CASE4, using Finnish LD
get_all_results("CASE4B_LD") # Same as CASE4, using British LD
)
# Define list of colocalization methods and result categories:
# - C = Colocalized
# - NC = Not colocalized
# - IS = Insufficient
methods <- c("coloc", "susie", "propcoloc", "colocPropTest")
cat_abbr <- c("C", "NC", "IS")
colnames(combined) <- c("runID", paste0(rep(methods, rep(3, length(methods))), "-", cat_abbr))
rownames(combined) <- NULL
print(combined)
## runID coloc-C coloc-NC coloc-IS susie-C susie-NC
## 1 CASE1 0.95424837 0.04575163 0.00000000 1 0
## 2 CASE2B 0.6666667 0.3333333 0.0000000 0.83660131 0.09150327
## 3 CASE2F 0.8039216 0.1960784 0.0000000 0.71241830 0.03267974
## 4 CASE3 0.830065359 0.163398693 0.006535948 0.5751634 0.1503268
## 5 CASE3F_LD 0 0 1 0.7189542 0.0130719
## 6 CASE3B_LD 0 0 1 0.81045752 0.09150327
## 7 CASE4 0.790849673 0.202614379 0.006535948 0.6535948 0.1895425
## 8 CASE4F_LD 0 0 1 0.8104575 0.0130719
## 9 CASE4B_LD 0 0 1 0.89542484 0.07843137
## susie-IS propcoloc-C propcoloc-NC propcoloc-IS colocPropTest-C
## 1 0 0.875817 0.124183 0.000000 0.96732026
## 2 0.07189542 0.50326797 0.39869281 0.09803922 0.4509804
## 3 0.25490196 0.75816993 0.21568627 0.02614379 0.8627451
## 4 0.2745098 0 0 1 0
## 5 0.2679739 0.5555556 0.4313725 0.0130719 0.6470588
## 6 0.09803922 0.5555556 0.3333333 0.1111111 0.4313725
## 7 0.1568627 0 0 1 0
## 8 0.1764706 0.65359477 0.30065359 0.04575163 0.614379085
## 9 0.02614379 0.5032680 0.3856209 0.1111111 0.477124183
## colocPropTest-NC colocPropTest-IS
## 1 0.03267974 0.00000000
## 2 0.5490196 0.0000000
## 3 0.1372549 0.0000000
## 4 0 1
## 5 0.3529412 0.0000000
## 6 0.5686275 0.0000000
## 7 0 1
## 8 0.379084967 0.006535948
## 9 0.516339869 0.006535948
write.table(combined, "results/combined.txt", quote = F, row.names = F, col.names = T, sep = "\t")