Initialization & Package Loading

To install the required packages, run the following in your terminal:

conda env create -f env.yml
conda activate pQTL_comparison

If synapser fails to install, fix the Python path with:

export RETICULATE_PYTHON=~/miniconda3/envs/pQTL_comparison/bin/python
# Load project-specific functions
source("utils.R")

# devtools::install_github("ash-res/prop-coloc")
suppressMessages(library(prop.coloc))
# install.packages("colocPropTest", dependencies = T)
suppressMessages(library(colocPropTest))
# devtools::install_github("kassambara/easyGgplot2")
suppressMessages(library(easyGgplot2))
# remotes::install_github("daynefiler/gnomadR")
suppressMessages(library(gnomadR))

suppressMessages(library(reticulate))
# install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org"))
suppressMessages(library(synapser))
# install.packages("synapserutils", repos = c("http://ran.synapse.org", "https://cloud.r-project.org"))
suppressMessages(library(synapserutils))

# Constants
BASE_DATA_DIR <- "data/RData"
WINDOW_SIZE <- 50000

Download and process FinnGen summary data

The pQTL summary statistics used in this study were obtained from the FinnGen Data Freeze 10 release. To download the summary statistics, refer to the official FinnGen portal: https://www.finngen.fi/en/access_results. Specifically, follow the instructions under the section: “How to access the summary statistics”

# wget https://storage.googleapis.com/finngen-public-data-r10/omics/proteomics/release_2023_03_02/data/Olink/probe_map.tsv -O data/FinnGen/pQTL/Olink/probe_map.tsv
# wget https://storage.googleapis.com/finngen-public-data-r10/omics/proteomics/release_2023_03_02/data/Somascan/probe_map.tsv -O data/FinnGen/pQTL/Somascan/probe_map.tsv
Finn_olink <- read.delim("data/FinnGen/pQTL/Olink/probe_map.tsv")
Finn_somascan <- read.delim("data/FinnGen/pQTL/Somascan/probe_map.tsv")

Finn_merged <- merge(Finn_olink, Finn_somascan, by = c("geneName", "chr", "start", "end", "chr2"))
Finn_merged <- Finn_merged[!duplicated(Finn_merged$geneName), ]
genes_FinnGen <- Finn_merged$geneName

for (gene_of_interest in genes_FinnGen) {
  filepath_gwas_processed <- file.path(BASE_DATA_DIR, "FinnGen", paste0(gene_of_interest, "_processed.RDS"))

  if (!file.exists(filepath_gwas_processed)) {
    # message("Processing ", gene_of_interest)
    data_FinnGen <- list()

    for (platform in c("Somascan", "Olink")) {
      data_FinnGen[[platform]] <- load_pQTL_Finngen(gene_of_interest, platform, WINDOW_SIZE)
    }
    ensure_dir(dirname(filepath_gwas_processed))
    saveRDS(data_FinnGen, file = filepath_gwas_processed)
  }
}

Download and process UKBB summary data

Synapse_login()
## NULL
UKBB_info <- Synapse_download_data()

for (idx in 1:nrow(UKBB_info)) {
  gene_of_interest <- UKBB_info[idx, "genename"]
  filepath_gwas_processed <- file.path(BASE_DATA_DIR, "UKBB", paste0(gene_of_interest, "_processed.RDS"))

  if (!file.exists(filepath_gwas_processed)) {
    # message("Processing ", gene_of_interest)
    target_id <- UKBB_info[idx, "id"]
    filename_tar <- UKBB_info[idx, "name"]

    data_UKBB <- list()
    data_UKBB[["Olink"]] <- load_pQTL_UKBB(target_id, filename_tar, gene_of_interest, WINDOW_SIZE)
    ensure_dir(dirname(filepath_gwas_processed))
    saveRDS(data_UKBB, file = filepath_gwas_processed)
  }
}

Download and process EGA summary data

# This file was obtained from the following URL: https://www.ebi.ac.uk/gwas/publications/29875488.
EGA_info <- read.delim("data/EGA/PMID29875488_studies_export.tsv")

# Example reportedTrait: "Protein levels (XYZ.P12345)" → genename = "XYZ"
EGA_info <- EGA_info %>%
  mutate(genename = sub(".*\\(([^.]+)\\..*\\)", "\\1", reportedTrait))

# Subset to genes present in both FinnGen and UKBB datasets
EGA_subset <- EGA_info %>%
  filter(genename %in% intersect(genes_FinnGen, UKBB_info$genename)) %>%
  filter(!duplicated(genename))

for (idx in 1:nrow(EGA_subset)) {
  gene_of_interest <- EGA_subset[idx, "genename"]
  filepath_gwas_processed <- file.path(BASE_DATA_DIR, "EGA", paste0(gene_of_interest, "_processed.RDS"))
  if (!file.exists(filepath_gwas_processed)) {
    # message("Processing ", gene_of_interest)
    # Extract accession ID and summary statistics download URL
    accessionId <- EGA_subset[idx, "accessionId"]
    addr <- EGA_subset[idx, "summaryStatistics"]
    filepath <- paste0(addr, "/harmonised/", accessionId, ".h.tsv.gz")

    data_EGA <- list()
    data_EGA[["Somascan"]] <- load_pQTL_EGA(gene_of_interest, filepath, WINDOW_SIZE)
    ensure_dir(dirname(filepath_gwas_processed))
    saveRDS(data_EGA, file = filepath_gwas_processed)
  }
}

Gene filtering step

  • Step 1: Identify genes that are common across all three datasets.
  • Step 2: Exclude MHC and X chromosome genes from the common gene list.
  • Step 3: Further filter to include only genes with at least one significant SNP (p < 1e-8)
# Step 1: Identify genes that are common across all three datasets.
# Only genes present in all datasets are considered for downstream analysis.
genes_common <- Reduce(intersect, list(UKBB_info$genename, EGA_info$genename, genes_FinnGen))

# Define coordinates of the MHC (Major Histocompatibility Complex) region on chromosome 6.
# Genes in this region are excluded due to complex LD structure that confounds colocalization.
# Source: https://www.ncbi.nlm.nih.gov/grc/human/regions/MHC
MHC <- list(CHR = 6, start = 28510120, end = 33480577)

gene_regions <- do.call(rbind, lapply(genes_common, function(g) as.data.frame(get_gene_region_GRCh38_UKBB(g, WINDOW_SIZE))))
# Identify genes that fall within the MHC region (to be excluded).
genes_in_MHC <- genes_common[gene_regions$CHR == MHC$CHR & gene_regions$locus_lower > MHC$start & gene_regions$locus_upper < MHC$end]
# Identify genes located on the X chromosome (to be excluded).
# Reason: UKBB LD reference is not available for chromosome X.
genes_Xchr <- genes_common[gene_regions$CHR == "X"]

# Step 2: Exclude MHC and X chromosome genes from the common gene list.
filtered_genes <- setdiff(genes_common, genes_in_MHC)  # Remove MHC genes (1132 remaining)
filtered_genes <- setdiff(filtered_genes, genes_Xchr)  # Remove X chr genes (1100 remaining)

# Step 3: Further filter to include only genes with at least one significant SNP (p < 1e-8)
genes_final <- NULL
for (gene_of_interest in filtered_genes) {
  data_FinnGen <- readRDS(file.path(BASE_DATA_DIR, "FinnGen", paste0(gene_of_interest, "_processed.RDS")))
  data_UKBB <- readRDS(file.path(BASE_DATA_DIR, "UKBB", paste0(gene_of_interest, "_processed.RDS")))
  data_EGA <- readRDS(file.path(BASE_DATA_DIR, "EGA", paste0(gene_of_interest, "_processed.RDS")))

  # check_significant_hits() returns TRUE if there is at least one SNP with p-value < pval_cutoff in the specified dataset and platform
  if (check_significant_hits(data_FinnGen, gene_of_interest, "Olink") &&
    check_significant_hits(data_FinnGen, gene_of_interest, "Somascan") &&
    check_significant_hits(data_UKBB, gene_of_interest, "Olink") &&
    check_significant_hits(data_EGA, gene_of_interest, "Somascan")) {
    genes_final <- c(genes_final, gene_of_interest)
  }
}
print(length(genes_final))
## [1] 153

Run colocalization analysis for each case

  • CASE 1: Same ancestry, same platform
  • CASE 2: Finnish data (same ancestry, different platform)
  • CASE 2: British/Irish data (same ancestry, different platform)
  • CASE 3: Different ancestry, same platform (Olink)
  • CASE 4: Different ancestry, same platform (SomaScan)
for (gene_of_interest in genes_final) {
  # cat("Processing gene:", gene_of_interest, "\n")
  # Load pre-processed pQTL summary statistics for each gene from multiple sources:
  # - FinnGen (Finnish ancestry)
  # - UK Biobank (British/Irish ancestry)
  # - EGA (European ancestry summary-level data)
  data_FinnGen <- readRDS(file.path(BASE_DATA_DIR, "FinnGen", paste0(gene_of_interest, "_processed.RDS")))
  data_UKBB <- readRDS(file.path(BASE_DATA_DIR, "UKBB", paste0(gene_of_interest, "_processed.RDS")))
  data_EGA <- readRDS(file.path(BASE_DATA_DIR, "EGA", paste0(gene_of_interest, "_processed.RDS")))

  # CASE 1: Same ancestry (British/Irish), same platform
  # Baseline consistency check using data from UK Biobank (Olink vs Olink within-UKBB)
  data_CASE1 <- readRDS(file.path(BASE_DATA_DIR, "CASE1", paste0(gene_of_interest, "_processed.RDS")))
  # CASE 2: Same ancestry, different platforms
  # - data_CASE2F: Finnish individuals measured with one platform (e.g., FinnGen)
  # - data_CASE2B: British/Irish individuals measured with both Olink and SomaScan
  data_CASE2F <- data_FinnGen
  data_CASE2B <- list(
    Olink = data_UKBB$Olink,
    Somascan = data_EGA$Somascan
  )
  # CASE 3: Different ancestries, same platform (Olink)
  # Comparing Finnish (FinnGen) and British/Irish (UKBB) Olink-based pQTLs
  data_CASE3 <- list(
    FinnGen = data_FinnGen$Olink,
    UKBB = data_UKBB$Olink
  )
  # CASE 4: Different ancestries, same platform (SomaScan)
  # Comparing Finnish (FinnGen) and European (EGA) SomaScan pQTLs
  data_CASE4 <- list(
    FinnGen = data_FinnGen$Somascan,
    EGA = data_EGA$Somascan
  )

  # Run colocalization analysis for each case using the appropriate LD reference
  # - LD_type: either a single LD source (e.g., "UKBB") or a pair (e.g., c("UKBB", "FinnGen"))
  run_colocalization_analysis("CASE1", gene_of_interest, WINDOW_SIZE, data_CASE1, LD_type = "UKBB") # CASE 1: Same ancestry, same platform
  run_colocalization_analysis("CASE2F", gene_of_interest, WINDOW_SIZE, data_CASE2F, LD_type = "FinnGen") # CASE 2: Finnish data (same ancestry, different platform)
  run_colocalization_analysis("CASE2B", gene_of_interest, WINDOW_SIZE, data_CASE2B, LD_type = "UKBB") # CASE 2: British/Irish data (same ancestry, different platform)
  run_colocalization_analysis("CASE3", gene_of_interest, WINDOW_SIZE, data_CASE3, LD_type = c("FinnGen", "UKBB")) # CASE 3: Different ancestry, same platform (Olink)
  run_colocalization_analysis("CASE4", gene_of_interest, WINDOW_SIZE, data_CASE4, LD_type = c("FinnGen", "UKBB")) # CASE 4: Different ancestry, same platform (SomaScan)
}

Combine results

# Combine colocalization results from all comparison scenarios (CASE1–4),
# including separate LD reference runs (e.g., CASE3F_LD = Finnish LD, CASE3B_LD = British LD)
combined <- bind_rows(
  get_all_results("CASE1"),        # Same ancestry, same platform
  get_all_results("CASE2B"),       # Same ancestry, different platform (British/Irish)
  get_all_results("CASE2F"),       # Same ancestry, different platform (Finnish)
  get_all_results("CASE3"),        # Different ancestry (British vs Finnish), Olink
  get_all_results("CASE3F_LD"),    # Same as CASE3, using Finnish LD
  get_all_results("CASE3B_LD"),    # Same as CASE3, using British LD
  get_all_results("CASE4"),        # Different ancestry (European vs Finnish), SomaScan
  get_all_results("CASE4F_LD"),    # Same as CASE4, using Finnish LD
  get_all_results("CASE4B_LD")     # Same as CASE4, using British LD
)

# Define list of colocalization methods and result categories:
# - C = Colocalized
# - NC = Not colocalized
# - IS = Insufficient
methods <- c("coloc", "susie", "propcoloc", "colocPropTest")
cat_abbr <- c("C", "NC", "IS")
colnames(combined) <- c("runID", paste0(rep(methods, rep(3, length(methods))), "-", cat_abbr))
rownames(combined) <- NULL
print(combined)
##       runID     coloc-C    coloc-NC    coloc-IS    susie-C   susie-NC
## 1     CASE1  0.95424837  0.04575163  0.00000000          1          0
## 2    CASE2B   0.6666667   0.3333333   0.0000000 0.83660131 0.09150327
## 3    CASE2F   0.8039216   0.1960784   0.0000000 0.71241830 0.03267974
## 4     CASE3 0.830065359 0.163398693 0.006535948  0.5751634  0.1503268
## 5 CASE3F_LD           0           0           1  0.7189542  0.0130719
## 6 CASE3B_LD           0           0           1 0.81045752 0.09150327
## 7     CASE4 0.790849673 0.202614379 0.006535948  0.6535948  0.1895425
## 8 CASE4F_LD           0           0           1  0.8104575  0.0130719
## 9 CASE4B_LD           0           0           1 0.89542484 0.07843137
##     susie-IS propcoloc-C propcoloc-NC propcoloc-IS colocPropTest-C
## 1          0    0.875817     0.124183     0.000000      0.96732026
## 2 0.07189542  0.50326797   0.39869281   0.09803922       0.4509804
## 3 0.25490196  0.75816993   0.21568627   0.02614379       0.8627451
## 4  0.2745098           0            0            1               0
## 5  0.2679739   0.5555556    0.4313725    0.0130719       0.6470588
## 6 0.09803922   0.5555556    0.3333333    0.1111111       0.4313725
## 7  0.1568627           0            0            1               0
## 8  0.1764706  0.65359477   0.30065359   0.04575163     0.614379085
## 9 0.02614379   0.5032680    0.3856209    0.1111111     0.477124183
##   colocPropTest-NC colocPropTest-IS
## 1       0.03267974       0.00000000
## 2        0.5490196        0.0000000
## 3        0.1372549        0.0000000
## 4                0                1
## 5        0.3529412        0.0000000
## 6        0.5686275        0.0000000
## 7                0                1
## 8      0.379084967      0.006535948
## 9      0.516339869      0.006535948
write.table(combined, "results/combined.txt", quote = F, row.names = F, col.names = T, sep = "\t")