This RMarkdown document demonstrates how key elements from the notebook for case study 1 in the EpiGraphDB paper can be achieved using the R package. For detailed explanations of the case study please refer to the paper or the case study notebook.
Mendelian randomization (MR), a technique to evaluate the causal role of modifiable exposures on a health outcome using a set of genetic instruments, tacitly assumes that a genetic variant (e.g. SNP) is only related to an outcome of interest through an exposure (i.e. the “exclusion restriction criterion”). Horizontal pleiotropy, where a SNP is associated with multiple phenotypes independently of the exposure of interest, potentially violates this assumption delivering misleading conclusions. In contrast, vertical pleiotropy, where a SNP is associated with multiple phenotypes on the same biological pathway, does not violate this assumption.
Here, we use external evidence on biological pathways and protein-protein interactions to assess the pleiotropic profile for a genetic variant based on the genes (and proteins) with which that variant is associated. In a graph representation, we will show that:
This case study goes as follows:
library("magrittr")
library("dplyr")
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library("purrr")
#>
#> Attaching package: 'purrr'
#> The following object is masked from 'package:magrittr':
#>
#> set_names
library("glue")
#>
#> Attaching package: 'glue'
#> The following object is masked from 'package:dplyr':
#>
#> collapse
library("igraph")
#>
#> Attaching package: 'igraph'
#> The following objects are masked from 'package:purrr':
#>
#> compose, simplify
#> The following objects are masked from 'package:dplyr':
#>
#> as_data_frame, groups, union
#> The following objects are masked from 'package:stats':
#>
#> decompose, spectrum
#> The following object is masked from 'package:base':
#>
#> union
library("epigraphdb")
#>
#> EpiGraphDB v0.3 (API: https://api.epigraphdb.org)
#>
Here we configure the parameters used in the case study example. In this example we will look at the variant rs12720356, a SNP located in chromosome 19 that has been associated with Crohn’s disease and psoriasis.**
"rs12720356"
SNP <-
c("ZGLP1", "FDX1L", "MRPL4", "ICAM5", "TYK2", "GRAP2", "KRI1", "TMED1", "ICAM1")
GENELIST <-
1 PPI_N_INTERMEDIATE_PROTEINS <-
The first step of the analysis is to map each of these genes to their protein product.
function(genelist) {
get_gene_protein <- "/mappings/gene-to-protein"
endpoint <- list(
params <-gene_name_list = genelist %>% I()
) query_epigraphdb(
r <-route = endpoint,
params = params,
mode = "table",
method = "POST"
) r
protein_df <-if (nrow(protein_df) > 0) {
protein_df %>%
res_df <- select(protein_name = `gene.name`, uniprot_id = `protein.uniprot_id`)
else {
} tibble() %>% set_names(c("protein_name", "uniprot_id"))
res_df <-
}
res_df
}
get_gene_protein(genelist = GENELIST)
gene_protein_df <-
gene_protein_df#> # A tibble: 9 x 2
#> protein_name uniprot_id
#> <chr> <chr>
#> 1 TMED1 Q13445
#> 2 MRPL4 Q9BYD3
#> 3 TYK2 P29597
#> 4 KRI1 Q8N9T8
#> 5 FDX1L Q6P4F2
#> 6 GRAP2 O75791
#> 7 ICAM1 P05362
#> 8 ICAM5 Q9UMF0
#> 9 ZGLP1 P0C6A0
For each protein we retrieve the pathways they are involved in.
function(gene_protein_df) {
get_protein_pathway <- "/protein/in-pathway"
endpoint <- list(
params <-uniprot_id_list = gene_protein_df %>% pull(`uniprot_id`) %>% I()
) query_epigraphdb(route = endpoint, params = params, mode = "table", method = "POST")
df <-
if (nrow(df) > 0) {
gene_protein_df %>%
res_df <- select(`uniprot_id`) %>%
left_join(df, by = c("uniprot_id"))
else {
} gene_protein_df %>%
res_df <- select(`uniprot_id`) %>%
mutate(pathway_count = NA_integer_, pathway_reactome_id = NA_character_)
} res_df %>%
res_df <- mutate(
pathway_count = ifelse(is.na(pathway_count), 0L, as.integer(pathway_count)),
pathway_reactome_id = ifelse(is.na(pathway_reactome_id), c(), pathway_reactome_id)
)
res_df
} get_protein_pathway(gene_protein_df = gene_protein_df)
pathway_df <-
pathway_df#> # A tibble: 9 x 3
#> uniprot_id pathway_count pathway_reactome_id
#> <chr> <int> <list>
#> 1 Q13445 0 <NULL>
#> 2 Q9BYD3 0 <NULL>
#> 3 P29597 3 <chr [3]>
#> 4 Q8N9T8 0 <NULL>
#> 5 Q6P4F2 1 <chr [1]>
#> 6 O75791 3 <chr [3]>
#> 7 P05362 5 <chr [5]>
#> 8 Q9UMF0 2 <chr [2]>
#> 9 P0C6A0 0 <NULL>
Now for each pair of proteins we match the pathways they have in common.
function(pathway_df) {
get_shared_pathway <-# For the protein-pathway data
# Get protein-protein permutations where they share pathways
function(pathway_df, permutation) {
per_permutation <- pathway_df %>% filter(uniprot_id %in% permutation)
df <- pathway_df %>%
primary_pathway <- filter(uniprot_id == permutation[1]) %>%
pull(pathway_reactome_id) %>%
unlist()
pathway_df %>%
assoc_pathway <- filter(uniprot_id == permutation[2]) %>%
pull(pathway_reactome_id) %>%
unlist()
intersect(primary_pathway, assoc_pathway)
}
pathway_df %>%
pairwise_permutations <- pull(`uniprot_id`) %>%
gtools::permutations(n = length(.), r = 2, v = .)
tibble(
shared_pathway_df <-protein = pairwise_permutations[, 1],
assoc_protein = pairwise_permutations[, 2]
%>%
) mutate(
shared_pathway = map2(`protein`, `assoc_protein`, function(x, y) {
per_permutation(pathway_df = pathway_df, permutation = c(x, y))
}),combination = map2_chr(`protein`, `assoc_protein`, function(x, y) {
sort(c(x, y))
comb <-paste(comb, collapse = ",")
}),count = map_int(`shared_pathway`, function(x) na.omit(x) %>% length()),
connected = count > 0
)
shared_pathway_df
} get_shared_pathway(pathway_df)
shared_pathway_df <- length(shared_pathway_df %>% filter(count > 0))
n_pairs <-print(glue::glue("Num. shared_pathway pairs: {n_pairs}"))
#> Num. shared_pathway pairs: 6
%>% arrange(desc(count))
shared_pathway_df #> # A tibble: 72 x 6
#> protein assoc_protein shared_pathway combination count connected
#> <chr> <chr> <list> <chr> <int> <lgl>
#> 1 P05362 Q9UMF0 <chr [2]> P05362,Q9UMF0 2 TRUE
#> 2 Q9UMF0 P05362 <chr [2]> P05362,Q9UMF0 2 TRUE
#> 3 P05362 P29597 <chr [1]> P05362,P29597 1 TRUE
#> 4 P29597 P05362 <chr [1]> P05362,P29597 1 TRUE
#> 5 O75791 P05362 <chr [0]> O75791,P05362 0 FALSE
#> 6 O75791 P0C6A0 <NULL> O75791,P0C6A0 0 FALSE
#> 7 O75791 P29597 <chr [0]> O75791,P29597 0 FALSE
#> 8 O75791 Q13445 <NULL> O75791,Q13445 0 FALSE
#> 9 O75791 Q6P4F2 <chr [0]> O75791,Q6P4F2 0 FALSE
#> 10 O75791 Q8N9T8 <NULL> O75791,Q8N9T8 0 FALSE
#> # … with 62 more rows
We can further query EpiGraphDB regarding the detailed pathway information using GET /meta/nodes/Pathway/search
.
function(reactome_id) {
get_pathway_info <- "/meta/nodes/Pathway/search"
endpoint <- list(id = reactome_id)
params <- query_epigraphdb(route = endpoint, params = params, mode = "table")
df <-
df
}
shared_pathway_df %>%
pathway <- pull(shared_pathway) %>%
unlist() %>%
unique()
pathway %>% map_df(get_pathway_info)
pathway_info <-%>% print()
pathway_info #> # A tibble: 3 x 3
#> node.in_disease node.name node.reactome_id
#> <chr> <chr> <chr>
#> 1 FALSE Interleukin-4 and Interleukin-13 signaling R-HSA-6785807
#> 2 FALSE Integrin cell surface interactions R-HSA-216083
#> 3 FALSE Immunoregulatory interactions between a Lymp… R-HSA-198933
In order to extract protein groups from the shared pathways, the last step for this query is to convert the shared pathway data into a graph where
We then count the number of nodes in each connected community and plot the graph.
function(df) {
protein_df_to_graph <- df %>%
df_connected <- filter(connected) %>%
distinct(`combination`, .keep_all = TRUE)
df %>%
nodes <- pull(protein) %>%
unique()
igraph::graph_from_data_frame(
graph <-
df_connected,directed = FALSE, vertices = nodes
)$layout <- igraph::layout_with_kk
graph
graph
}
function(graph) {
graph_to_protein_groups <-%>%
graph igraph::components() %>%
igraph::groups() %>%
tibble(group_member = .) %>%
mutate(group_size = map_int(`group_member`, length)) %>%
arrange(desc(group_size))
}
shared_pathway_df %>% protein_df_to_graph()
pathway_protein_graph <- pathway_protein_graph %>% graph_to_protein_groups()
pathway_protein_groups <-%>% str()
pathway_protein_groups #> tibble [7 × 2] (S3: tbl_df/tbl/data.frame)
#> $ group_member:List of 7
#> ..$ 2: chr [1:3] "P05362" "P29597" "Q9UMF0"
#> ..$ 1: chr "O75791"
#> ..$ 3: chr "P0C6A0"
#> ..$ 4: chr "Q13445"
#> ..$ 5: chr "Q6P4F2"
#> ..$ 6: chr "Q8N9T8"
#> ..$ 7: chr "Q9BYD3"
#> ..- attr(*, "dim")= int 7
#> ..- attr(*, "dimnames")=List of 1
#> .. ..$ : chr [1:7] "2" "1" "3" "4" ...
#> $ group_size : Named int [1:7] 3 1 1 1 1 1 1
#> ..- attr(*, "names")= chr [1:7] "2" "1" "3" "4" ...
plot(pathway_protein_graph)
sessionInfo
sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Manjaro Linux
#>
#> Matrix products: default
#> BLAS: /usr/lib/libblas.so.3.9.0
#> LAPACK: /usr/lib/liblapack.so.3.9.0
#>
#> locale:
#> [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] epigraphdb_0.2.1 igraph_1.2.5 glue_1.4.1 purrr_0.3.4
#> [5] dplyr_1.0.2 magrittr_1.5
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.4.6 knitr_1.28 tidyselect_1.1.0 R6_2.4.1
#> [5] rlang_0.4.7 fansi_0.4.1 stringr_1.4.0 httr_1.4.2
#> [9] tools_4.0.2 xfun_0.14 utf8_1.1.4 cli_2.0.2
#> [13] gtools_3.8.2 htmltools_0.4.0 ellipsis_0.3.1 assertthat_0.2.1
#> [17] yaml_2.2.1 digest_0.6.25 tibble_3.0.3 lifecycle_0.2.0
#> [21] crayon_1.3.4 vctrs_0.3.2 curl_4.3 evaluate_0.14
#> [25] rmarkdown_2.1 stringi_1.4.6 compiler_4.0.2 pillar_1.4.6
#> [29] generics_0.0.2 jsonlite_1.7.0 pkgconfig_2.0.3