library(tidyverse)
library(TCGAbiolinks)
library(SummarizedExperiment)
library(clusterProfiler)
library(DOSE)
library(AnnotationHub)
library(neo4r)
library(GO.db)
library(GOxploreR)
library(data.table)
library(annotate)
library(DT)
library(visNetwork)
The full pipeline analysis of the raw data can be found here. More info on the original project here
Raw data are first downloaded, pre-processsed, normalized and filtered. Then, the differential expression analysis is computed with the standard pipeline provided by the TCGAbiolinks package.
Alternatively, one can directly load the final result table in the next chunck .
To extract the annotations to the Gene Ontology, we are gonna use the AnnotationHub service, using the code to extract data for Homo Sapiens, key: “AH111575”
## AnnotationHub with 26629 records
## # snapshotDate(): 2023-04-25
## # $dataprovider: BroadInstitute, UCSC, Ensembl, GENCODE, UWashington, Stanfo...
## # $species: Homo sapiens, homo sapiens
## # $rdataclass: GRanges, BigWigFile, Rle, ChainFile, TwoBitFile, list, TxDb, ...
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
## # rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH5012"]]'
##
## title
## AH5012 | Chromosome Band
## AH5013 | STS Markers
## AH5014 | FISH Clones
## AH5015 | Recomb Rate
## AH5016 | ENCODE Pilot
## ... ...
## AH111393 | LRBaseDb for Homo sapiens (Human, v005)
## AH111495 | MeSHDb for Homo sapiens (Human, v005)
## AH111575 | org.Hs.eg.db.sqlite
## AH111585 | TxDb.Hsapiens.UCSC.hg38.knownGene.sqlite
## AH113665 | Ensembl 110 EnsDb for Homo sapiens
Compute an ORA analysis on the previously extracted list of differentially expression genes and Gene Ontology.
See ?enrichGO for details on the arguments.
We use the list of significantly differentially expressed genes and the whole list of protein coding genes in the assay as universe. No contrain applied on the size of GO terms sets.
Then, we plot !
ego <- enrichGO(gene = DE.set,
universe = universe,
OrgDb = HS.annotation,
ont = "BP",
keyType = "SYMBOL",
pAdjustMethod = "BH",
minGSSize = 1,
maxGSSize = 100000)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
Same as with GO, but with KEGG pathways.
Adapt the identifiers for KEGG: needs ENTREZID instead of gene SYMBOLS
# Mapping between Gene symbol and Entrez ID
universe2 <- bitr(universe, fromType="SYMBOL", toType = "ENTREZID", OrgDb = HS.annotation)
## Warning in bitr(universe, fromType = "SYMBOL", toType = "ENTREZID", OrgDb =
## HS.annotation): 1.95% of input gene IDs are fail to map...
universe2 <- universe2$ENTREZID
DE.set2 <- bitr(DE.set, fromType="SYMBOL", toType = "ENTREZID", OrgDb = HS.annotation)
## Warning in bitr(DE.set, fromType = "SYMBOL", toType = "ENTREZID", OrgDb =
## HS.annotation): 1.59% of input gene IDs are fail to map...
First, we are gonna define an helper functions in order to plot the hierarchical structure on the Gene Ontology in a dynamic vizualisation using VizNetwork
However, the whole graph is big, so let’s focus first on a particular sub-region: the “nuclear division” GO-term (GO:0000280)
The visualization of the DAG will allow you to appreciate the hierarchical relations between the enriched terms and identify the different levels of enrichment.
# Get ancestors and direct children
GO.term <- "GO:0000280"
GO_ancestors <- sapply(GO.term, function(x) {GOBPANCESTOR[[x]]})
GO_children <- sapply(GO.term, function(x) {GOBPCHILDREN[[x]]})
GO_hierarchy <- unique(c(GO.term, unlist(GO_ancestors), unlist(GO_children)))
# Map enrichment results
GO_hierarchy <- GO_hierarchy[! is.na(GO_hierarchy)]
# Prepare nodes
dom_graph <- prep_parents_graph(GO_hierarchy)
nodes <- dom_graph$nodes
fdrScalePal <- colorRampPalette(c('#E0F4FF', "#ff8829"))
fdrScale <- fdrScalePal(100)
# Prepare data for plot
nodes <- nodes %>%
left_join((ego@result %>% dplyr::select(ID, GeneRatio, BgRatio, qvalue)), by = c("id"="ID")) %>%
mutate(label = paste(label, paste0("GeneRatio=", GeneRatio), paste0("BgRatio=", BgRatio), paste0("qvalue=", signif(qvalue, 3)), sep = "\n"), fdrscale.val=-log10(qvalue)) %>%
replace_na(list(fdrscale.val=0)) %>%
mutate(color.background = fdrScale[cut(fdrscale.val, breaks = 100)]) %>%
dplyr::select(-c(GeneRatio, qvalue, fdrscale.val))
# Plot graph
plot <- visNetwork(nodes, dom_graph$edges, width="100%", height = 1000,
main=paste0("Explore GO BP enrichment in GO DAG ontology"),) %>%
visOptions(highlightNearest = list(enabled = TRUE, algorithm = "hierarchical"), selectedBy = "label") %>%
visPhysics(solver = "hierarchicalRepulsion", hierarchicalRepulsion = list(avoidOverlap = 1)) %>%
visHierarchicalLayout(direction = "UD", blockShifting=FALSE, nodeSpacing=200) %>%
visLegend(addEdges = dom_graph$leg_edges)
visSave(plot, "nuclear_division_dag_vizu.html")
Visualize the graph on the exported html document: nuclear_division_dag_vizu.html
Select only the GO terms enriched with a q.value \(< 1e-2\) and the union of all their ancestors.
Different regions of the DAG are enriched.
# Get ancestors
selected_go_bp <- ego@result[ego@result$qvalue <= 1e-2, ]$ID
all_ego_dag <- sapply(selected_go_bp, function(x) {GOBPANCESTOR[[x]]})
all_ego_dag <- unique(c(selected_go_bp, unlist(all_ego_dag)))
# Map enrichment results
all_ego_dag <- all_ego_dag[! is.na(all_ego_dag)]
# Prepare nodes
dom_graph <- prep_parents_graph(all_ego_dag)
nodes <- dom_graph$nodes
fdrScalePal <- colorRampPalette(c('#E0F4FF', "#ff8829"))
fdrScale <- fdrScalePal(100)
nodes <- nodes %>%
left_join((ego@result %>% dplyr::select(ID, GeneRatio, BgRatio, qvalue)), by = c("id"="ID")) %>%
mutate(label = paste(label, paste0("GeneRatio=", GeneRatio), paste0("BgRatio=", BgRatio), paste0("qvalue=", signif(qvalue, 3)), sep = "\n"), fdrscale.val=-log10(qvalue)) %>%
replace_na(list(fdrscale.val=0)) %>%
mutate(color.background = fdrScale[cut(fdrscale.val, breaks = 100)]) %>%
dplyr::select(-c(GeneRatio, qvalue, fdrscale.val))
# Plot graph
plot <- visNetwork(nodes, dom_graph$edges, width="100%", height = 1000,
main=paste0("Explore GO BP enrichment in GO DAG ontology"),) %>%
visOptions(highlightNearest = list(enabled = TRUE, algorithm = "hierarchical"), selectedBy = "label") %>%
visPhysics(solver = "hierarchicalRepulsion", hierarchicalRepulsion = list(avoidOverlap = 1)) %>%
visHierarchicalLayout(direction = "UD") %>%
visLegend(addEdges = dom_graph$leg_edges)
visSave(plot, "ego_dag_vizu.html")
Visualize the graph on the exported html document: ego_dag_vizu.html
Have a look at the GO-term “neurotransmitter transport” (GO:0006836)
To build the contingency table, we need:
# Get all annotation between NCBI Gene Symbol and GO-terms
all <- AnnotationDbi::select(HS.annotation, keytype="SYMBOL", keys=universe, columns="GOALL")
# Retrieve all GENE Symbols associated to neurotransmitter transport
table.GO.0006836 <- AnnotationDbi::select(HS.annotation, keytype="GOALL", keys="GO:0006836", columns="SYMBOL")
GO.0006836 <- unique(table.GO.0006836$SYMBOL)
# get the "real" universe size : remove all symbols without annotation to any BP from the universe
all_BP_universe <- all %>% filter(! is.na(ONTOLOGYALL)) %>% filter(ONTOLOGYALL == "BP")
manual_universe <- universe[universe %in% all_BP_universe$SYMBOL]
# get the "real" gene set size also by excluding genes without annotation to BP
manual_gene_set <- DE.set[DE.set %in% all_BP_universe$SYMBOL]
# Remove symbols associated to GO:0006836 that are not in our universe
GO.0006836 <- GO.0006836[GO.0006836 %in% manual_universe]
# Get intersection size
intersection <- sum(manual_gene_set %in% GO.0006836)
Building the contingency table:
L_GO_set <- length(GO.0006836)
L_DE_set <- length(manual_gene_set)
L_universe <- length(manual_universe)
contingency.table <- matrix(c(intersection, L_DE_set - intersection, L_GO_set - intersection, L_universe - intersection - (L_DE_set - intersection) - (L_GO_set - intersection)), ncol = 2, byrow = T)
f.test <- fisher.test(contingency.table, alternative = "greater")
print(f.test$p.value)
## [1] 0.002881322
We retrieve the exact p-value as computed by ClusterProfiler.
phyper(q = (intersection - 1), m = L_GO_set, n = L_universe - L_GO_set, k = L_DE_set, lower.tail = F)
## [1] 0.002881322
To better grasp what do these probabilities mean, we are gonna randomly sample \(k\) gene symbols from the universe, where \(k\) is our gene set size, and estimate our frequently we observed at least as much gene that overlap with the set of the selected GO term (GO:0006836).
# Draw 1000.000 sample of size L_DE_set from manual_universe
N_SAMPLES <- 1000000
samples <- vector(mode = "numeric", length = N_SAMPLES)
for(i in 1:N_SAMPLES){
s <- sample(size = L_DE_set, x = manual_universe, replace = F)
samples[i] <- sum(s %in% GO.0006836)
}
hist(x=samples, freq = F, breaks = seq(0,33,1))
## [1] "Estinate: 0.002864"
We get a good estimate of the previously observed p.values.
ego.no.bg <- enrichGO(gene = DE.set,
universe = NULL, # Don't precise the universe !
OrgDb = HS.annotation,
ont = "BP",
keyType = "SYMBOL",
pAdjustMethod = "BH",
minGSSize = 1,
maxGSSize = 100000)
# Align, order, select top 20 and plot
comparison.bg <- ego.no.bg@result %>% dplyr::select(ID, pvalue) %>% left_join((ego@result %>% dplyr::select(ID, pvalue)), by = "ID")
rownames(comparison.bg) <- NULL
colnames(comparison.bg) <- c("GO.ID", "unspecific.universe", "universe")
comparison.bg <- comparison.bg[order(comparison.bg$unspecific.universe, decreasing = F), ]
top.comparison.bg <- comparison.bg[1:20, ]
data.plot <- top.comparison.bg %>% pivot_longer(cols = c("unspecific.universe", "universe"))
data.plot$GO.ID <- factor(data.plot$GO.ID, levels = top.comparison.bg$GO.ID)
How do the p.values for the top-10 enriched GO-terms evolved ?
data.plot %>% ggplot(aes(x = GO.ID, y = -log10(value), fill = name)) + geom_bar(stat = "identity", position = "dodge") + theme_classic() + theme(axis.text.x = element_text(angle=90, hjust=1), axis.text = element_text(size = 25), axis.title = element_text(size = 22), legend.title = element_text(size=20), legend.text = element_text(size=20), legend.key.size = unit(2, 'cm'))
How much significantly enriched GO-terms are detected with the both universe?
print(paste("Number of enriched BP with assay-specific universe: ", nrow(ego@result[ego@result$qvalue <= 1e-3, ])))
## [1] "Number of enriched BP with assay-specific universe: 133"
print(paste("Number of enriched BP without assay-specific universe: ", nrow(ego.no.bg@result[ego.no.bg@result$qvalue <= 1e-3, ])))
## [1] "Number of enriched BP without assay-specific universe: 218"
The number of enriched GO terms is different.
The p.value is over-estimated.
There are two main other choices that can affect the enrichment results.
We are gonna test 6 combinations: (GO BP v.2013, GO BP v.2021, GO BP v.2023) \(\times\) (\(q.value < 0.1\), \(q.value < 0.01\))
# Loading different versions of the GO-BPs from EnrichR database.
path <- "./BP-archive/"
ontology_files <- c("GO_Biological_Process_2023", "GO_Biological_Process_2021", "GO_Biological_Process_2013") # , "GO_Biological_Process_2017",
# RETURN TERM2GENE and TERM2NAME
parse_ontology <- function(ontology_file){
terms <- c()
genes <- c()
names <- c()
for(line in read_lines(ontology_file)){
parsed_line <- str_split(line, pattern = "\t", simplify = T)
preprocessed_name <- parsed_line[1]
parsed_preprocessed_name <- str_split(preprocessed_name, pattern = "[()]", simplify = T)
name <- substr(parsed_preprocessed_name[1], 1, nchar(parsed_preprocessed_name[1]) - 1)
term <- parsed_preprocessed_name[length(parsed_preprocessed_name) - 1]
genes_list <- parsed_line[3:length(parsed_line)]
genes_list <- genes_list[genes_list != ""]
terms <- c(terms, rep(term, length(genes_list)))
genes <- c(genes, genes_list)
names <- c(names, name)
}
term2genes <- data.frame(TERM=terms, GENE=genes)
term2name <- data.frame(TERM=unique(terms), NAME = names)
return(list("term2gene" = term2genes, "term2name" = term2name))
}
threshold_vector <- list(c(1, 0.01), c(1, 0.1)) # c(1, 0.05),
out <- data.frame()
for(ontology_file in ontology_files){
ont <- parse_ontology(file.path(path, ontology_file))
for(thresholds in threshold_vector){
th_fc <- thresholds[1]
th_pv <- thresholds[2]
# Filtering data and extract symbols
DE.set.eval <- unique(data[abs(data$logFC) > th_fc & data$FDR < th_pv, ]$gene_name)
print(paste("Number of signficiant genes with logFC >", th_fc, "q.value <", th_pv, ":", length(DE.set.eval)))
ego.eval <- enricher(gene = DE.set.eval,
universe = universe,
TERM2GENE = ont$term2gene,
TERM2NAME = ont$term2name,
minGSSize = 1,
maxGSSize = 100000,
pAdjustMethod = "BH")
sub.out <- ego.eval@result
sub.out["threshold"] <- paste0("|LogFC| > ", th_fc, "; q.v < ", th_pv)
sub.out["ontology"] <- ontology_file
out <- rbind(out, sub.out)
}
}
## [1] "Number of signficiant genes with logFC > 1 q.value < 0.01 : 1068"
## [1] "Number of signficiant genes with logFC > 1 q.value < 0.1 : 2736"
## [1] "Number of signficiant genes with logFC > 1 q.value < 0.01 : 1068"
## [1] "Number of signficiant genes with logFC > 1 q.value < 0.1 : 2736"
## [1] "Number of signficiant genes with logFC > 1 q.value < 0.01 : 1068"
## [1] "Number of signficiant genes with logFC > 1 q.value < 0.1 : 2736"
reorder_within <- function(x, by, within1, within2, fun = mean, sep = "___", ...) {
new_x <- paste(x, paste0(within1, within2), sep = sep)
stats::reorder(new_x, by, FUN = fun)
}
scale_x_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
# Plot
out %>% dplyr::select(ID, Description, pvalue, threshold, ontology) %>%
group_by(threshold, ontology) %>%
slice(1:10) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(Description, -log10(pvalue), threshold, ontology), y = -log10(pvalue))) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_brewer(palette="Spectral") +
theme_classic() +
scale_x_reordered() +
theme(axis.text = element_text(size = 10)) +
facet_wrap(as.factor(threshold) ~ as.factor(ontology), scales="free_y")
The bias effects combine !
The enriched terms and their order are affected.
On the cutoff choices, there is a trade-off between noise and detection sensibility.
Maybe there is an other method that can prevent from imposing a threshold ?
We need a ranking metric: we choose the LogFC. But p.value, or a mixture of the both can also be used.
We set no constrains on the GO terms gene set sizes.
# apply gsea (with gene set permutation)
ordered.genes <- data %>% arrange(desc(logFC))
gsea.input <- ordered.genes$logFC
names(gsea.input) <- ordered.genes$gene_name
res.gsea.go <- gseGO(geneList = gsea.input,
ont = "BP",
OrgDb = HS.annotation,
keyType = "SYMBOL",
pAdjustMethod = "BH",
minGSSize = 1,
maxGSSize = 100000,
pvalueCutoff = 1)
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.04% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 8 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
dotplot(res.gsea.go, font.size=20, showCategory=10, split=".sign", x="GeneRatio", decreasing = TRUE, color="qvalue") +
scale_colour_gradient(low="red", high="blue", limits = c(1e-10, 0.01), guide=guide_colorbar(reverse=FALSE) ) +
facet_grid(. ~ .sign) +
theme(legend.key.size = unit(2, 'cm'), legend.text = element_text(size=25), legend.title = element_text(size=25))
monocarboxylic acid metabolic process (GO:0032787) which is suppressed.
neurotransmitter transport (GO:0000280) which is activated.
Visualize the GO-terms relationships around the two enriched GO terms.
# Get only ancestors
# selected_go_bp_gsea <- res.gsea.go@result[res.gsea.go@result$qvalue <= 1e-2, ]$ID
selected_go_bp_gsea <- c("GO:0000280", "GO:0032787")
all_ego_dag_gsea <- sapply(selected_go_bp_gsea, function(x) {GOBPANCESTOR[[x]]})
all_ego_dag_gsea <- unique(c(selected_go_bp_gsea, unlist(all_ego_dag_gsea)))
# Map enrichment results
all_ego_dag_gsea <- all_ego_dag_gsea[! is.na(all_ego_dag_gsea)]
# Prepare nodes
dom_graph <- prep_parents_graph(all_ego_dag_gsea)
nodes <- dom_graph$nodes
fdrScalePal <- colorRampPalette(c('#E0F4FF', "#ff8829"))
fdrScale <- fdrScalePal(100)
nodes <- nodes %>%
left_join((res.gsea.go@result %>% dplyr::select(ID, setSize, NES, qvalue)), by = c("id"="ID")) %>%
mutate(label = paste(label, paste0("setSize=", setSize), paste0("NES=", signif(NES, 3)), paste0("qvalue=", signif(qvalue, 3)), sep = "\n"), fdrscale.val=-log10(qvalue)) %>%
replace_na(list(fdrscale.val=0)) %>%
mutate(color.background = fdrScale[cut(fdrscale.val, breaks = 100)]) %>%
mutate(borderWidth = 3) %>%
mutate(color.border = ifelse(NES > 0, "red", "blue")) %>%
dplyr::select(-c(setSize, NES, fdrscale.val))
# Plot graph
plot <- visNetwork(nodes, dom_graph$edges, width="100%", height = 1000,
main=paste0("Explore GO BP enrichment in GO DAG ontology - GSEA"),) %>%
visOptions(highlightNearest = list(enabled = TRUE, algorithm = "hierarchical"), selectedBy = "label") %>%
visPhysics(solver = "hierarchicalRepulsion", hierarchicalRepulsion = list(avoidOverlap = 1)) %>%
visHierarchicalLayout(direction = "UD") %>%
visLegend(addEdges = dom_graph$leg_edges)
visSave(plot, "gsea_dag_vizu.html")
Visualize the graph on the exported html document: gsea_dag_vizu.html
Comparing significantly enriched GO terms by the both methods: \(q.value \le 1e-3\). We are gonna vizualise the differences in terms of enrichment directly on the DAG graph.
# Select enriched GO terms
selected_go_bp_gsea <- res.gsea.go@result[res.gsea.go@result$qvalue <= 1e-3, ]$ID
selected_go_bp_ora <- ego@result[ego@result$qvalue <= 1e-3, ]$ID
selected_go_bp_comparison <- unique(c(selected_go_bp_gsea, selected_go_bp_ora))
# Get their ancestors
all_ego_comparison <- sapply(selected_go_bp_comparison, function(x) {GOBPANCESTOR[[x]]})
all_ego_comparison <- unique(c(selected_go_bp_comparison, unlist(all_ego_comparison)))
# Map enrichment results
all_ego_comparison <- all_ego_comparison[! is.na(all_ego_comparison)]
# Prepare nodes
dom_graph <- prep_parents_graph(all_ego_comparison)
nodes <- dom_graph$nodes
# -1 = only GSEA, 0 = both, 1 = only ORA
nodes <- nodes %>%
mutate(category = ifelse(id %in% selected_go_bp_gsea, -1, 0) + ifelse(id %in% selected_go_bp_ora, 1, 0)) %>%
dplyr::select(-color.background) %>%
left_join(data.frame(category=c(-1, 0, 1), color.background=c("#cd69a7", "#7fbeaf", "#ee9b69")))
# set bg color to white for those simply not significants
nodes[! nodes$id %in% selected_go_bp_comparison, ]$color.background <- "white"
# Plot graph
plot <- visNetwork(nodes, dom_graph$edges, width="100%", height = 1000,
main=paste0("Explore GO BP enrichment in GO DAG ontology - GSEA"),) %>%
visOptions(highlightNearest = list(enabled = TRUE, algorithm = "hierarchical"), selectedBy = "label") %>%
visPhysics(solver = "hierarchicalRepulsion", hierarchicalRepulsion = list(avoidOverlap = 1)) %>%
visHierarchicalLayout(direction = "UD") %>%
visLegend(addEdges = dom_graph$leg_edges)
visSave(plot, "gsea_comp_ora_dag_vizu.html")
Visualize the graph on the exported html document: gsea_comp_ora_dag_vizu.html
While GSEA is much more sensible, there is a significant overlap between the both methods results.
The differences follow the hierarchy.
# build the request
query.1 = paste0("MATCH (d:Disease)-[r2:ASSOCIATES_DaG]->(g:Gene) WHERE g.name IN [",
paste0("'", paste0(filtered_data$gene_name, collapse = "' ,'"), "'"),
"] RETURN d.identifier, d.name")
query.2 = paste0("MATCH (c:Compound)-[r1:TREATS_CtD]->(d:Disease)-[r2:ASSOCIATES_DaG]->(g:Gene) WHERE g.name IN [",
paste0("'", paste0(filtered_data$gene_name, collapse = "' ,'"), "'"),
"] RETURN c.identifier, c.name")
To visualize the corresponding graph, go send the query and visualize the graph on the Neo4J browzer. But before, replace the return part of the query with : “RETURN c, r1, d, r2, g”
up.regualted <- filtered_data[filtered_data$logFC > 5, ]
query.4.2 = paste0("MATCH (c:Compound)-[r1:DOWNREGULATES_CdG]->(g:Gene)<-[r2:UPREGULATES_DuG]-(d:Disease) WHERE g.name IN [",
paste0("'", paste0(up.regualted$gene_name, collapse = "' ,'"), "'"),
"] AND d.name = 'breast cancer' RETURN c.identifier, c.name, g.name, d.identifier, d.name, r1, r2")
result.4.2 <- query.4.2 %>% call_neo4j(con)
d.table.2 <- data.frame(c.name = result.4.2$c.name$value, g.name = result.4.2$g.name$value, d.name = result.4.2$d.name$value, r1 = result.4.2$r1, r2 = result.4.2$r2)
See also the details on the relations.
What are the PharmacologicClass of drugs that targets the differentially expressed genes ?
query.bgset <- "MATCH (p:PharmacologicClass)-[r:INCLUDES_PCiC]->(c:Compound)-[r2]->(g:Gene) RETURN DISTINCT g.name, p.identifier, p.name"
result.bgset <- query.bgset %>% call_neo4j(con)
hetionet.TERM2GENE <- data.frame(TERM = result.bgset$p.identifier$value, GENE = result.bgset$g.name$value)
hetionet.TERM2NAME <- data.frame(TERM = result.bgset$p.identifier$value, NAME = result.bgset$p.name$value) %>% distinct()
modules.wgcna <- unique(data.WGCNA$moduleColor)
universe.wgcna <- unique(data.WGCNA$...1)
out <- data.frame()
ont <- "BP"
for(module in modules.wgcna){
set <- data.WGCNA %>% dplyr::filter(moduleColor == module)
ego.module <- enrichGO(gene = set$geneSymbol,
universe = universe.wgcna,
OrgDb = HS.annotation,
ont = ont,
keyType = "SYMBOL",
pAdjustMethod = "BH")
sub.out <- ego.module@result
sub.out["moduleColor"] <- module
out <- rbind(out, sub.out)
}
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
new_x <- paste(x, within, sep = sep)
stats::reorder(new_x, by, FUN = fun)
}
scale_x_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
cols <- c("black"="black", "blue"="#2448c9", "brown"="#a12f2f", "green"="#2cc937", "grey"="#706c6c", "pink"="#e346a4", "red"="#e0282b", "turquoise"="turquoise", "yellow"="#e0cb28")
# Colors
# Plot
out %>% dplyr::select(ID, Description, qvalue, moduleColor) %>%
group_by(moduleColor) %>%
slice(1:10) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(Description, -log10(qvalue), moduleColor), y = -log10(qvalue), fill=moduleColor)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_brewer(palette="Spectral") +
theme_classic() +
scale_x_reordered() +
scale_fill_manual(values = cols) +
theme(axis.text = element_text(size = 8)) +
facet_wrap(. ~ as.factor(moduleColor), scales="free_y") +
ggtitle(paste("WGCNA - ", ont))
modules.top.GO <- out %>% dplyr::select(ID, GeneRatio, BgRatio, qvalue, moduleColor) %>%
group_by(moduleColor) %>%
slice(1:10)
GO.modules <- unique(modules.top.GO$ID)
# Get all ancestors of each modules
GO.modules.ancestors <- sapply(GO.modules, function(x) {GOBPANCESTOR[[x]]})
GO.modules.hierarchy <- unique(c(GO.modules, unlist(GO.modules.ancestors)))
# Map enrichment results
GO.modules.hierarchy <- GO.modules.hierarchy[! is.na(GO.modules.hierarchy)]
# Prepare nodes
dom_graph <- prep_parents_graph(GO.modules.hierarchy)
nodes <- dom_graph$nodes
nodes <- nodes %>%
left_join((ego@result %>% dplyr::select(ID, GeneRatio, BgRatio, qvalue)), by = c("id"="ID")) %>%
mutate(label = paste(label, paste0("GeneRatio=", GeneRatio), paste0("BgRatio=", BgRatio), paste0("qvalue=", signif(qvalue, 3)), sep = "\n")) %>%
dplyr::select(-c(GeneRatio, qvalue)) %>%
left_join( (modules.top.GO %>% dplyr::select(ID, moduleColor)), by = c("id"="ID")) %>%
mutate(color.background = moduleColor) %>%
replace_na(list(color.background = "white"))
nodes[nodes$color.background == "black", ]$font.color <- "white"
# Plot graph
plot <- visNetwork(nodes, dom_graph$edges, width="100%", height = 1000,
main=paste0("Explore GO BP enrichment in GO DAG ontology"),) %>%
visOptions(highlightNearest = list(enabled = TRUE, algorithm = "hierarchical"), selectedBy = "label") %>%
visPhysics(solver = "hierarchicalRepulsion", hierarchicalRepulsion = list(avoidOverlap = 1)) %>%
visHierarchicalLayout(direction = "UD", blockShifting=FALSE, nodeSpacing=200) %>%
visLegend(addEdges = dom_graph$leg_edges)
visSave(plot, "WGCNA_dag_vizu.html")
Visualize the graph on the exported html document: WGCNA_dag_vizu.html
query.module = paste0("MATCH (g:Gene)-[r]->(g2:Gene) WHERE g.name IN [",
paste0("'", paste0(data.WGCNA[data.WGCNA$moduleColor == "black", ]$geneSymbol, collapse = "' ,'"), "'"),
"] AND g2.name IN [",
paste0("'", paste0(data.WGCNA[data.WGCNA$moduleColor == "black", ]$geneSymbol, collapse = "' ,'"), "'"),
"] RETURN g, r, g2")
In this requests, we extracted:
Therefore, these Fatty Acyls compounds associated with fish oils are related to the same therapeutic effects (Cardiovascular Agents or Hematologic Agents) as some compounds used as treatment for Raynaud’s Disease. Here we retrieve for instance Eicosapentaenoic acid (EPA), which indeed have the same therapeutic effect (Platelet Aggregation Inhibitors) as PGE1 or PGI2 which are used to treat Raynaud’s disease.
See more in the reference article.
DEFINE input:inference "schema-inference-rules"
select distinct (strafter(STR(?other_compound),"http://rdf.ncbi.nlm.nih.gov/pubchem/compound/CID")
as ?CID)
from <https://forum.semantic-metabolomics.org/ClassyFire/direct-parent/2020>
from <https://forum.semantic-metabolomics.org/EnrichmentAnalysis/CID_MESH/2021>
from <https://forum.semantic-metabolomics.org/ChemOnt/2016-08-27>
from <https://forum.semantic-metabolomics.org/MeSHRDF/2022-01-04>
where
{
{
select ?other_compound
where
{
mesh:D011928 skos:related ?RD_c .
?RD_c skos:related ?mesh_TU .
?mesh_TU (meshv:treeNumber|meshv:treeNumber/meshv:parentTreeNumber+) ?tn .
?mesh_selected meshv:treeNumber ?tn .
VALUES ?mesh_selected { mesh:D002317 mesh:D006401 }
?other_compound skos:related ?mesh_TU .
?other_compound skos:related mesh:D005395 .
?other_compound a chemont:0003909 .
}
}
FILTER NOT EXISTS {?other_compound skos:related mesh:D011928}
}
SELECT ?gene ?ensembl_geme_id ?entrez_id ?civic_id ?hgnc_id ?OMID_id ?mesh_id WHERE {
?gene wdt:P31 wd:Q7187 ;
wdt:P703 wd:Q15978631 .
OPTIONAL { ?gene wdt:P594 ?ensembl_geme_id . }
OPTIONAL { ?gene wdt:P351 ?entrez_id . }
OPTIONAL { ?gene wdt:P11277 ?civic_id . }
OPTIONAL { ?gene wdt:P354 ?hgnc_id . }
OPTIONAL { ?gene wdt:P492 ?OMID_id . }
OPTIONAL { ?gene wdt:P486 ?mesh_id . }
}
LIMIT 1000