{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "831c3dbe-e06b-4fbe-94d8-1336aff37b49",
   "metadata": {},
   "source": [
    "```\n",
    "---\n",
    "title: Preprocessing for Enrichment analysis of proteins based on their GeneOntology annotations\n",
    "tags: GeneOntology, UniProt, enrichmentAnalysis, SPARQL\n",
    "lang: en\n",
    "version: 1.0\n",
    "date: 2026-03-30\n",
    "---\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "341d7454-25db-4f4b-80d0-41878043c3c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "#import graphviz\n",
    "import gzip\n",
    "#import importlib\n",
    "import IPython\n",
    "#import json\n",
    "#import math\n",
    "#import matplotlib.pyplot as plt\n",
    "#import networkx as nx\n",
    "import os\n",
    "import pandas as pd\n",
    "#import rdflib\n",
    "#import rdflib.namespace\n",
    "#import scipy.stats as stats\n",
    "#import sparqldataframe\n",
    "from SPARQLWrapper import SPARQLWrapper, JSON, POST\n",
    "import sparqldataframe\n",
    "#import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "af982919-04b1-4fd9-be7a-94ce11413130",
   "metadata": {},
   "outputs": [],
   "source": [
    "goaEndpointURL = \"http://localhost:3030/goa/query\"\n",
    "#goEndpointURL = \"http://localhost:3030/go/query\"\n",
    "uniprotEndpointURL = \"http://sparql.uniprot.org/sparql/\"\n",
    "rdfFormat = \"turtle\"\n",
    "\n",
    "taxon = {\"id\":\"9606\", \"name\":\"human\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9ba9cfe4-76e6-43d1-9814-606bade11b36",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Namespace declarations prepended to every SPARQL query sent by the cells below.\n",
    "prefixes = \"\"\"\n",
    "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
    "PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>\n",
    "PREFIX owl: <http://www.w3.org/2002/07/owl#>\n",
    "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
    "PREFIX dc: <http://purl.org/dc/elements/1.1/>\n",
    "PREFIX dcterms: <http://purl.org/dc/terms/>\n",
    "PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>\n",
    "\n",
    "\n",
    "PREFIX taxon: <http://purl.uniprot.org/taxonomy/>\n",
    "PREFIX uniprot: <http://purl.uniprot.org/uniprot/>\n",
    "PREFIX up:<http://purl.uniprot.org/core/>\n",
    "\n",
    "PREFIX go: <http://purl.obolibrary.org/obo/GO_>\n",
    "PREFIX goavoc: <http://bio2rdf.org/goa_vocabulary:>\n",
    "\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "892af2e8-34e3-43c5-81af-43418bb10267",
   "metadata": {},
   "source": [
    "# 1. Retrieve GO and GOA"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1fa2c4a-5224-47a7-8119-5123e8db4cef",
   "metadata": {},
   "source": [
    "- Gene Ontology: retrieve `go.owl` from https://purl.obolibrary.org/obo/go.owl (use `wget https://purl.obolibrary.org/obo/go.owl` from the command line)\n",
    "- GOA annotations\n",
    "    - clone https://gitlab.com/odameron/goa2rdf\n",
    "    - run `retrieveGeneOntologyAnnotations_ebi.bash` from the command line\n",
    "- Evidence and Conclusion Ontology (ECO): retrieve `eco.owl` from https://raw.githubusercontent.com/evidenceontology/evidenceontology/master/eco.owl (use `wget https://raw.githubusercontent.com/evidenceontology/evidenceontology/master/eco.owl` from the command line)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "34eafac8-2b62-4464-8dc3-f96a939a1351",
   "metadata": {},
   "source": [
    "# 2. Set up a local SPARQL endpoint"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "425ed548-1357-4e40-80fd-7d300ed0d3c0",
   "metadata": {},
   "source": [
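    "- install Apache Jena Fuseki (instructions at https://gitlab.com/odameron/fusekiInstallationUsage)\n",
    "- in a terminal, run `${FUSEKI_HOME}/fuseki-server --file ontologies/goa_human.gaf.ttl --file ontologies/go-latest.owl --file ontologies/evidenceCode.owl /goa` (each `--file` path must point to the corresponding file obtained in section 1; adjust the paths if your local file names differ)"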
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69cd8eb1-ea18-48d6-991b-56ca53f495fd",
   "metadata": {},
   "source": [
    "# 3. Preprocess GO and GOA"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aa188eda-42f6-4af7-8f30-7503236e7fb8",
   "metadata": {},
   "source": [
    "## 3.1 Retrieve GO terms and labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5a3268a-030d-4612-8d21-78f8dcff9e86",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export every non-deprecated GO class found below the three ?goRoot terms,\n",
    "# with a single label per class, as a gzipped two-column TSV (class IRI, label).\n",
    "query = \"\"\"\n",
    "SELECT DISTINCT ?goIdent (SAMPLE(?goIdentLabel) AS ?goLabel)\n",
    "WHERE {\n",
    "  VALUES ?goRoot { go:0008150 go:0005575 go:0003674 }\n",
    "  ?goIdent rdfs:subClassOf* ?goRoot .\n",
    "  ?goIdent rdf:type owl:Class .\n",
    "  ?goIdent rdfs:label ?goIdentLabel .\n",
    "  FILTER (STRSTARTS(STR(?goIdent), \"http://purl.obolibrary.org/obo/GO_\") )\n",
    "  FILTER NOT EXISTS {\n",
    "    ?goIdent owl:deprecated \"true\"^^xsd:boolean .\n",
    "  }\n",
    "}\n",
    "GROUP BY ?goIdent\n",
    "\"\"\"\n",
    "sparql = SPARQLWrapper(goaEndpointURL)\n",
    "sparql.setQuery(prefixes + query)\n",
    "sparql.setReturnFormat(JSON)\n",
    "# Query first: if the request fails, the exception is raised before the output\n",
    "# file is opened, so an existing result file is not truncated.\n",
    "results = sparql.queryAndConvert()\n",
    "\n",
    "# Explicit UTF-8 so the written bytes do not depend on the platform's default encoding.\n",
    "with gzip.open(\"results/GeneOntology_label.tsv.gz\", \"wt\", encoding=\"utf-8\") as destFile:\n",
    "    for result in results[\"results\"][\"bindings\"]:\n",
    "        destFile.write(result[\"goIdent\"][\"value\"] + \"\\t\" + result[\"goLabel\"][\"value\"] + \"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e0496e9b-99ed-4fe9-9269-922b5559d38d",
   "metadata": {},
   "source": [
    "## 3.2 Retrieve GO hierarchy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "669256b8-1c05-4da5-8fc2-891e7313623a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the direct rdfs:subClassOf pairs between non-deprecated GO classes\n",
    "# as a gzipped two-column TSV (class IRI, parent IRI).\n",
    "query = \"\"\"\n",
    "SELECT DISTINCT ?goIdent ?goParentIdent\n",
    "WHERE {\n",
    "  VALUES ?goRoot { go:0008150 go:0005575 go:0003674 }\n",
    "  ?goIdent rdfs:subClassOf* ?goRoot .\n",
    "  ?goIdent rdf:type owl:Class .\n",
    "  FILTER (STRSTARTS(STR(?goIdent), \"http://purl.obolibrary.org/obo/GO_\") )\n",
    "  FILTER NOT EXISTS {\n",
    "    ?goIdent owl:deprecated \"true\"^^xsd:boolean .\n",
    "  }\n",
    "  ?goIdent rdfs:subClassOf ?goParentIdent .\n",
    "  ?goParentIdent rdf:type owl:Class .\n",
    "  FILTER (STRSTARTS(STR(?goParentIdent), \"http://purl.obolibrary.org/obo/GO_\") )\n",
    "  FILTER NOT EXISTS {\n",
    "    ?goParentIdent owl:deprecated \"true\"^^xsd:boolean .\n",
    "  }\n",
    "}\n",
    "\"\"\"\n",
    "sparql = SPARQLWrapper(goaEndpointURL)\n",
    "sparql.setQuery(prefixes + query)\n",
    "sparql.setReturnFormat(JSON)\n",
    "# Query first: if the request fails, the exception is raised before the output\n",
    "# file is opened, so an existing result file is not truncated.\n",
    "results = sparql.queryAndConvert()\n",
    "\n",
    "# Explicit UTF-8 so the written bytes do not depend on the platform's default encoding.\n",
    "with gzip.open(\"results/GeneOntology_parents.tsv.gz\", \"wt\", encoding=\"utf-8\") as destFile:\n",
    "    for result in results[\"results\"][\"bindings\"]:\n",
    "        destFile.write(result[\"goIdent\"][\"value\"] + \"\\t\" + result[\"goParentIdent\"][\"value\"] + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59704ef8-da03-44ff-99f9-5f47c6e8434f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the transitive rdfs:subClassOf+ pairs between non-deprecated GO classes\n",
    "# as a gzipped two-column TSV (class IRI, ancestor IRI).\n",
    "query = \"\"\"\n",
    "SELECT DISTINCT ?goIdent ?goAncestorIdent\n",
    "WHERE {\n",
    "  VALUES ?goRoot { go:0008150 go:0005575 go:0003674 }\n",
    "  ?goIdent rdfs:subClassOf* ?goRoot .\n",
    "  ?goIdent rdf:type owl:Class .\n",
    "  FILTER (STRSTARTS(STR(?goIdent), \"http://purl.obolibrary.org/obo/GO_\") )\n",
    "  FILTER NOT EXISTS {\n",
    "    ?goIdent owl:deprecated \"true\"^^xsd:boolean .\n",
    "  }\n",
    "  ?goIdent rdfs:subClassOf+ ?goAncestorIdent .\n",
    "  ?goAncestorIdent rdf:type owl:Class .\n",
    "  FILTER (STRSTARTS(STR(?goAncestorIdent), \"http://purl.obolibrary.org/obo/GO_\") )\n",
    "  FILTER NOT EXISTS {\n",
    "    ?goAncestorIdent owl:deprecated \"true\"^^xsd:boolean .\n",
    "  }\n",
    "}\n",
    "\"\"\"\n",
    "sparql = SPARQLWrapper(goaEndpointURL)\n",
    "sparql.setQuery(prefixes + query)\n",
    "sparql.setReturnFormat(JSON)\n",
    "# Query first: if the request fails, the exception is raised before the output\n",
    "# file is opened, so an existing result file is not truncated.\n",
    "results = sparql.queryAndConvert()\n",
    "\n",
    "# Explicit UTF-8 so the written bytes do not depend on the platform's default encoding.\n",
    "with gzip.open(\"results/GeneOntology_ancestors.tsv.gz\", \"wt\", encoding=\"utf-8\") as destFile:\n",
    "    for result in results[\"results\"][\"bindings\"]:\n",
    "        destFile.write(result[\"goIdent\"][\"value\"] + \"\\t\" + result[\"goAncestorIdent\"][\"value\"] + \"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71aac625-8d06-4ac0-8e94-94891cda50e0",
   "metadata": {},
   "source": [
    "## 3.3 Retrieve proteins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ccd9ecff-5dc0-44d3-b522-ec058ad0481d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the rdfs:label of every up:Protein whose up:organism is the configured taxon\n",
    "# as a gzipped two-column TSV (protein IRI, label).\n",
    "# Doubled braces in the query are literal braces; the single {} receives taxon[\"id\"].\n",
    "query = \"\"\"\n",
    "SELECT DISTINCT ?protein ?proteinLabel\n",
    "WHERE {{\n",
    "  ?protein rdf:type up:Protein .\n",
    "  ?protein up:organism taxon:{} .\n",
    "  ?protein rdfs:label ?proteinLabel .\n",
    "}}\n",
    "\"\"\".format(taxon[\"id\"])\n",
    "sparql = SPARQLWrapper(goaEndpointURL)\n",
    "sparql.setQuery(prefixes + query)\n",
    "sparql.setReturnFormat(JSON)\n",
    "# Query first: if the request fails, the exception is raised before the output\n",
    "# file is opened, so an existing result file is not truncated.\n",
    "results = sparql.queryAndConvert()\n",
    "\n",
    "# Explicit UTF-8 so the written bytes do not depend on the platform's default encoding.\n",
    "with gzip.open(\"results/\" + taxon[\"name\"] + \"_Proteins_label.tsv.gz\", \"wt\", encoding=\"utf-8\") as destFile:\n",
    "    for result in results[\"results\"][\"bindings\"]:\n",
    "        destFile.write(result[\"protein\"][\"value\"] + \"\\t\" + result[\"proteinLabel\"][\"value\"] + \"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd04a79d-2e97-48e0-aabb-7a80f1c28695",
   "metadata": {},
   "source": [
    "## 3.4 Retrieve the GO annotations of proteins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43c65f7c-dbda-42a6-bbde-25ab2f7f9af1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export, for every up:Protein of the configured taxon, its GO annotations together with\n",
    "# all their superclasses (rdfs:subClassOf*), restricted to non-deprecated GO classes,\n",
    "# as a gzipped two-column TSV (protein IRI, GO class IRI).\n",
    "# Doubled braces in the query are literal braces; the single {} receives taxon[\"id\"].\n",
    "query = \"\"\"\n",
    "SELECT DISTINCT ?protein ?annotation\n",
    "WHERE {{\n",
    "  ?protein rdf:type up:Protein .\n",
    "  ?protein up:organism taxon:{} .\n",
    "  ?protein goavoc:process|goavoc:component|goavoc:function ?annotationDirect .\n",
    "  ?annotationDirect rdfs:subClassOf* ?annotation .\n",
    "  ?annotation rdf:type owl:Class .\n",
    "  FILTER (STRSTARTS(STR(?annotation), \"http://purl.obolibrary.org/obo/GO_\") )\n",
    "  FILTER NOT EXISTS {{\n",
    "    ?annotation owl:deprecated \"true\"^^xsd:boolean .\n",
    "  }}\n",
    "}}\n",
    "\"\"\".format(taxon[\"id\"])\n",
    "sparql = SPARQLWrapper(goaEndpointURL)\n",
    "sparql.setQuery(prefixes + query)\n",
    "sparql.setReturnFormat(JSON)\n",
    "# Query first: if the request fails, the exception is raised before the output\n",
    "# file is opened, so an existing result file is not truncated.\n",
    "results = sparql.queryAndConvert()\n",
    "\n",
    "# Explicit UTF-8 so the written bytes do not depend on the platform's default encoding.\n",
    "with gzip.open(\"results/\" + taxon[\"name\"] + \"_Proteins_annotations.tsv.gz\", \"wt\", encoding=\"utf-8\") as destFile:\n",
    "    for result in results[\"results\"][\"bindings\"]:\n",
    "        destFile.write(result[\"protein\"][\"value\"] + \"\\t\" + result[\"annotation\"][\"value\"] + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
