{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "061a47dc-9b82-4117-966a-ca585da7fb93",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import os, sys\n",
    "import pandas as pd\n",
    "from rdflib import Graph, Literal, RDF, URIRef\n",
    "from SPARQLWrapper import SPARQLWrapper, JSON"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6058ab39-567a-4608-8693-939422e13c08",
   "metadata": {},
   "source": [
    "# 1. Uniprot KG\n",
    "\n",
    "Here is an example of the UniProt data schema: \n",
    "\n",
    "![:scale 50%](fig/uniprot-PPI.png)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bea4c102-feb4-482c-899a-305c80a964b6",
   "metadata": {},
   "source": [
    "# 2. Which resource is named \"At3g17840\"\n",
    "You can use the following SPARQL prefixes as shortcuts for URLs:\n",
    "```\n",
    "    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
    "    PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>\n",
    "    PREFIX owl: <http://www.w3.org/2002/07/owl#>\n",
    "    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
    "    PREFIX dc: <http://purl.org/dc/elements/1.1/>\n",
    "    PREFIX dcterms: <http://purl.org/dc/terms/>\n",
    "    \n",
    "    PREFIX taxon: <http://purl.uniprot.org/taxonomy/>\n",
    "    PREFIX uniprot: <http://purl.uniprot.org/uniprot/>\n",
    "    PREFIX up:<http://purl.uniprot.org/core/>\n",
    "```\n",
    "\n",
    "Remember the form of a SPARQL query: `SELECT ?x ?y ... WHERE { ... }`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7fcd5078-4a03-458f-9b07-6ffe1e2e858c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exercise: write your SPARQL query between the triple quotes.\n",
    "# The loop at the end of this cell prints the ?x binding of every result row.\n",
    "uniprot_query = \"\"\"\n",
    "    \n",
    "\"\"\"\n",
    "\n",
    "# Configure the UniProt endpoint client (JSON results), then run the query.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "sparql.setQuery(uniprot_query)\n",
    "results = sparql.query().convert()\n",
    "\n",
    "bindings = results[\"results\"][\"bindings\"]\n",
    "for row in bindings:\n",
    "    print(row[\"x\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4a33bbe-5b3e-4809-b34b-5394eabf07a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SOLUTION:\n",
    "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CgogICAgU0VMRUNUID94IFdIRVJFIHsKICAgICAgICA/eCB1cDpsb2N1c05hbWUgIkF0M2cxNzg0MCIgLgogICAgfQo=\" | base64 --decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d46c35ce-618a-4fc5-882b-7cfcac62ad37",
   "metadata": {},
   "source": [
    "# 3. What is the type of this resource ?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f843fe2-f5ca-4cb0-9eaf-7f25260c95f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exercise: write your SPARQL query between the triple quotes.\n",
    "# The loop at the end of this cell prints the ?r and ?type bindings of every row.\n",
    "uniprot_query = \"\"\"\n",
    "    \n",
    "\"\"\"\n",
    "\n",
    "# Configure the UniProt endpoint client (JSON results), then run the query.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "sparql.setQuery(uniprot_query)\n",
    "results = sparql.query().convert()\n",
    "\n",
    "bindings = results[\"results\"][\"bindings\"]\n",
    "for row in bindings:\n",
    "    print(row[\"r\"][\"value\"], row[\"type\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a04315e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import base64\n",
    "\n",
    "# Base64-encode the text currently held in uniprot_query and show it as the\n",
    "# cell output. NOTE(review): presumably used to produce the encoded strings of\n",
    "# the SOLUTION cells - confirm.\n",
    "query_bytes = uniprot_query.encode(\"utf-8\")\n",
    "encoded = base64.b64encode(query_bytes)\n",
    "str(encoded, \"utf-8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbd59cef-b16f-49da-840f-64693e00c024",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SOLUTION:\n",
    "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICAKICAgIFNFTEVDVCA/ciA/dHlwZSBXSEVSRSB7CiAgICAgICAgP3IgdXA6bG9jdXNOYW1lICJBdDNnMTc4NDAiIC4KICAgICAgICA/ciByZGY6dHlwZSA/dHlwZSAuCiAgICB9IAo=\" | base64 --decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e5ca7e6-08dc-4b39-8e7e-6315f2c0a93c",
   "metadata": {},
   "source": [
    "# 4. Which proteins are encoded by this gene ?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53e603f8-3172-4b8f-869e-625c077c2952",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exercise: write your SPARQL query between the triple quotes.\n",
    "# The loop at the end of this cell prints the ?P1 binding of every result row.\n",
    "uniprot_query = \"\"\"\n",
    "    \n",
    "\"\"\"\n",
    "\n",
    "# Configure the UniProt endpoint client (JSON results), then run the query.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "sparql.setQuery(uniprot_query)\n",
    "results = sparql.query().convert()\n",
    "\n",
    "bindings = results[\"results\"][\"bindings\"]\n",
    "for row in bindings:\n",
    "    print(row[\"P1\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f6354c6-dc11-4be9-a653-cb7f249d9042",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# SOLUTION:\n",
    "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICAKICAgIFNFTEVDVCA/UDEgV0hFUkUgewogICAgICAgID9nZW5lMSB1cDpsb2N1c05hbWUgIkF0M2cxNzg0MCIgLgogICAgICAgID9QMSB1cDplbmNvZGVkQnkgP2dlbmUxIC4KICAgIH0gCg==\" | base64 --decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ef07672",
   "metadata": {},
   "source": [
    "# 5. What are the GO annotations for these proteins?\n",
    "- use the \"up:classifiedWith\" property to get the GO terms associated with a protein \n",
    "- use the \"rdfs:label\" property to get the label of the GO term\n",
    "- manually check the GO term hierarchy with QuickGO, e.g. https://www.ebi.ac.uk/QuickGO/GTerm?id=GO:0004672, or https://www.ebi.ac.uk/QuickGO/GTerm?id=GO:0005829 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ba6d8cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exercise: write your SPARQL query between the triple quotes.\n",
    "# The loop at the end of this cell prints the ?P1, ?goTermLabel and ?goTerm\n",
    "# bindings of every result row.\n",
    "uniprot_query = \"\"\"\n",
    "    \n",
    "\"\"\"\n",
    "\n",
    "# Configure the UniProt endpoint client (JSON results), then run the query.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "sparql.setQuery(uniprot_query)\n",
    "results = sparql.query().convert()\n",
    "\n",
    "bindings = results[\"results\"][\"bindings\"]\n",
    "for row in bindings:\n",
    "    print(row[\"P1\"][\"value\"], row[\"goTermLabel\"][\"value\"], row[\"goTerm\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9110af1",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# SOLUTION:\n",
    "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICBQUkVGSVggZ286IDxodHRwOi8vcHVybC5vYm9saWJyYXJ5Lm9yZy9vYm8vR09fPgogICAgCiAgICBTRUxFQ1QgP1AxID9nb1Rlcm1MYWJlbCA/Z29UZXJtICBXSEVSRSB7CiAgICAgICAgP2dlbmUxIHVwOmxvY3VzTmFtZSAiQXQzZzE3ODQwIiAuCiAgICAgICAgP1AxIHVwOmVuY29kZWRCeSA/Z2VuZTEgLgogICAgICAgID9QMSB1cDpjbGFzc2lmaWVkV2l0aCA/Z29UZXJtIC4KICAgICAgICA/Z29UZXJtIHJkZnM6bGFiZWwgP2dvVGVybUxhYmVsIC4KICAgIH0gCg==\" | base64 --decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1607848",
   "metadata": {},
   "source": [
    "# 5.1 Filter only proteins with GO terms associated with a molecular function\n",
    "- Use the ` ?x rdfs:subClassOf* go:0003674 .` property path expression to ensure that the GO term is a descendant of the \"molecular function\" root term (GO:0003674).\n",
    "- Don't forget to declare the `go` prefix in your query (`PREFIX go: <http://purl.obolibrary.org/obo/GO_>`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5b0d1c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exercise: write your SPARQL query between the triple quotes.\n",
    "# The loop at the end of this cell prints the ?P1, ?goTermLabel and ?goTerm\n",
    "# bindings of every result row.\n",
    "uniprot_query = \"\"\"\n",
    "    \n",
    "\"\"\"\n",
    "\n",
    "# Configure the UniProt endpoint client (JSON results), then run the query.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "sparql.setQuery(uniprot_query)\n",
    "results = sparql.query().convert()\n",
    "\n",
    "bindings = results[\"results\"][\"bindings\"]\n",
    "for row in bindings:\n",
    "    print(row[\"P1\"][\"value\"], row[\"goTermLabel\"][\"value\"], row[\"goTerm\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f1dade1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SOLUTION:\n",
    "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICBQUkVGSVggZ286IDxodHRwOi8vcHVybC5vYm9saWJyYXJ5Lm9yZy9vYm8vR09fPgogICAgCiAgICBTRUxFQ1QgP1AxID9nb1Rlcm1MYWJlbCA/Z29UZXJtICBXSEVSRSB7CiAgICAgICAgP2dlbmUxIHVwOmxvY3VzTmFtZSAiQXQzZzE3ODQwIiAuCiAgICAgICAgP1AxIHVwOmVuY29kZWRCeSA/Z2VuZTEgLgogICAgICAgID9QMSB1cDpjbGFzc2lmaWVkV2l0aCA/Z29UZXJtIC4KICAgICAgICA/Z29UZXJtIHJkZnM6c3ViQ2xhc3NPZiogZ286MDAwMzY3NCAuCiAgICAgICAgP2dvVGVybSByZGZzOmxhYmVsID9nb1Rlcm1MYWJlbCAuCiAgICB9IAo=\" | base64 --decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9231ae93",
   "metadata": {},
   "source": [
    "# 5.2 Display all the GO annotations for these proteins, including the label of the GO term \n",
    "- Use the `rdfs:label` property to get the label of the GO term\n",
    "- Use again the `rdfs:subClassOf*` property path expression to enumerate the whole hierarchy of GO terms under the \"molecular function\" root term (GO:0003674).  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3e3f6a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exercise: write your SPARQL query between the triple quotes.\n",
    "# The loop at the end of this cell prints the ?P1, ?parentGOLabel and ?parentGO\n",
    "# bindings of every result row.\n",
    "uniprot_query = \"\"\"\n",
    "    \n",
    "\"\"\"\n",
    "\n",
    "# Configure the UniProt endpoint client (JSON results), then run the query.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "sparql.setQuery(uniprot_query)\n",
    "results = sparql.query().convert()\n",
    "\n",
    "bindings = results[\"results\"][\"bindings\"]\n",
    "for row in bindings:\n",
    "    print(row[\"P1\"][\"value\"], row[\"parentGOLabel\"][\"value\"], row[\"parentGO\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bd1660d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SOLUTION:\n",
    "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICBQUkVGSVggZ286IDxodHRwOi8vcHVybC5vYm9saWJyYXJ5Lm9yZy9vYm8vR09fPgogICAgCiAgICBTRUxFQ1QgRElTVElOQ1QgP1AxID9wYXJlbnRHT0xhYmVsID9wYXJlbnRHTyAgV0hFUkUgewogICAgICAgID9nZW5lMSB1cDpsb2N1c05hbWUgIkF0M2cxNzg0MCIgLgogICAgICAgID9QMSB1cDplbmNvZGVkQnkgP2dlbmUxIC4KICAgICAgICA/UDEgdXA6Y2xhc3NpZmllZFdpdGggP2dvVGVybSAuCiAgICAgICAgP2dvVGVybSByZGZzOnN1YkNsYXNzT2YqIGdvOjAwMDM2NzQgLgogICAgICAgID9nb1Rlcm0gcmRmczpsYWJlbCA/Z29UZXJtTGFiZWwgLgogICAgICAgIAogICAgICAgID9nb1Rlcm0gcmRmczpzdWJDbGFzc09mKiA/cGFyZW50R08gLgogICAgICAgID9wYXJlbnRHTyByZGZzOmxhYmVsID9wYXJlbnRHT0xhYmVsIC4KICAgIH0gCg==\" | base64 --decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39fd0412",
   "metadata": {},
   "source": [
    "# 5.3 Identify target genes for a given GO term (\"angiogenesis\" `go:0001525`) and its descendants\n",
    "- Use the `rdfs:subClassOf*` property path expression to get all the GO terms that are descendants of \"angiogenesis\".\n",
    "- Use the `up:classifiedWith` property to get all the proteins annotated with these GO terms.\n",
    "- Use the `up:encodedBy` property to get the genes encoding these proteins.\n",
    "- Use the `skos:prefLabel` property to get the gene name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51191259",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exercise: write your SPARQL query between the triple quotes.\n",
    "# The loop at the end of this cell prints the ?G_label and ?goTermLabel\n",
    "# bindings of every result row.\n",
    "uniprot_query = \"\"\"\n",
    "    \n",
    "\"\"\"\n",
    "\n",
    "# Configure the UniProt endpoint client (JSON results), then run the query.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "sparql.setQuery(uniprot_query)\n",
    "results = sparql.query().convert()\n",
    "\n",
    "bindings = results[\"results\"][\"bindings\"]\n",
    "for row in bindings:\n",
    "    print(row[\"G_label\"][\"value\"], row[\"goTermLabel\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "104ab3fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SOLUTION:\n",
    "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICBQUkVGSVggZ286IDxodHRwOi8vcHVybC5vYm9saWJyYXJ5Lm9yZy9vYm8vR09fPgogICAgUFJFRklYIHNrb3M6IDxodHRwOi8vd3d3LnczLm9yZy8yMDA0LzAyL3Nrb3MvY29yZSM+CiAgICAKICAgIFNFTEVDVCBESVNUSU5DVCA/R19sYWJlbCA/Z29UZXJtTGFiZWwgV0hFUkUgewogICAgICAgID9nb1Rlcm0gcmRmczpzdWJDbGFzc09mKiBnbzowMDAxNTI1IC4KICAgICAgICA/Z29UZXJtIHJkZnM6bGFiZWwgP2dvVGVybUxhYmVsIC4KICAgICAgICA/UCB1cDpjbGFzc2lmaWVkV2l0aCA/Z29UZXJtIC4KICAgICAgICA/UCB1cDplbmNvZGVkQnkgP0cgLgoKICAgICAgICA/RyBza29zOnByZWZMYWJlbCA/R19sYWJlbCAuCiAgICB9IAo=\" | base64 --decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dcd2b205-dfe7-4be5-b26c-c56a2939acf5",
   "metadata": {},
   "source": [
    "# 6. Protein-Protein Interactions\n",
    "## Which proteins are interacting with \"At3g17840\" ?\n",
    "- use the \"up:interaction\" property to get the interacting proteins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d12f989a-b0e0-47e1-b793-454b0df73bfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exercise: write your SPARQL query between the triple quotes.\n",
    "# The loop at the end of this cell prints the ?P1 and ?P2 bindings of every row.\n",
    "uniprot_query = \"\"\"\n",
    "    \n",
    "\"\"\"\n",
    "\n",
    "# Configure the UniProt endpoint client (JSON results), then run the query.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "sparql.setQuery(uniprot_query)\n",
    "results = sparql.query().convert()\n",
    "\n",
    "bindings = results[\"results\"][\"bindings\"]\n",
    "for row in bindings:\n",
    "    print(row[\"P1\"][\"value\"], row[\"P2\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ab6cf75-cfca-4561-9cad-69c4741ae389",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SOLUTION: run this cell to decode and print the reference query.\n",
    "# The encoded query quotes the locus name literal (\"At3g17840\"), as SPARQL requires.\n",
    "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICAKICAgIFNFTEVDVCA/UDEgP1AyIFdIRVJFIHsKICAgICAgICA/Z2VuZTEgdXA6bG9jdXNOYW1lICJBdDNnMTc4NDAiIC4gCiAgICAgICAgP1AxIHVwOmVuY29kZWRCeSA/Z2VuZTEgLgogICAgCiAgICAgICAgP1AxIHVwOmludGVyYWN0aW9uID9pIC4KICAgICAgICA/UDIgdXA6aW50ZXJhY3Rpb24gP2kgLgogICAgICAgIAogICAgICAgIEZJTFRFUiAoP1AyICE9ID9QMSkKICAgIH0gCg==\" | base64 --decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5296b82e-52f2-44e7-8b7a-44ac52f5d8fd",
   "metadata": {},
   "source": [
    "# 6.1 How many experiments support these interactions ?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "082f54c3-a051-48da-b443-1d2d3d8c59af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exercise: write your SPARQL query between the triple quotes.\n",
    "# The loop at the end of this cell prints the ?gene2_label and ?nb_expe\n",
    "# bindings of every result row.\n",
    "uniprot_query = \"\"\"\n",
    "    \n",
    "\"\"\"\n",
    "\n",
    "# Configure the UniProt endpoint client (JSON results), then run the query.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "sparql.setQuery(uniprot_query)\n",
    "results = sparql.query().convert()\n",
    "\n",
    "bindings = results[\"results\"][\"bindings\"]\n",
    "for row in bindings:\n",
    "    print(row[\"gene2_label\"][\"value\"], row[\"nb_expe\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83ddcd5a-cc54-4337-83b4-bd534d390a71",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SOLUTION:\n",
    "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICAKICAgIFNFTEVDVCA/Z2VuZTJfbGFiZWwgP25iX2V4cGUgV0hFUkUgewogICAgICAgID9nZW5lMSB1cDpsb2N1c05hbWUgIkF0M2cxNzg0MCIgLgogICAgICAgID9QMSB1cDplbmNvZGVkQnkgP2dlbmUxIC4KICAgIAogICAgICAgID9QMSB1cDppbnRlcmFjdGlvbiA/aSAuCiAgICAgICAgP1AyIHVwOmludGVyYWN0aW9uID9pIC4KICAgICAgICA/aSB1cDpleHBlcmltZW50cyA/bmJfZXhwZSAuCiAgICAgICAgP1AxIHVwOm1uZW1vbmljID9QMV9sYWJlbCAuCiAgICAgICAgP1AyIHVwOm1uZW1vbmljID9QMl9sYWJlbCAuCiAgICAgICAgRklMVEVSICg/UDIgIT0gP1AxKQogICAgICAgID9QMiB1cDplbmNvZGVkQnkgP2dlbmUyIC4KICAgICAgICA/Z2VuZTIgdXA6bG9jdXNOYW1lID9nZW5lMl9sYWJlbCAuCiAgICB9IE9SREVSIEJZIERFU0MoP25iX2V4cGUpCg==\" | base64 --decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19b70866-3806-43fb-88e0-c8acc66c293a",
   "metadata": {},
   "source": [
    "# 7. Automating the whole process to generate a PPI network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50e78e1a-00fc-47c0-be1d-38f64bc751ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the tab-separated results table, then preview its first 10 rows.\n",
    "wallomics = pd.read_csv(\n",
    "    \"data/wallomics1_results_prot.tsv\",\n",
    "    sep=\"\\t\",\n",
    ")\n",
    "wallomics.head(n=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6063e344-5dac-46c3-b448-be148210b940",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip installs into the environment of the running kernel, which a plain\n",
    "# shell `!pip` call does not guarantee.\n",
    "%pip install tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f687926-1db0-4a52-8358-8284ee4913b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from SPARQLWrapper import SPARQLWrapper, JSON, TURTLE\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "# Limit to the first 100 genes for testing. Missing names are dropped so\n",
    "# that .capitalize() below is only ever called on actual values.\n",
    "genes = wallomics[\"Gene\"].head(100).dropna().tolist()\n",
    "\n",
    "# Not filled by this cell; kept so the name stays defined as before.\n",
    "with_ppi = []\n",
    "# One record per interaction found: {\"g1\": queried gene, \"g2\": partner gene,\n",
    "# \"nb_expe\": value of up:experiments for the interaction, as an int}.\n",
    "interacting_genes = []\n",
    "\n",
    "# Query template; %s is replaced by the locus name of the queried gene.\n",
    "PPI_QUERY_TEMPLATE = \"\"\"\n",
    "    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
    "    PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>\n",
    "    PREFIX owl: <http://www.w3.org/2002/07/owl#>\n",
    "    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
    "    PREFIX dc: <http://purl.org/dc/elements/1.1/>\n",
    "    PREFIX dcterms: <http://purl.org/dc/terms/>\n",
    "    \n",
    "    PREFIX taxon: <http://purl.uniprot.org/taxonomy/>\n",
    "    PREFIX uniprot: <http://purl.uniprot.org/uniprot/>\n",
    "    PREFIX up:<http://purl.uniprot.org/core/>\n",
    "    \n",
    "    SELECT * WHERE {\n",
    "        ?gene1 up:locusName \"%s\" .\n",
    "        ?P1 up:encodedBy ?gene1 .\n",
    "    \n",
    "        ?P1 up:interaction ?i .\n",
    "        ?P2 up:interaction ?i .\n",
    "        ?i up:experiments ?nb_expe .\n",
    "        ?P1 up:mnemonic ?P1_label .\n",
    "        ?P2 up:mnemonic ?P2_label .\n",
    "        FILTER (?P2 != ?P1)\n",
    "        ?P2 up:encodedBy ?gene2 .\n",
    "        ?gene2 up:locusName ?gene2_label .\n",
    "    } ORDER BY DESC(?nb_expe)\n",
    "    \"\"\"\n",
    "\n",
    "# The endpoint client does not depend on the gene: configure it once and\n",
    "# only swap the query text inside the loop.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "\n",
    "for g in tqdm(genes):\n",
    "    # str.capitalize() turns e.g. \"AT3G17840\" into \"At3g17840\".\n",
    "    name = g.capitalize()\n",
    "    uniprot_query = PPI_QUERY_TEMPLATE % name\n",
    "    sparql.setQuery(uniprot_query)\n",
    "    results = sparql.query().convert()\n",
    "\n",
    "    for r in results['results']['bindings']:\n",
    "        interacting_genes.append({\n",
    "            \"g1\": name,\n",
    "            \"g2\": r['gene2_label']['value'],\n",
    "            # SPARQL JSON results carry every value as a string; store the\n",
    "            # experiment count as a number so it can serve as an edge weight.\n",
    "            \"nb_expe\": int(r['nb_expe']['value']),\n",
    "        })\n",
    "\n",
    "interacting_genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b3f51bc-b3ea-4419-a7b5-28da9c35928f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# %pip installs into the environment of the running kernel, which a plain\n",
    "# shell `!pip` call does not guarantee.\n",
    "%pip install ipycytoscape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e797de64-eaa6-485b-8d35-c40f01c8feba",
   "metadata": {},
   "source": [
    "# Display the network with cytoscape "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "988602f4-dbe1-40ea-a6bf-66787d1581dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ipycytoscape\n",
    "\n",
    "# Build the Cytoscape JSON structure from interacting_genes:\n",
    "# one node per distinct gene, one edge per interaction record.\n",
    "nodes = []          # gene identifiers, in order of first appearance\n",
    "seen_nodes = set()  # same identifiers, for constant-time membership tests\n",
    "graph_struct = {\"nodes\": [], \"edges\": []}\n",
    "for interaction in interacting_genes:\n",
    "    source, target = interaction[\"g1\"], interaction[\"g2\"]\n",
    "    for gene in (source, target):\n",
    "        if gene not in seen_nodes:\n",
    "            seen_nodes.add(gene)\n",
    "            nodes.append(gene)\n",
    "            graph_struct[\"nodes\"].append({\"data\": {\"id\": gene, \"label\": gene}})\n",
    "    graph_struct[\"edges\"].append({\n",
    "        \"data\": {\n",
    "            \"source\": source,\n",
    "            \"target\": target,\n",
    "            # The stylesheet below maps the edge width to this value, so make\n",
    "            # sure it is a number even if nb_expe was stored as a string.\n",
    "            \"weight\": int(interaction[\"nb_expe\"]),\n",
    "        }\n",
    "    })\n",
    "\n",
    "ipycytoscape_obj = ipycytoscape.CytoscapeWidget()\n",
    "\n",
    "# Nodes show their label; edge width follows the edge's weight.\n",
    "ipycytoscape_obj.set_style([{\n",
    "                        'selector': 'node',\n",
    "                        'style': {\n",
    "                            'background-color': '#11479e',\n",
    "                            'label': 'data(label)',\n",
    "                            }\n",
    "                        },\n",
    "                        {\n",
    "                            'selector': 'edge',\n",
    "                            'style': {\n",
    "                                'width': 'data(weight)',\n",
    "                                'line-color': '#9dbaea',\n",
    "                                'target-arrow-shape': 'triangle',\n",
    "                                'target-arrow-color': '#9dbaea',\n",
    "                                'curve-style': 'bezier'\n",
    "                            }\n",
    "                        }])\n",
    "\n",
    "ipycytoscape_obj.graph.add_graph_from_json(graph_struct, directed=True)\n",
    "ipycytoscape_obj.set_layout(name=\"cose\")\n",
    "ipycytoscape_obj"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "etbii",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
