{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "061a47dc-9b82-4117-966a-ca585da7fb93",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import os, sys\n",
    "import pandas as pd\n",
    "from rdflib import Graph, Literal, RDF, URIRef\n",
    "# Every query cell below calls SPARQLWrapper(...) and setReturnFormat(JSON),\n",
    "# so both names must be imported before the first of them runs.\n",
    "from SPARQLWrapper import SPARQLWrapper, JSON"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6058ab39-567a-4608-8693-939422e13c08",
   "metadata": {},
   "source": [
    "# 1. Uniprot KG\n",
    "\n",
    "Here is an example of the Uniprot data schema: \n",
    "\n",
    "![:scale 50%](fig/uniprot-PPI.png)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bea4c102-feb4-482c-899a-305c80a964b6",
   "metadata": {},
   "source": [
    "# 2. Which resource is named \"At3g17840\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7fcd5078-4a03-458f-9b07-6ffe1e2e858c",
   "metadata": {},
   "outputs": [],
   "source": [
    "uniprot_query = \"\"\"\n",
    " \n",
    "\"\"\"\n",
    "\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setQuery(uniprot_query)\n",
    "sparql.setReturnFormat(JSON)\n",
    "results = sparql.query().convert()\n",
    "for r in results['results']['bindings']:\n",
    "    print(r[\"r\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4a33bbe-5b3e-4809-b34b-5394eabf07a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SOLUTION:\n",
    "!echo \"UFJFRklYIHJkZjogPGh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyM+CiAgICBQUkVGSVggcmRmczo8aHR0cDovL3d3dy53My5vcmcvMjAwMC8wMS9yZGYtc2NoZW1hIz4KICAgIFBSRUZJWCBvd2w6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAyLzA3L293bCM+CiAgICBQUkVGSVggeHNkOiA8aHR0cDovL3d3dy53My5vcmcvMjAwMS9YTUxTY2hlbWEjPgogICAgUFJFRklYIGRjOiA8aHR0cDovL3B1cmwub3JnL2RjL2VsZW1lbnRzLzEuMS8+CiAgICBQUkVGSVggZGN0ZXJtczogPGh0dHA6Ly9wdXJsLm9yZy9kYy90ZXJtcy8+CiAgICAKICAgIFBSRUZJWCB0YXhvbjogPGh0dHA6Ly9wdXJsLnVuaXByb3Qub3JnL3RheG9ub215Lz4KICAgIFBSRUZJWCB1bmlwcm90OiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdW5pcHJvdC8+CiAgICBQUkVGSVggdXA6PGh0dHA6Ly9wdXJsLnVuaXByb3Qub3JnL2NvcmUvPgogICAgCiAgICBTRUxFQ1QgP3IgV0hFUkUgewogICAgICAgID9yIHVwOmxvY3VzTmFtZSBBdDNnMTc4NDAgLgogICAgfSA=\" | base64 
--decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d46c35ce-618a-4fc5-882b-7cfcac62ad37",
   "metadata": {},
   "source": [
    "# 3. What is the type of this resource ?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f843fe2-f5ca-4cb0-9eaf-7f25260c95f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select every rdf:type of the resource whose up:locusName is \"At3g17840\".\n",
    "# Each PREFIX must carry its namespace IRI: a bare \"PREFIX rdf:\" is not valid SPARQL.\n",
    "uniprot_query = \"\"\"\n",
    "    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
    "    PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>\n",
    "    PREFIX owl: <http://www.w3.org/2002/07/owl#>\n",
    "    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
    "    PREFIX dc: <http://purl.org/dc/elements/1.1/>\n",
    "    PREFIX dcterms: <http://purl.org/dc/terms/>\n",
    "    \n",
    "    PREFIX taxon: <http://purl.uniprot.org/taxonomy/>\n",
    "    PREFIX uniprot: <http://purl.uniprot.org/uniprot/>\n",
    "    PREFIX up:<http://purl.uniprot.org/core/>\n",
    "    \n",
    "    SELECT ?r ?type WHERE {\n",
    "        ?r up:locusName \"At3g17840\" .\n",
    "        ?r rdf:type ?type .\n",
    "    } \n",
    "\"\"\"\n",
    "\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setQuery(uniprot_query)\n",
    "sparql.setReturnFormat(JSON)\n",
    "results = sparql.query().convert()\n",
    "for r in results['results']['bindings']:\n",
    "    print(r[\"r\"][\"value\"], r[\"type\"][\"value\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbd59cef-b16f-49da-840f-64693e00c024",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SOLUTION:\n",
    "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICAKICAgIFNFTEVDVCA/ciA/dHlwZSBXSEVSRSB7CiAgICAgICAgP3IgdXA6bG9jdXNOYW1lIEF0M2cxNzg0MCAuCiAgICAgICAgP3IgcmRmOnR5cGUgP3R5cGUgLgogICAgfSAK\" | base64 --decode"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e5ca7e6-08dc-4b39-8e7e-6315f2c0a93c",
   "metadata": {},
   "source": [
    "# 4. 
Which protein is encoded by this gene ?" ] }, { "cell_type": "code", "execution_count": null, "id": "53e603f8-3172-4b8f-869e-625c077c2952", "metadata": {}, "outputs": [], "source": [ "uniprot_query = \"\"\"\n", "\n", "\"\"\"\n", "\n", "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n", "sparql.setQuery(uniprot_query)\n", "sparql.setReturnFormat(JSON)\n", "results = sparql.query().convert()\n", "for r in results['results']['bindings']:\n", " print(r[\"P1\"][\"value\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "9f6354c6-dc11-4be9-a653-cb7f249d9042", "metadata": {}, "outputs": [], "source": [ "# SOLUTION:\n", "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICAKICAgIFNFTEVDVCA/UDEgV0hFUkUgewogICAgICAgID9nZW5lMSB1cDpsb2N1c05hbWUgQXQzZzE3ODQwIC4KICAgICAgICA/UDEgdXA6ZW5jb2RlZEJ5ID9nZW5lMSAuCiAgICB9IAo=\" | base64 --decode" ] }, { "cell_type": "markdown", "id": "dcd2b205-dfe7-4be5-b26c-c56a2939acf5", "metadata": {}, "source": [ "# 5. Which proteins are interacting with it ?" 
] }, { "cell_type": "code", "execution_count": null, "id": "d12f989a-b0e0-47e1-b793-454b0df73bfa", "metadata": {}, "outputs": [], "source": [ "uniprot_query = \"\"\"\n", "\n", "\"\"\"\n", "\n", "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n", "sparql.setQuery(uniprot_query)\n", "sparql.setReturnFormat(JSON)\n", "results = sparql.query().convert()\n", "for r in results['results']['bindings']:\n", " print(r[\"P1\"][\"value\"], r[\"P2\"][\"value\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "0ab6cf75-cfca-4561-9cad-69c4741ae389", "metadata": {}, "outputs": [], "source": [ "# SOLUTION:\n", "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICAKICAgIFNFTEVDVCA/UDEgP1AyIFdIRVJFIHsKICAgICAgICA/Z2VuZTEgdXA6bG9jdXNOYW1lIEF0M2cxNzg0MCAuCiAgICAgICAgP1AxIHVwOmVuY29kZWRCeSA/Z2VuZTEgLgogICAgCiAgICAgICAgP1AxIHVwOmludGVyYWN0aW9uID9pIC4KICAgICAgICA/UDIgdXA6aW50ZXJhY3Rpb24gP2kgLgogICAgICAgIAogICAgICAgIEZJTFRFUiAoP1AyICE9ID9QMSkKICAgIH0gCg==\" | base64 --decode" ] }, { "cell_type": "markdown", "id": "5296b82e-52f2-44e7-8b7a-44ac52f5d8fd", "metadata": {}, "source": [ "# 6. How many experiments support these interactions ?" 
] }, { "cell_type": "code", "execution_count": null, "id": "082f54c3-a051-48da-b443-1d2d3d8c59af", "metadata": {}, "outputs": [], "source": [ "uniprot_query = \"\"\"\n", " \n", "\"\"\"\n", "\n", "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n", "sparql.setQuery(uniprot_query)\n", "sparql.setReturnFormat(JSON)\n", "results = sparql.query().convert()\n", "for r in results['results']['bindings']:\n", " print(r[\"gene2_label\"][\"value\"], r[\"nb_expe\"][\"value\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "83ddcd5a-cc54-4337-83b4-bd534d390a71", "metadata": {}, "outputs": [], "source": [ "# SOLUTION:\n", "!echo \"CiAgICBQUkVGSVggcmRmOiA8aHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIz4KICAgIFBSRUZJWCByZGZzOjxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzAxL3JkZi1zY2hlbWEjPgogICAgUFJFRklYIG93bDogPGh0dHA6Ly93d3cudzMub3JnLzIwMDIvMDcvb3dsIz4KICAgIFBSRUZJWCB4c2Q6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYSM+CiAgICBQUkVGSVggZGM6IDxodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLz4KICAgIFBSRUZJWCBkY3Rlcm1zOiA8aHR0cDovL3B1cmwub3JnL2RjL3Rlcm1zLz4KICAgIAogICAgUFJFRklYIHRheG9uOiA8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvdGF4b25vbXkvPgogICAgUFJFRklYIHVuaXByb3Q6IDxodHRwOi8vcHVybC51bmlwcm90Lm9yZy91bmlwcm90Lz4KICAgIFBSRUZJWCB1cDo8aHR0cDovL3B1cmwudW5pcHJvdC5vcmcvY29yZS8+CiAgICAKICAgIFNFTEVDVCA/Z2VuZTJfbGFiZWwgP25iX2V4cGUgV0hFUkUgewogICAgICAgID9nZW5lMSB1cDpsb2N1c05hbWUgQXQzZzE3ODQwIC4KICAgICAgICA/UDEgdXA6ZW5jb2RlZEJ5ID9nZW5lMSAuCiAgICAKICAgICAgICA/UDEgdXA6aW50ZXJhY3Rpb24gP2kgLgogICAgICAgID9QMiB1cDppbnRlcmFjdGlvbiA/aSAuCiAgICAgICAgP2kgdXA6ZXhwZXJpbWVudHMgP25iX2V4cGUgLgogICAgICAgID9QMSB1cDptbmVtb25pYyA/UDFfbGFiZWwgLgogICAgICAgID9QMiB1cDptbmVtb25pYyA/UDJfbGFiZWwgLgogICAgICAgIEZJTFRFUiAoP1AyICE9ID9QMSkKICAgICAgICA/UDIgdXA6ZW5jb2RlZEJ5ID9nZW5lMiAuCiAgICAgICAgP2dlbmUyIHVwOmxvY3VzTmFtZSA/Z2VuZTJfbGFiZWwgLgogICAgfSBPUkRFUiBCWSBERVNDKD9uYl9leHBlKQo=\" | base64 --decode" ] }, { "cell_type": "markdown", "id": "19b70866-3806-43fb-88e0-c8acc66c293a", "metadata": {}, "source": [ "# 
7. Automating the whole process to generate a PPI network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50e78e1a-00fc-47c0-be1d-38f64bc751ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "wallomics = pd.read_csv(\"wallomics1_results_prot.tsv\", sep='\\t')\n",
    "wallomics.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6063e344-5dac-46c3-b448-be148210b940",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f687926-1db0-4a52-8358-8284ee4913b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from SPARQLWrapper import SPARQLWrapper, JSON, TURTLE\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "# SPARQL template run once per locus name (\"__LOCUS__\" is substituted in the loop).\n",
    "# It selects the protein ?P1 encoded by the gene, every other protein ?P2 attached\n",
    "# to the same up:interaction node, and the locus name of the gene encoding ?P2,\n",
    "# ordered by descending ?nb_expe.\n",
    "# Each PREFIX must carry its namespace IRI: a bare \"PREFIX rdf:\" is not valid SPARQL.\n",
    "PPI_QUERY_TEMPLATE = \"\"\"\n",
    "    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
    "    PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>\n",
    "    PREFIX owl: <http://www.w3.org/2002/07/owl#>\n",
    "    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
    "    PREFIX dc: <http://purl.org/dc/elements/1.1/>\n",
    "    PREFIX dcterms: <http://purl.org/dc/terms/>\n",
    "    \n",
    "    PREFIX taxon: <http://purl.uniprot.org/taxonomy/>\n",
    "    PREFIX uniprot: <http://purl.uniprot.org/uniprot/>\n",
    "    PREFIX up:<http://purl.uniprot.org/core/>\n",
    "    \n",
    "    SELECT * WHERE {\n",
    "        ?gene1 up:locusName \"__LOCUS__\" .\n",
    "        ?P1 up:encodedBy ?gene1 .\n",
    "    \n",
    "        ?P1 up:interaction ?i .\n",
    "        ?P2 up:interaction ?i .\n",
    "        ?i up:experiments ?nb_expe .\n",
    "        ?P1 up:mnemonic ?P1_label .\n",
    "        ?P2 up:mnemonic ?P2_label .\n",
    "        FILTER (?P2 != ?P1)\n",
    "        ?P2 up:encodedBy ?gene2 .\n",
    "        ?gene2 up:locusName ?gene2_label .\n",
    "    } ORDER BY DESC(?nb_expe)\n",
    "\"\"\"\n",
    "\n",
    "# The endpoint client does not depend on the gene, so it is built once and only\n",
    "# the query text changes between iterations.\n",
    "sparql = SPARQLWrapper(\"http://sparql.uniprot.org/sparql/\")\n",
    "sparql.setReturnFormat(JSON)\n",
    "\n",
    "# pandas reads missing cells as NaN (a float), which has no .capitalize();\n",
    "# drop them instead of crashing in the middle of the loop.\n",
    "genes = wallomics[\"Gene\"].dropna()\n",
    "\n",
    "# One {\"g1\": queried locus, \"g2\": partner locus} record per result row.\n",
    "interacting_genes = []\n",
    "\n",
    "for g in tqdm(genes):\n",
    "    # capitalize() yields the \"At3g17840\" casing used for up:locusName in the queries above.\n",
    "    name = g.capitalize()\n",
    "\n",
    "    uniprot_query = PPI_QUERY_TEMPLATE.replace(\"__LOCUS__\", name)\n",
    "    sparql.setQuery(uniprot_query)\n",
    "    results = sparql.query().convert()\n",
    "\n",
    "    for r in results['results']['bindings']:\n",
    "        interacting_genes.append({\"g1\": name, \"g2\": r['gene2_label']['value']})\n",
    "\n",
    "interacting_genes"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}