{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0caf9543",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import warnings\n",
    "\n",
    "from das.distributed_atom_space import DistributedAtomSpace, QueryOutputFormat\n",
    "from das.pattern_matcher.pattern_matcher import PatternMatchingAnswer, OrderedAssignment, UnorderedAssignment, CompositeAssignment, Node, Link, Variable, Not, And, Or\n",
    "from das.database.db_interface import WILDCARD\n",
    "\n",
    "warnings.filterwarnings('ignore')\n",
    "das = DistributedAtomSpace()\n",
    "db = das.db\n",
    "das.count_atoms()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ece77aa9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def format_wall_time(wall_time):\n",
    "    \"\"\"Format a duration in seconds: 3-decimal seconds when >= 1 s, whole milliseconds otherwise.\"\"\"\n",
    "    if wall_time >= 1:\n",
    "        return f\"{wall_time:.3f} seconds\"\n",
    "    return f\"{(wall_time * 1000):.0f} milliseconds\"\n",
    "\n",
    "\n",
    "class WallClock:\n",
    "    \"\"\"Stopwatch that reports total wall time and the average time per counted epoch.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        self.start_time = None  # time.perf_counter() reading taken by start()\n",
    "        self.wall_time = None   # seconds between start() and stop()\n",
    "        self.epochs = 0         # units of work counted through epoch()\n",
    "\n",
    "    def start(self):\n",
    "        \"\"\"Record the current time.perf_counter() reading as the start time.\"\"\"\n",
    "        self.start_time = time.perf_counter()\n",
    "\n",
    "    def stop(self):\n",
    "        \"\"\"Store the seconds elapsed since start() in self.wall_time.\"\"\"\n",
    "        if self.start_time is None:\n",
    "            raise RuntimeError(\"WallClock.stop() called before start()\")\n",
    "        self.wall_time = time.perf_counter() - self.start_time\n",
    "\n",
    "    def epoch(self, n=1):\n",
    "        \"\"\"Add n to the number of counted epochs.\"\"\"\n",
    "        self.epochs += n\n",
    "\n",
    "    def print(self, text=\"query\"):\n",
    "        \"\"\"\n",
    "        Print the total wall time and, if any epoch was counted, the average time per epoch.\n",
    "        The text argument names the unit of work shown in the per-epoch figure.\n",
    "        \"\"\"\n",
    "        if self.wall_time is None:\n",
    "            raise RuntimeError(\"WallClock.print() called before stop()\")\n",
    "        total_time = format_wall_time(self.wall_time)\n",
    "        if self.epochs == 0:\n",
    "            # Nothing was counted: print only the total instead of an empty \"()\"\n",
    "            print(total_time)\n",
    "        else:\n",
    "            time_per_epoch = f\"{((self.wall_time * 1000) / self.epochs):.3f} milliseconds per {text}\"\n",
    "            print(f\"{total_time} ({time_per_epoch})\")\n",
    "\n",
    "\n",
    "def print_ordered_assignment(assignment):\n",
    "    \"\"\"Print each key of assignment.mapping with the name of the node it maps to (no-op for None).\"\"\"\n",
    "    if assignment is None:\n",
    "        return\n",
    "    for key, value in assignment.mapping.items():\n",
    "        print(f\"{key}: {db.get_node_name(value)}\")\n",
    "\n",
    "def print_unordered_assignment(assignment):\n",
    "    \"\"\"\n",
    "    Print the symbols of an unordered assignment next to the names of its values (no-op for None).\n",
    "    Each symbol/value is repeated as many times as its count in assignment.symbols/assignment.values.\n",
    "    \"\"\"\n",
    "    if assignment is None:\n",
    "        return\n",
    "    symbols = [key for key in assignment.symbols for _ in range(assignment.symbols[key])]\n",
    "    values = [key for key in assignment.values for _ in range(assignment.values[key])]\n",
    "    pairs = list(zip(symbols, values))\n",
    "    mapping_keys = [symbol for symbol, _ in pairs]\n",
    "    mapping_values = [db.get_node_name(value) for _, value in pairs]\n",
    "    print(f\"{mapping_keys} = {mapping_values}\")\n",
    "\n",
    "def print_elapsed_time(start):\n",
    "    \"\"\"Print the wall time elapsed since start (a time.perf_counter() reading).\"\"\"\n",
    "    print(format_wall_time(time.perf_counter() - start))\n",
    "\n",
    "def query(query_obj, log = False, detailed_log = False):\n",
    "    \"\"\"\n",
    "    Run a pattern matching query against db and return the resulting assignments.\n",
    "    When log is True, print the elapsed time, the match result and (if matched) the number of answers.\n",
    "    When detailed_log is also True, print every assignment. detailed_log requires log.\n",
    "    \"\"\"\n",
    "    assert log or (not detailed_log)\n",
    "    query_answer = PatternMatchingAnswer()\n",
    "    start = time.perf_counter()\n",
    "    matched = query_obj.matched(db, query_answer)\n",
    "    if log:\n",
    "        print_elapsed_time(start)\n",
    "        print(matched)\n",
    "        if matched:\n",
    "            print(f\"{len(query_answer.assignments)} answers\")\n",
    "            if detailed_log:\n",
    "                for assignment in query_answer.assignments:\n",
    "                    if type(assignment) is OrderedAssignment:\n",
    "                        print_ordered_assignment(assignment)\n",
    "                    elif type(assignment) is UnorderedAssignment:\n",
    "                        print_unordered_assignment(assignment)\n",
    "                    elif type(assignment) is CompositeAssignment:\n",
    "                        print_ordered_assignment(assignment.ordered_mapping)\n",
    "                        for unordered_assignment in assignment.unordered_mappings:\n",
    "                            print_unordered_assignment(unordered_assignment)\n",
    "                    print(\"\")\n",
    "    return query_answer.assignments\n",
    "\n",
    "def get_mappings(q, variable):\n",
    "    \"\"\"\n",
    "    Execute the passed query and return the values assigned to the passed variable,\n",
    "    each one resolved to the respective node name.\n",
    "    \"\"\"\n",
    "    assignments = query(q)\n",
    "    return [das.get_node_name(assignment.mapping[variable]) for assignment in assignments]\n",
    "\n",
    "\n",
    "def get_gene_node_handle(name):\n",
    "    \"\"\"\n",
    "    Get the handle of the corresponding Gene node given a gene name.\n",
    "    Raises LookupError if no Execution link associates a gene with that name.\n",
    "    \"\"\"\n",
    "    verbatim_node = das.get_node(\"Verbatim\", name)\n",
    "    schema_node = das.get_node(\"Schema\", \"Schema:gene_name\")\n",
    "    links = das.get_links(\"Execution\", None, [schema_node, WILDCARD, verbatim_node])\n",
    "    if len(links) == 0:\n",
    "        raise LookupError(f\"No gene named {name!r} was found in the knowledge base\")\n",
    "    link = das.get_atom(links[0], output_format=QueryOutputFormat.ATOM_INFO)\n",
    "    return link[\"targets\"][1]  # second target of the link is the gene node\n",
    "\n",
    "def build_gene_node(name):\n",
    "    \"\"\"\n",
    "    Build a Node object to be used to compose queries.\n",
    "    This object is not exactly a DAS node (I apologize for re-using the name).\n",
    "    \"\"\"\n",
    "    gene_node_handle = get_gene_node_handle(name)\n",
    "    gene_node = das.get_atom(gene_node_handle, output_format=QueryOutputFormat.ATOM_INFO)\n",
    "    return Node(\"gene\", gene_node[\"name\"])\n",
    "\n",
    "def get_gene_fb_id(name):\n",
    "    \"\"\"\n",
    "    Get the FB id of a given gene by its name.\n",
    "    \"\"\"\n",
    "    gene_node = build_gene_node(name)\n",
    "    fb_id_var = Variable(\"v1\")\n",
    "    schema = Node(\"Schema\", \"Schema:gene_uniquename\")\n",
    "    q = Link(\"Execution\", ordered=True, targets=[schema, gene_node, fb_id_var])\n",
    "    assignments = query(q)\n",
    "    assert len(assignments) == 1\n",
    "    # Read the single answer without mutating the container returned by query()\n",
    "    id_handle = next(iter(assignments)).mapping['v1'] # handle of \"Verbatim\" node\n",
    "    return db.get_node_name(id_handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abd3455c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get uniquename of all genes in the knowledge base\n",
    "v1 = Variable(\"v1\")\n",
    "v2 = Variable(\"v2\")\n",
    "s = Node(\"Schema\", \"Schema:gene_uniquename\")\n",
    "q1 = Link(\"Execution\", ordered=True, targets=[s, v1, v2])\n",
    "assignments = query(q1, log=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca8b918a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the mapping FB id (uniquename) -> gene name for genes returned by the above query\n",
    "MAX_GENES_TO_PRINT = 100  # cap on the number of per-gene queries issued by this cell\n",
    "\n",
    "# The name query only differs by the gene node, so build the fixed parts once\n",
    "name_var = Variable(\"v1\")\n",
    "name_schema = Node(\"Schema\", \"Schema:gene_name\")\n",
    "\n",
    "clock = WallClock()\n",
    "clock.start()\n",
    "for assignment in assignments:\n",
    "    if clock.epochs >= MAX_GENES_TO_PRINT:\n",
    "        break\n",
    "    pkey_handle = assignment.mapping[\"v1\"] # handle of a \"gene\" node\n",
    "    unique_name_handle = assignment.mapping[\"v2\"] # handle of a \"Verbatim\" node\n",
    "    pkey = db.get_node_name(pkey_handle) # sequential integer used as PK in the DB table\n",
    "    unique_name = db.get_node_name(unique_name_handle) # FB id of the gene\n",
    "    q = Link(\"Execution\", ordered=True, targets=[name_schema, Node(\"gene\", pkey), name_var])\n",
    "    name_assignments = query(q)\n",
    "    assert len(name_assignments) == 1 # There's only one link between the gene and its name\n",
    "    name_handle = next(iter(name_assignments)).mapping['v1'] # handle of \"Verbatim\" node\n",
    "    name = db.get_node_name(name_handle) # gene's name\n",
    "    print(f\"{unique_name} -> {name}\")\n",
    "    clock.epoch()\n",
    "clock.stop()\n",
    "clock.print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "530974f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get FB id and recombination_loc of gene \"mud\"\n",
    "fb_id = get_gene_fb_id(\"mud\")\n",
    "print(fb_id)\n",
    "n1 = Node(\"Verbatim\", fb_id)\n",
    "v1 = Variable(\"v1\")\n",
    "s = Node(\"Schema\", \"Schema:gene_map_table_recombination_loc\")\n",
    "q1 = Link(\"Execution\", ordered=True, targets=[s, n1, v1])\n",
    "print(get_mappings(q1, \"v1\")[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eed5aa44",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search for all genes with the same recombination_loc of gene \"mud\"\n",
    "fb_id = get_gene_fb_id(\"mud\")\n",
    "n1 = Node(\"Verbatim\", fb_id)\n",
    "v1 = Variable(\"v1\") # recombination_loc\n",
    "v2 = Variable(\"v2\") # target\n",
    "v3 = Variable(\"v3\") # node linked to the target by gene_uniquename\n",
    "s1 = Node(\"Schema\", \"Schema:gene_map_table_recombination_loc\")\n",
    "s2 = Node(\"Schema\", \"Schema:gene_uniquename\")\n",
    "q1 = And([\n",
    "    Link(\"Execution\", ordered=True, targets=[s1, n1, v1]),\n",
    "    Link(\"Execution\", ordered=True, targets=[s1, v2, v1]),\n",
    "    Link(\"Execution\", ordered=True, targets=[s2, v3, v2]),\n",
    "])\n",
    "answer = get_mappings(q1, \"v2\")\n",
    "print(f\"{len(answer)}: {sorted(answer)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e452c08e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search for all genes with the same cytogenetic_loc of gene \"mud\"\n",
    "fb_id = get_gene_fb_id(\"mud\")\n",
    "n1 = Node(\"Verbatim\", fb_id)\n",
    "v1 = Variable(\"v1\") # cytogenetic_loc\n",
    "v2 = Variable(\"v2\") # target\n",
    "v3 = Variable(\"v3\") # node linked to the target by gene_uniquename\n",
    "s1 = Node(\"Schema\", \"Schema:gene_map_table_cytogenetic_loc\")\n",
    "s2 = Node(\"Schema\", \"Schema:gene_uniquename\")\n",
    "q1 = And([\n",
    "    Link(\"Execution\", ordered=True, targets=[s1, n1, v1]),\n",
    "    Link(\"Execution\", ordered=True, targets=[s1, v2, v1]),\n",
    "    Link(\"Execution\", ordered=True, targets=[s2, v3, v2]),\n",
    "])\n",
    "answer = get_mappings(q1, \"v2\")\n",
    "print(f\"{len(answer)}: {sorted(answer)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08f36cf7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search for all genes with the same recombination_loc but different cytogenetic_loc of gene \"mud\"\n",
    "fb_id = get_gene_fb_id(\"mud\")\n",
    "n1 = Node(\"Verbatim\", fb_id)\n",
    "v1 = Variable(\"v1\") # recombination_loc\n",
    "v2 = Variable(\"v2\") # target\n",
    "v3 = Variable(\"v3\") # cytogenetic_loc\n",
    "v4 = Variable(\"v4\") # node linked to the target by gene_uniquename\n",
    "s1 = Node(\"Schema\", \"Schema:gene_map_table_recombination_loc\")\n",
    "s2 = Node(\"Schema\", \"Schema:gene_map_table_cytogenetic_loc\")\n",
    "s3 = Node(\"Schema\", \"Schema:gene_uniquename\")\n",
    "q1 = And([\n",
    "    Link(\"Execution\", ordered=True, targets=[s1, n1, v1]),\n",
    "    Link(\"Execution\", ordered=True, targets=[s1, v2, v1]),\n",
    "    Link(\"Execution\", ordered=True, targets=[s2, n1, v3]),\n",
    "    Not(Link(\"Execution\", ordered=True, targets=[s2, v2, v3])),\n",
    "    Link(\"Execution\", ordered=True, targets=[s3, v4, v2]),\n",
    "])\n",
    "answer = get_mappings(q1, \"v2\")\n",
    "print(f\"{len(answer)}: {sorted(answer)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cf81d3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search for the FB id of all genes whose name matches a given regexp\n",
    "# (raw string so that \\d reaches the matcher untouched instead of being an invalid Python escape)\n",
    "for handle in db.get_matched_node_name(\"Verbatim\", r\"^mus\\d\\d\\d$\"):\n",
    "    name = das.get_node_name(handle)\n",
    "    fbid = get_gene_fb_id(name)\n",
    "    print(f\"{name} -> {fbid}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05b98845",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search for the FB id of all genes whose DO_term disease_model_annotations matches the one of a given gene\n",
    "fb_id = get_gene_fb_id(\"mei-9\")\n",
    "n1 = Node(\"Verbatim\", fb_id)\n",
    "s1 = Node(\"Schema\", \"Schema:disease_model_annotations_DO_term\")\n",
    "s2 = Node(\"Schema\", \"Schema:gene_uniquename\")\n",
    "v1 = Variable(\"v1\") # DO_term\n",
    "v2 = Variable(\"v2\") # target\n",
    "\n",
    "diseases = get_mappings(Link(\"Execution\", ordered=True, targets=[s1, n1, v1]), \"v1\")\n",
    "disease_nodes = [Node(\"Verbatim\", d) for d in diseases]\n",
    "links = [Link(\"Execution\", ordered=True, targets=[s1, v1, dn]) for dn in disease_nodes]\n",
    "\n",
    "query_answer = get_mappings(Or(links), \"v1\")\n",
    "\n",
    "all_fbids = get_mappings(Link(\"Execution\", ordered=True, targets=[s2, v1, v2]), \"v2\")\n",
    "# Keep only the answers that are gene FB ids; set intersection avoids a list scan per answer\n",
    "final_answer = set(query_answer) & set(all_fbids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e176d121",
   "metadata": {},
   "outputs": [],
   "source": [
    "final_answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "570573ac",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}