{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "bed9889a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Log initialized. Log file: /tmp/das.log\n" ] }, { "data": { "text/plain": [ "(2584508, 27871440)" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from das.distributed_atom_space import DistributedAtomSpace, QueryOutputFormat\n", "from das.database.db_interface import UNORDERED_LINK_TYPES\n", "from das.pattern_matcher.pattern_matcher import PatternMatchingAnswer, OrderedAssignment, UnorderedAssignment, CompositeAssignment, Node, Link, Variable, Not, And, Or, TypedVariable, LinkTemplate\n", "from das.database.db_interface import WILDCARD\n", "from das.expression_hasher import ExpressionHasher\n", "import warnings\n", "import numpy as np\n", "import time\n", "import random\n", "from itertools import combinations\n", "warnings.filterwarnings('ignore')\n", "TARGET_NODES = None\n", "das = DistributedAtomSpace()\n", "db = das.db\n", "das.count_atoms()" ] }, { "cell_type": "code", "execution_count": 2, "id": "a828f3b6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "80ef77c79ab33f7a7e5d3070a09ded02\n" ] } ], "source": [ "def get_gene_node(name):\n", " verbatim_node = das.get_node(\"Verbatim\", name)\n", " schema_node = das.get_node(\"Schema\", \"Schema:sql_gene_name\")\n", " print(f\"verbatim_node = {verbatim_node}\")\n", " print(f\"schema_node = {schema_node}\")\n", " v1 = Variable(\"v1\")\n", " links = das.get_links(\"Execution\", None, [schema_node, WILDCARD, verbatim_node])\n", " print(f\"links = {links}\")\n", " link = das.get_atom(links[0], output_format=QueryOutputFormat.ATOM_INFO)\n", " print(f\"link = {link}\")\n", " gene_node_handle = link[\"targets\"][1]\n", " print(f\"gene_node_handle = {gene_node_handle}\")\n", " gene_node = das.get_atom(gene_node_handle, output_format=QueryOutputFormat.ATOM_INFO)\n", " 
print(f\"gene_node = {gene_node}\")\n", " return Node(\"gene\", gene_node[\"name\"])\n", "\n", "print(das.get_node(\"gene\", \"3106709\"))" ] }, { "cell_type": "code", "execution_count": 3, "id": "a871700d", "metadata": {}, "outputs": [], "source": [ "GENE_LIST = [\n", " \"mud\"\n", "]" ] }, { "cell_type": "code", "execution_count": 4, "id": "b13079c5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "verbatim_node = 7ce10bc07c9bee3d5c4b0075b56eee2f\n", "schema_node = 7494788453289a15a95f81a916c9cc21\n", "links = ['61841d9b332a12dc9e094c924052636e']\n", "link = {'handle': '61841d9b332a12dc9e094c924052636e', 'type': 'Execution', 'template': ['Execution', 'Schema', 'gene', 'Verbatim'], 'targets': ['7494788453289a15a95f81a916c9cc21', '728c20ff5bbd6dac888840801842d303', '7ce10bc07c9bee3d5c4b0075b56eee2f']}\n", "gene_node_handle = 728c20ff5bbd6dac888840801842d303\n", "gene_node = {'handle': '728c20ff5bbd6dac888840801842d303', 'type': 'gene', 'name': '8347745'}\n" ] } ], "source": [ "USE_SUBSTRING = False\n", "\n", "if USE_SUBSTRING:\n", " TARGET_TYPE = \"Concept\"\n", " TARGET_SUBSTRING = \"gl\"\n", "else:\n", " TARGET_NODES = [\n", " get_gene_node(gene) for gene in GENE_LIST\n", " ]\n", "\n", "NGRAM = 3\n", "SUPPORT = 0\n", "HALO_LENGTH = 2\n", "DEPTH_WEIGTH = [1, 1]\n", "ISURPRISINGNESS_REPORT_THRESHOLD = 0\n", "EPOCHS = 1000\n", "NORMALIZED_ISURPRISINGNESS = False\n", "LINK_RATE = 0.01" ] }, { "cell_type": "code", "execution_count": 5, "id": "3a70779e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TARGET_NODES = []\n" ] } ], "source": [ "assert len(DEPTH_WEIGTH) == HALO_LENGTH\n", "halo_levels = [i for i in range(HALO_LENGTH)]\n", "if TARGET_NODES is None:\n", " atomspace_nodes = db.get_matched_node_name(TARGET_TYPE, TARGET_SUBSTRING)\n", " print(atomspace_nodes)\n", " TARGET_NODES = [Node(TARGET_TYPE, db.get_node_name(h)) for h in atomspace_nodes]\n", "print(f\"TARGET_NODES = 
{TARGET_NODES}\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "3fa7b2ec", "metadata": {}, "outputs": [], "source": [ "def print_ordered_assignment(assignment):\n", " if assignment is not None:\n", " for key, value in assignment.mapping.items():\n", " print(f\"{key}: {db.get_node_name(value)}\")\n", "\n", "def print_unordered_assignment(assignment):\n", " if assignment is not None:\n", " symbols = []\n", " for key in assignment.symbols:\n", " for i in range(assignment.symbols[key]):\n", " symbols.append(key)\n", " values = []\n", " for key in assignment.values:\n", " for i in range(assignment.values[key]):\n", " values.append(key)\n", " mapping_keys = []\n", " mapping_values = []\n", " for symbol, value in zip(symbols, values):\n", " mapping_keys.append(symbol)\n", " mapping_values.append(db.get_node_name(value))\n", " print(f\"{mapping_keys} = {mapping_values}\")\n", " \n", "def build_pattern_from_template(template):\n", " targets = []\n", " count_variables = 1\n", " for target in template[1:]:\n", " if target == WILDCARD:\n", " targets.append(Variable(f\"V{count_variables}\"))\n", " count_variables += 1\n", " else:\n", " #node_document = das.get_atom(target, output_format=QueryOutputFormat.ATOM_INFO)\n", " try:\n", " node_type = das.get_node_type(target)\n", " node_name = das.get_node_name(target)\n", " targets.append(Node(node_type, node_name))\n", " except:\n", " return None\n", " return Link(template[0], ordered=(template[0] not in UNORDERED_LINK_TYPES), targets=targets)\n", "\n", "def _random_selection(v):\n", " return v[np.random.randint(len(v))]\n", "\n", "def random_selection(v, n=1):\n", " if n == 1:\n", " return _random_selection(v)\n", " assert n <= (len(v) / 2)\n", " a = v.copy()\n", " selected = []\n", " for i in range(n):\n", " s = _random_selection(a)\n", " a.remove(s)\n", " selected.append(s)\n", " return selected\n", "\n", "def build_roulette(w):\n", " answer = []\n", " s = sum(w)\n", " acc = 0\n", " for v in w:\n", " acc += v 
/ s\n", " answer.append(acc)\n", " answer[-1] = 1\n", " return answer\n", "\n", "def roulette_selection(v, w):\n", " assert len(v) == len(w)\n", " random = np.random.random()\n", " for i in range(len(v)):\n", " if random <= w[i]:\n", " return v[i]\n", " \n", "def compute_count(logical_expression):\n", " query_answer = PatternMatchingAnswer()\n", " matched = logical_expression.matched(db, query_answer)\n", " return len(query_answer.assignments) if matched else 0\n", " \n", "def prob(count):\n", " return count / universe_size\n", "\n", "def compute_isurprisingness(count, terms, term_handles, counts, normalized = False):\n", " n = len(term_handles)\n", " if n == 2:\n", " subset_probs = [prob(counts[0]) * prob(counts[1])]\n", " elif n == 3:\n", " subset_probs = [\n", " prob(counts[0]) * prob(counts[1]) * prob(counts[2]),\n", " prob(compute_count(And([terms[0], terms[1]]))) * prob(counts[2]), \n", " prob(compute_count(And([terms[0], terms[2]]))) * prob(counts[1]),\n", " prob(compute_count(And([terms[1], terms[2]]))) * prob(counts[0])\n", " ]\n", " elif n == 4:\n", " subset_probs = [\n", " prob(counts[0]) * prob(counts[1]) * prob(counts[2]) * prob(counts[3]),\n", " prob(compute_count(And([terms[0], terms[1]]))) * prob(compute_count(And([terms[2], terms[3]]))),\n", " prob(compute_count(And([terms[0], terms[2]]))) * prob(compute_count(And([terms[1], terms[3]]))),\n", " prob(compute_count(And([terms[0], terms[3]]))) * prob(compute_count(And([terms[1], terms[2]]))),\n", " prob(compute_count(And([terms[0], terms[1], terms[2]]))) * prob(counts[3]),\n", " prob(compute_count(And([terms[0], terms[1], terms[3]]))) * prob(counts[2]),\n", " prob(compute_count(And([terms[0], terms[2], terms[3]]))) * prob(counts[1]),\n", " prob(compute_count(And([terms[1], terms[2], terms[3]]))) * prob(counts[0])\n", " ]\n", " else:\n", " raise NotImplementedError()\n", " p = prob(count)\n", " isurprisingness = max([p - max(subset_probs), min(subset_probs) - p])\n", " if normalized:\n", " 
return isurprisingness / p\n", " else:\n", " return isurprisingness\n", " \n", "def build_patterns(links):\n", " chunk_size = 1000\n", " pattern = {}\n", " pattern_count = {}\n", " link_count = 0\n", " for link in links:\n", " link_count += 1\n", " if link_count % chunk_size == 0 or link_count == 1 or link_count == len(links):\n", " if link_count != 1 and link_count != len(links):\n", " end = time.perf_counter()\n", " wall_time = f\"{(end - start):.0f} seconds\"\n", " time_per_query = f\"{(((end - start) * 1000) / (8 * chunk_size)):.0f} ms/query\"\n", " print(f\"link {link_count}/{len(links)} {wall_time} {time_per_query}\")\n", " else:\n", " print(f\"link {link_count}/{len(links)}\")\n", " start = time.perf_counter()\n", "# link_document = das.get_atom(link, output_format=QueryOutputFormat.ATOM_INFO)\n", "# targets = link_document['targets']\n", "# link_type = link_document['type']\n", " targets = das.get_link_targets(link)\n", " link_type = das.get_link_type(link)\n", " arity = len(targets)\n", " if arity == 2:\n", " templates = [\n", " [link_type, WILDCARD, targets[1]],\n", " [link_type, targets[0], WILDCARD],\n", " #[link_type, WILDCARD, WILDCARD],\n", " ]\n", " elif arity == 3:\n", " templates = [\n", " [link_type, WILDCARD, targets[1], targets[2]],\n", " [link_type, targets[0], WILDCARD, targets[2]],\n", " [link_type, targets[0], targets[1], WILDCARD],\n", " [link_type, WILDCARD, WILDCARD, targets[2]],\n", " [link_type, WILDCARD, targets[1], WILDCARD],\n", " [link_type, targets[0], WILDCARD, WILDCARD],\n", " #[link_type, WILDCARD, WILDCARD, WILDCARD],\n", " ]\n", " else:\n", " raise NotImplementedError()\n", " for template in templates:\n", " p = build_pattern_from_template(template)\n", " if p is not None:\n", " template_handle = ExpressionHasher.composite_hash(template)\n", " pattern[template_handle] = p\n", " pattern_count[template_handle] = len(das.get_links(template[0], None, template[1:]))\n", " return tuple([pattern, pattern_count])\n", " \n", 
"def build_composite_pattern(terms):\n", " assert len(terms) > 1\n", " for i in range(len(terms)):\n", " if i == 0:\n", " first_term = terms[i]\n", " else:\n", " second_term = terms[i]\n", " composite_pattern = And([first_term, second_term])\n", " first_term = composite_pattern\n", " return composite_pattern\n", " \n", "def print_query(pattern):\n", " print(pattern)\n", " query_answer = PatternMatchingAnswer()\n", " pattern.matched(db, query_answer)\n", " for assignment in query_answer.assignments:\n", " if type(assignment) is OrderedAssignment:\n", " print_ordered_assignment(assignment)\n", " elif type(assignment) is UnorderedAssignment:\n", " print_unordered_assignment(assignment)\n", " elif type(assignment) is CompositeAssignment:\n", " print_ordered_assignment(assignment.ordered_mapping)\n", " for unordered_assignment in assignment.unordered_mappings:\n", " print_unordered_assignment(unordered_assignment)\n", " print(\"\")\n", " \n", "halo_level_roulette = build_roulette(DEPTH_WEIGTH)" ] }, { "cell_type": "code", "execution_count": 7, "id": "5851202f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Halo level 1/2 node_handle 1/1 0 seconds 2.289 ms/query\n", "Halo level 2/2 node_handle 1/16 25 seconds 0.100 ms/query\n", "Halo level 2/2 node_handle 2/16 25 seconds 0.102 ms/query\n", "Halo level 2/2 node_handle 3/16 25 seconds 0.101 ms/query\n", "Halo level 2/2 node_handle 4/16 24 seconds 0.099 ms/query\n", "Halo level 2/2 node_handle 5/16 0 seconds 0.099 ms/query\n", "Halo level 2/2 node_handle 6/16 0 seconds 0.097 ms/query\n", "Halo level 2/2 node_handle 7/16 0 seconds 0.101 ms/query\n", "Halo level 2/2 node_handle 8/16 26 seconds 0.106 ms/query\n", "Halo level 2/2 node_handle 9/16 0 seconds 0.131 ms/query\n", "Halo level 2/2 node_handle 10/16 0 seconds 0.104 ms/query\n", "Halo level 2/2 node_handle 11/16 71 seconds 0.103 ms/query\n", "Halo level 2/2 node_handle 12/16 8 seconds 0.114 ms/query\n", "Halo level 
2/2 node_handle 13/16 28 seconds 0.112 ms/query\n", "Halo level 2/2 node_handle 14/16 3 seconds 0.104 ms/query\n", "Halo level 2/2 node_handle 15/16 21 seconds 0.104 ms/query\n", "Halo level 2/2 node_handle 16/16 27 seconds 0.110 ms/query\n", "===========================================\n", "Done - universe_size = 1941498\n", "===========================================\n" ] } ], "source": [ "node_handle_list = set([ExpressionHasher.terminal_hash(n.atom_type, n.name) for n in TARGET_NODES])\n", "#print(f\"node_handle_list = {node_handle_list}\")\n", "links = [set() for i in range(HALO_LENGTH)]\n", "for level in range(HALO_LENGTH):\n", " new_level_node_handles = set()\n", " node_handle_count = 0\n", " for node_handle in node_handle_list:\n", " node_handle_count += 1\n", " #print(f\"===========================================\")\n", " template_list = [\n", " [node_handle, WILDCARD], \n", " [WILDCARD, node_handle], \n", " [node_handle, WILDCARD, WILDCARD], \n", " [WILDCARD, node_handle, WILDCARD], \n", " [WILDCARD, WILDCARD, node_handle]\n", " ]\n", " start = time.perf_counter()\n", " num_queries = 0\n", " for template in template_list:\n", " #print(f\"template = {template}\")\n", " link_list = set(das.get_links(None, None, template))\n", " num_queries += len(link_list) + 1\n", " #print(f\"len(link_list) = {len(link_list)}\")\n", " for link in link_list:\n", " #link_document = das.get_atom(link, output_format=QueryOutputFormat.ATOM_INFO)\n", " for h in das.get_link_targets(link):\n", " new_level_node_handles.add(h)\n", " links[level].update(link_list)\n", " end = time.perf_counter()\n", " wall_time = f\"{(end - start):.0f} seconds\"\n", " time_per_query = f\"{(((end - start) * 1000) / num_queries):.3f} ms/query\" \n", " print(f\"Halo level {level+1}/{HALO_LENGTH} node_handle {node_handle_count}/{len(node_handle_list)} {wall_time} {time_per_query}\")\n", " node_handle_list.update(new_level_node_handles)\n", "for level in range(HALO_LENGTH):\n", " if level 
== 0:\n", " all_links = set([link for link in links[level]])\n", " else:\n", " links[level] = links[level].difference(all_links)\n", " all_links.update(links[level])\n", "universe_size = len(all_links)\n", "print(f\"===========================================\")\n", "print(f\"Done - universe_size = {universe_size}\")\n", "print(f\"===========================================\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "86744255", "metadata": {}, "outputs": [], "source": [ "#print(node_handle_list)\n", "#print(links)" ] }, { "cell_type": "code", "execution_count": 9, "id": "cd24fce5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8\n", "1941490\n", "----------\n", "1941498\n" ] } ], "source": [ "total = 0\n", "for level in range(HALO_LENGTH):\n", " total += len(links[level])\n", " print(len(links[level]))\n", "print(\"----------\")\n", "print(total)\n", "#links" ] }, { "cell_type": "code", "execution_count": 10, "id": "45fd7384", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "###########################################\n", "Building patterns for level 0\n", "link 1/8\n", "link 8/8\n", "###########################################\n", "Building patterns for level 1\n", "link 1/19172\n", "link 1000/19172 830 seconds 104 ms/query\n", "link 2000/19172 779 seconds 97 ms/query\n", "link 3000/19172 788 seconds 98 ms/query\n", "link 4000/19172 716 seconds 89 ms/query\n", "link 5000/19172 785 seconds 98 ms/query\n", "link 6000/19172 678 seconds 85 ms/query\n", "link 7000/19172 719 seconds 90 ms/query\n", "link 8000/19172 651 seconds 81 ms/query\n", "link 9000/19172 713 seconds 89 ms/query\n", "link 10000/19172 675 seconds 84 ms/query\n", "link 11000/19172 654 seconds 82 ms/query\n", "link 12000/19172 597 seconds 75 ms/query\n", "link 13000/19172 637 seconds 80 ms/query\n", "link 14000/19172 631 seconds 79 ms/query\n", "link 15000/19172 635 seconds 79 
ms/query\n", "link 16000/19172 615 seconds 77 ms/query\n", "link 17000/19172 596 seconds 74 ms/query\n", "link 18000/19172 604 seconds 76 ms/query\n", "link 19000/19172 681 seconds 85 ms/query\n", "link 19172/19172\n", "===========================================\n", "Done - len(all_patterns) = 71826\n", "===========================================\n" ] } ], "source": [ "pattern = [None for i in range(HALO_LENGTH)]\n", "pattern_count = [None for i in range(HALO_LENGTH)]\n", "pattern_handles = [None for i in range(HALO_LENGTH)]\n", "all_patterns = {}\n", "all_patterns_count = {}\n", "for level in range(HALO_LENGTH):\n", " print(f\"###########################################\")\n", " print(f\"Building patterns for level {level}\")\n", " striped_links = [link for link in links[level] if level == 0 or random.random() < LINK_RATE]\n", " pattern[level], pattern_count[level] = build_patterns(striped_links)\n", " pattern_handles[level] = [key for key in pattern[level].keys()]\n", " for key, value in pattern[level].items():\n", " all_patterns[key] = value\n", " for key, value in pattern_count[level].items():\n", " all_patterns_count[key] = value\n", "print(f\"===========================================\")\n", "print(f\"Done - len(all_patterns) = {len(all_patterns)}\")\n", "print(f\"===========================================\")" ] }, { "cell_type": "code", "execution_count": 11, "id": "981a8a28", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "40\n", "71803\n", "----------\n", "71843\n" ] } ], "source": [ "total = 0\n", "for level in range(HALO_LENGTH):\n", " total += len(pattern_handles[level])\n", " print(len(pattern_handles[level]))\n", "print(\"----------\")\n", "print(total)\n", "#pattern_handles" ] }, { "cell_type": "code", "execution_count": null, "id": "c34c1d24", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/1000\n", "Epoch 2/1000\n", "Epoch 
3/1000\n", "Epoch 4/1000\n", "Epoch 5/1000\n", "Epoch 6/1000\n", "Epoch 7/1000\n", "Epoch 8/1000\n", "Epoch 9/1000\n", "Epoch 10/1000\n", "Epoch 11/1000\n", "Epoch 12/1000\n", "Epoch 13/1000\n", "Epoch 14/1000\n", "Epoch 15/1000\n", "Epoch 16/1000\n", "Epoch 17/1000\n", "Epoch 18/1000\n", "Epoch 19/1000\n", "Epoch 20/1000\n", "Epoch 21/1000\n", "Epoch 22/1000\n", "Epoch 23/1000\n", "Epoch 24/1000\n", "Epoch 25/1000\n", "Epoch 26/1000\n", "Epoch 27/1000\n", "Epoch 28/1000\n", "Epoch 29/1000\n", "Epoch 30/1000\n", "Epoch 31/1000\n", "Epoch 32/1000\n", "Epoch 33/1000\n", "Epoch 34/1000\n", "Epoch 35/1000\n", "Epoch 36/1000\n", "Epoch 37/1000\n", "Epoch 38/1000\n", "Epoch 39/1000\n", "Epoch 40/1000\n", "Epoch 41/1000\n", "Epoch 42/1000\n", "Epoch 43/1000\n", "Epoch 44/1000\n", "Epoch 45/1000\n", "Epoch 46/1000\n", "Epoch 47/1000\n", "Epoch 48/1000\n", "Epoch 49/1000\n", "Epoch 50/1000\n", "Epoch 51/1000\n", "Epoch 52/1000\n", "Epoch 53/1000\n", "Epoch 54/1000\n", "Epoch 55/1000\n", "Epoch 56/1000\n", "Epoch 57/1000\n", "Epoch 58/1000\n", "Epoch 59/1000\n", "Epoch 60/1000\n", "Epoch 61/1000\n", "Epoch 62/1000\n", "Epoch 63/1000\n", "Epoch 64/1000\n", "Epoch 65/1000\n", "Epoch 66/1000\n", "Epoch 67/1000\n", "Epoch 68/1000\n", "Epoch 69/1000\n", "Epoch 70/1000\n", "Epoch 71/1000\n", "Epoch 72/1000\n", "Epoch 73/1000\n", "Epoch 74/1000\n", "Epoch 75/1000\n", "Epoch 76/1000\n", "Epoch 77/1000\n", "Epoch 78/1000\n", "Epoch 79/1000\n", "Epoch 80/1000\n", "Epoch 81/1000\n", "Epoch 82/1000\n", "Epoch 83/1000\n", "Epoch 84/1000\n", "Epoch 85/1000\n", "Epoch 86/1000\n", "Epoch 87/1000\n", "Epoch 88/1000\n", "Epoch 89/1000\n", "Epoch 90/1000\n", "Epoch 91/1000\n", "Epoch 92/1000\n", "Epoch 93/1000\n", "Epoch 94/1000\n", "Epoch 95/1000\n", "Epoch 96/1000\n", "Epoch 97/1000\n", "Epoch 98/1000\n", "Epoch 99/1000\n", "Epoch 100/1000\n", "Epoch 101/1000\n", "Epoch 102/1000\n", "Epoch 
103/1000\n", "Epoch 104/1000\n", "Epoch 105/1000\n", "Epoch 106/1000\n", "Epoch 107/1000\n", "Epoch 108/1000\n", "Epoch 109/1000\n", "Epoch 110/1000\n", "Epoch 111/1000\n", "Epoch 112/1000\n", "Epoch 113/1000\n", "Epoch 114/1000\n", "Epoch 115/1000\n", "Epoch 116/1000\n", "Epoch 117/1000\n", "Epoch 118/1000\n", "Epoch 119/1000\n", "Epoch 120/1000\n", "Epoch 121/1000\n", "Epoch 122/1000\n", "Epoch 123/1000\n", "Epoch 124/1000\n", "Epoch 125/1000\n", "Epoch 126/1000\n", "Epoch 127/1000\n", "Epoch 128/1000\n" ] } ], "source": [ "higher_isurprisingness = 0\n", "best_pattern = None\n", "for i in range(EPOCHS):\n", " if True or i % 1000 == 0 or i == EPOCHS - 1:\n", " print(f\"Epoch {i + 1}/{EPOCHS}\")\n", " selected_handle = random_selection(pattern_handles[0])\n", " term_handles = [tuple([selected_handle, 0])]\n", " terms = [pattern[0][selected_handle]]\n", " counts = [pattern_count[0][selected_handle]]\n", " for i in range(NGRAM - 1):\n", " while True:\n", " selected_level = roulette_selection(halo_levels, halo_level_roulette)\n", " selected_handle = random_selection(pattern_handles[selected_level])\n", " if tuple([selected_handle, selected_level]) not in term_handles:\n", " break\n", " term_handles.append(tuple([selected_handle, selected_level]))\n", " terms.append(pattern[selected_level][selected_handle])\n", " counts.append(pattern_count[selected_level][selected_handle])\n", " composite_pattern = build_composite_pattern(terms)\n", " count = compute_count(composite_pattern)\n", " if count > 0:\n", " print(f\"Count: {count}\")\n", " if count >= SUPPORT:\n", " isurprisingness = compute_isurprisingness(count, terms, term_handles, counts, normalized=NORMALIZED_ISURPRISINGNESS) \n", " if isurprisingness > higher_isurprisingness:\n", " print(f\"{count} {isurprisingness}: {terms} {term_handles} {counts}\")\n", " higher_isurprisingness = isurprisingness\n", " best_pattern = composite_pattern\n", "print_query(best_pattern)" ] }, { "cell_type": 
"code", "execution_count": null, "id": "efac9ac6", "metadata": {}, "outputs": [], "source": [ "higher_isurprisingness = 0\n", "best_pattern = None\n", "all_patterns_handles = all_patterns.keys()\n", "\n", "count_bh = 0\n", "for basic_handle in pattern_handles[0]:\n", " count_bh += 1\n", " print(f\"Cycle {count_bh}/{len(pattern_handles[0])}\")\n", " for combination_handles in combinations(all_patterns, NGRAM - 1):\n", " if basic_handle in combination_handles:\n", " continue\n", " term_handles = [basic_handle, *combination_handles]\n", " terms = [all_patterns[handle] for handle in term_handles]\n", " counts = [all_patterns_count[handle] for handle in term_handles]\n", " composite_pattern = build_composite_pattern(terms)\n", " count = compute_count(composite_pattern)\n", " if count >= SUPPORT:\n", " isurprisingness = compute_isurprisingness(count, terms, term_handles, counts, normalized=NORMALIZED_ISURPRISINGNESS) \n", " if isurprisingness > higher_isurprisingness:\n", " print(f\"{count} {isurprisingness}: {terms} {counts}\")\n", " higher_isurprisingness = isurprisingness\n", " best_pattern = composite_pattern \n", "print_query(best_pattern)" ] }, { "cell_type": "code", "execution_count": null, "id": "b90ff166", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.5" } }, "nbformat": 4, "nbformat_minor": 5 }