3 | 3 | {
4 | 4 | "cell_type": "code",
5 | 5 | "execution_count": null,
  | 6 | + "metadata": {},
6 | 7 | "outputs": [],
7 | 8 | "source": [
8 | 9 | "%pip install graphdatascience torch torch_geometric"
9 |  | - ],
10 |  | - "metadata": {
11 |  | - "collapsed": false
12 |  | - }
  | 10 | + ]
13 | 11 | },
14 | 12 | {
15 | 13 | "cell_type": "code",
16 | 14 | "execution_count": null,
  | 15 | + "metadata": {},
17 | 16 | "outputs": [],
18 | 17 | "source": [
  | 18 | + "import os\n",
19 | 19 | "from graphdatascience import GraphDataScience\n",
20 | 20 | "import torch\n",
21 | 21 | "import torch.optim as optim\n",
22 | 22 | "from torch_geometric.data import Data, download_url\n",
23 | 23 | "from torch_geometric.nn import TransE\n",
24 | 24 | "import collections"
25 |  | - ],
26 |  | - "metadata": {
27 |  | - "collapsed": false
28 |  | - }
  | 25 | + ]
29 | 26 | },
30 | 27 | {
31 | 28 | "cell_type": "code",
32 | 29 | "execution_count": null,
33 | 30 | "metadata": {},
34 | 31 | "outputs": [],
35 | 32 | "source": [
36 |  | - "gds = GraphDataScience(\"bolt://localhost:7687\", auth=('neo4j', 'neo4jneo4j'), database=\"ttt\")"
  | 33 | + "# Get Neo4j DB URI, credentials and name from environment if applicable\n",
  | 34 | + "NEO4J_URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n",
  | 35 | + "NEO4J_AUTH = None\n",
  | 36 | + "NEO4J_DB = os.environ.get(\"NEO4J_DB\", \"neo4j\")\n",
  | 37 | + "if os.environ.get(\"NEO4J_USER\") and os.environ.get(\"NEO4J_PASSWORD\"):\n",
  | 38 | + "    NEO4J_AUTH = (\n",
  | 39 | + "        os.environ.get(\"NEO4J_USER\"),\n",
  | 40 | + "        os.environ.get(\"NEO4J_PASSWORD\"),\n",
  | 41 | + "    )\n",
  | 42 | + "gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)"
37 | 43 | ]
38 | 44 | },
39 | 45 | {
…
42 | 48 | "metadata": {},
43 | 49 | "outputs": [],
44 | 50 | "source": [
45 |  | - "url = ('https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/FB15k-237')\n",
46 |  | - "raw_file_names = ['train.txt', 'valid.txt', 'test.txt']\n",
47 |  | - "raw_dir = './data_from_url'\n",
  | 51 | + "url = \"https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/FB15k-237\"\n",
  | 52 | + "raw_file_names = [\"train.txt\", \"valid.txt\", \"test.txt\"]\n",
  | 53 | + "raw_dir = \"./data_from_url\"\n",
48 | 54 | "for filename in raw_file_names:\n",
49 |  | - "    download_url(f'{url}/{filename}', raw_dir)"
  | 55 | + "    download_url(f\"{url}/{filename}\", raw_dir)"
50 | 56 | ]
51 | 57 | },
52 | 58 | {
…
65 | 71 | "outputs": [],
66 | 72 | "source": [
67 | 73 | "rel_types = {\n",
68 |  | - "    \"train.txt\":\"TRAIN\",\n",
69 |  | - "    \"valid.txt\":\"VALID\",\n",
70 |  | - "    \"test.txt\":\"TEST\",\n",
  | 74 | + "    \"train.txt\": \"TRAIN\",\n",
  | 75 | + "    \"valid.txt\": \"VALID\",\n",
  | 76 | + "    \"test.txt\": \"TEST\",\n",
71 | 77 | "}\n",
72 | 78 | "rel_id_to_text_dict = {}\n",
73 | 79 | "rel_type_dict = collections.defaultdict(list)\n",
74 | 80 | "\n",
  | 81 | + "\n",
75 | 82 | "def process():\n",
76 | 83 | "    node_dict_, rel_dict_ = {}, {}\n",
77 | 84 | "    for file_name in raw_file_names:\n",
78 |  | - "        file_name_path = raw_dir + '/' + file_name\n",
  | 85 | + "        file_name_path = raw_dir + \"/\" + file_name\n",
79 | 86 | "\n",
80 |  | - "        with open(file_name_path, 'r') as f:\n",
81 |  | - "            data = [x.split('\\t') for x in f.read().split('\\n')[:-1]]\n",
  | 87 | + "        with open(file_name_path, \"r\") as f:\n",
  | 88 | + "            data = [x.split(\"\\t\") for x in f.read().split(\"\\n\")[:-1]]\n",
82 | 89 | "\n",
83 | 90 | "        list_of_dicts = []\n",
84 | 91 | "        for i, (src, rel, dst) in enumerate(data):\n",
…
94 | 101 | "            target = node_dict_[dst]\n",
95 | 102 | "            edge_type = rel_dict_[rel]\n",
96 | 103 | "\n",
97 |  | - "            rel_type_dict[edge_type].append({\n",
98 |  | - "                \"source\":source,\n",
99 |  | - "                \"target\":target,\n",
100 |  | - "            })\n",
101 |  | - "            list_of_dicts.append({\n",
102 |  | - "                \"source\": source,\n",
103 |  | - "                \"source_text\": src,\n",
104 |  | - "                \"target\": target,\n",
105 |  | - "                \"target_text\": dst,\n",
106 |  | - "                \"rel_id\": edge_type,\n",
107 |  | - "            })\n",
  | 104 | + "            rel_type_dict[edge_type].append(\n",
  | 105 | + "                {\n",
  | 106 | + "                    \"source\": source,\n",
  | 107 | + "                    \"target\": target,\n",
  | 108 | + "                }\n",
  | 109 | + "            )\n",
  | 110 | + "            list_of_dicts.append(\n",
  | 111 | + "                {\n",
  | 112 | + "                    \"source\": source,\n",
  | 113 | + "                    \"source_text\": src,\n",
  | 114 | + "                    \"target\": target,\n",
  | 115 | + "                    \"target_text\": dst,\n",
  | 116 | + "                    \"rel_id\": edge_type,\n",
  | 117 | + "                }\n",
  | 118 | + "            )\n",
108 | 119 | "\n",
109 | 120 | "        rel_type = rel_types[file_name]\n",
110 | 121 | "        print(f\"Writing {len(list_of_dicts)} entities of {rel_type}\")\n",
111 | 122 | "        gds.run_cypher(\n",
112 |  | - "            \"UNWIND $ll as l \"+\n",
113 |  | - "            \"MERGE (n:Entity {id:l.source, text:l.source_text}) \"+\n",
114 |  | - "            \"MERGE (m:Entity {id:l.target, text:l.target_text}) \"+\n",
115 |  | - "            \"MERGE (n)-[:\"+rel_type+\" {rel_id:l.rel_id}]->(m) \",\n",
116 |  | - "            params={\n",
117 |  | - "                \"ll\": list_of_dicts\n",
118 |  | - "            },\n",
119 |  | - "        )\n",
  | 123 | + "            \"UNWIND $ll as l \"\n",
  | 124 | + "            + \"MERGE (n:Entity {id:l.source, text:l.source_text}) \"\n",
  | 125 | + "            + \"MERGE (m:Entity {id:l.target, text:l.target_text}) \"\n",
  | 126 | + "            + \"MERGE (n)-[:\"\n",
  | 127 | + "            + rel_type\n",
  | 128 | + "            + \" {rel_id:l.rel_id}]->(m) \",\n",
  | 129 | + "            params={\"ll\": list_of_dicts},\n",
  | 130 | + "        )\n",
120 | 131 | "\n",
121 | 132 | "    for rel_id in rel_type_dict:\n",
122 | 133 | "        REL_TYPE = f\"REL_{rel_id}\"\n",
123 | 134 | "        print(f\"Writing {len(rel_type_dict[rel_id])} entities of {REL_TYPE}\")\n",
124 | 135 | "        gds.run_cypher(\n",
125 |  | - "            \"UNWIND $ll AS l MATCH (n:Entity {id:l.source}), (m:Entity {id:l.target}) \"+\n",
126 |  | - "            \"MERGE (n)-[:\"+REL_TYPE+\" {rel_id:$rel_id, text:$text}]->(m) \",\n",
127 |  | - "            params={\n",
128 |  | - "                \"ll\": rel_type_dict[rel_id],\n",
129 |  | - "                \"rel_id\": rel_id,\n",
130 |  | - "                \"text\": rel_id_to_text_dict[rel_id]\n",
131 |  | - "            },\n",
132 |  | - "        )\n",
  | 136 | + "            \"UNWIND $ll AS l MATCH (n:Entity {id:l.source}), (m:Entity {id:l.target}) \"\n",
  | 137 | + "            + \"MERGE (n)-[:\"\n",
  | 138 | + "            + REL_TYPE\n",
  | 139 | + "            + \" {rel_id:$rel_id, text:$text}]->(m) \",\n",
  | 140 | + "            params={\"ll\": rel_type_dict[rel_id], \"rel_id\": rel_id, \"text\": rel_id_to_text_dict[rel_id]},\n",
  | 141 | + "        )\n",
  | 142 | + "\n",
133 | 143 | "\n",
134 | 144 | "process()"
135 | 145 | ]
…
157 | 167 | "    print(f\"Graph '{G.name()}' relationship types: {G.relationship_types()}\")\n",
158 | 168 | "    print(f\"Graph '{G.name()}' relationship count: {G.relationship_count()}\")\n",
159 | 169 | "\n",
  | 170 | + "\n",
160 | 171 | "def project_graph():\n",
161 | 172 | "    node_projection = {\"Entity\": {\"properties\": \"id\"}}\n",
162 | 173 | "    relationship_projection = [\n",
163 |  | - "        {\"TRAIN\" : {\"orientation\": \"NATURAL\", \"properties\": \"rel_id\"}},\n",
164 |  | - "        {\"TEST\" : {\"orientation\": \"NATURAL\", \"properties\": \"rel_id\"}},\n",
165 |  | - "        {\"VALID\" : {\"orientation\": \"NATURAL\", \"properties\": \"rel_id\"}},\n",
166 |  | - "        ]\n",
  | 174 | + "        {\"TRAIN\": {\"orientation\": \"NATURAL\", \"properties\": \"rel_id\"}},\n",
  | 175 | + "        {\"TEST\": {\"orientation\": \"NATURAL\", \"properties\": \"rel_id\"}},\n",
  | 176 | + "        {\"VALID\": {\"orientation\": \"NATURAL\", \"properties\": \"rel_id\"}},\n",
  | 177 | + "    ]\n",
167 | 178 | "    G, result = gds.graph.project(\n",
168 | 179 | "        \"fb15k-graph-ttv\",\n",
169 | 180 | "        node_projection,\n",
…
174 | 185 | "\n",
175 | 186 | "    return G\n",
176 | 187 | "\n",
  | 188 | + "\n",
177 | 189 | "ttv_G = project_graph()\n",
178 | 190 | "\n",
179 | 191 | "node_properties = gds.graph.nodeProperties.stream(\n",
…
193 | 205 | "outputs": [],
194 | 206 | "source": [
195 | 207 | "def create_data_from_graph(relationship_type):\n",
196 |  | - "    rels_tmp = gds.graph.relationshipProperties.stream(ttv_G, [\"rel_id\"], relationship_type, separate_property_columns=True)\n",
197 |  | - "    topology = [rels_tmp.sourceNodeId.map(lambda x: nodeId_to_id[x]), rels_tmp.targetNodeId.map(lambda x: nodeId_to_id[x])]\n",
  | 208 | + "    rels_tmp = gds.graph.relationshipProperties.stream(\n",
  | 209 | + "        ttv_G, [\"rel_id\"], relationship_type, separate_property_columns=True\n",
  | 210 | + "    )\n",
  | 211 | + "    topology = [\n",
  | 212 | + "        rels_tmp.sourceNodeId.map(lambda x: nodeId_to_id[x]),\n",
  | 213 | + "        rels_tmp.targetNodeId.map(lambda x: nodeId_to_id[x]),\n",
  | 214 | + "    ]\n",
198 | 215 | "    edge_index = torch.tensor(topology, dtype=torch.long)\n",
199 | 216 | "    edge_type = torch.tensor(rels_tmp.rel_id.astype(int), dtype=torch.long)\n",
200 | 217 | "    data = Data(edge_index=edge_index, edge_type=edge_type)\n",
201 | 218 | "    data.num_nodes = len(nodeId_to_id)\n",
202 | 219 | "    display(data)\n",
203 | 220 | "    return data\n",
204 | 221 | "\n",
  | 222 | + "\n",
205 | 223 | "train_tensor_data = create_data_from_graph(\"TRAIN\")\n",
206 | 224 | "test_tensor_data = create_data_from_graph(\"TEST\")\n",
207 | 225 | "val_tensor_data = create_data_from_graph(\"VALID\")"
…
223 | 241 | "outputs": [],
224 | 242 | "source": [
225 | 243 | "def train_model_with_pyg():\n",
226 |  | - "    device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
  | 244 | + "    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
227 | 245 | "\n",
228 | 246 | "    model = TransE(\n",
229 | 247 | "        num_nodes=train_tensor_data.num_nodes,\n",
…
241 | 259 | "\n",
242 | 260 | "    optimizer = optim.Adam(model.parameters(), lr=0.01)\n",
243 | 261 | "\n",
244 |  | - "\n",
245 | 262 | "    def train():\n",
246 | 263 | "        model.train()\n",
247 | 264 | "        total_loss = total_examples = 0\n",
…
254 | 271 | "            total_examples += head_index.numel()\n",
255 | 272 | "        return total_loss / total_examples\n",
256 | 273 | "\n",
257 |  | - "\n",
258 | 274 | "    @torch.no_grad()\n",
259 | 275 | "    def test(data):\n",
260 | 276 | "        model.eval()\n",
…
266 | 282 | "            k=10,\n",
267 | 283 | "        )\n",
268 | 284 | "\n",
269 |  | - "\n",
270 | 285 | "    # epoch_count = 501\n",
271 | 286 | "    epoch_count = 2\n",
272 | 287 | "    for epoch in range(1, epoch_count):\n",
273 | 288 | "        loss = train()\n",
274 |  | - "        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')\n",
  | 289 | + "        print(f\"Epoch: {epoch:03d}, Loss: {loss:.4f}\")\n",
275 | 290 | "        if epoch % 75 == 0:\n",
276 | 291 | "            rank, hits = test(val_tensor_data)\n",
277 |  | - "            print(f'Epoch: {epoch:03d}, Val Mean Rank: {rank:.2f}, '\n",
278 |  | - "                  f'Val Hits@10: {hits:.4f}')\n",
  | 292 | + "            print(f\"Epoch: {epoch:03d}, Val Mean Rank: {rank:.2f}, \" f\"Val Hits@10: {hits:.4f}\")\n",
279 | 293 | "\n",
280 | 294 | "    print(model)\n",
281 | 295 | "    # rank, hits_at_10 = test(test_tensor_data)\n",
…
307 | 321 | "    if i % 100 == 0:\n",
308 | 322 | "        print(f\"Node embeddings uploading: {i} of {len(nodeId_to_id)}\", end=\"\\r\")\n",
309 | 323 | "    gds.run_cypher(\n",
310 |  | - "            \"MATCH (n:Entity {id: $i}) SET n.emb = $EMBEDDING\",\n",
311 |  | - "            params={\n",
312 |  | - "                \"i\": i,\n",
313 |  | - "                \"EMBEDDING\": model.node_emb.weight[i].tolist()\n",
314 |  | - "            },\n",
315 |  | - "        )\n",
  | 324 | + "        \"MATCH (n:Entity {id: $i}) SET n.emb = $EMBEDDING\",\n",
  | 325 | + "        params={\"i\": i, \"EMBEDDING\": model.node_emb.weight[i].tolist()},\n",
  | 326 | + "    )\n",
316 | 327 | "print(f\"Node embeddings uploading has been finished\")"
317 | 328 | ]
318 | 329 | },
…
355 | 366 | "source": [
356 | 367 | "# 3. Project graph to test\n",
357 | 368 | "G_test, result = gds.graph.project(\n",
358 |  | - "    \"graph_to_test\",\n",
359 |  | - "    {\"Entity\": {\"properties\": [\"id\", \"emb\"] }},\n",
360 |  | - "    rel_label_to_predict,\n",
361 |  | - "    )\n",
  | 369 | + "    \"graph_to_test\",\n",
  | 370 | + "    {\"Entity\": {\"properties\": [\"id\", \"emb\"]}},\n",
  | 371 | + "    rel_label_to_predict,\n",
  | 372 | + ")\n",
362 | 373 | "print_graph_info(G_test)"
363 | 374 | ]
364 | 375 | },
…
369 | 380 | "outputs": [],
370 | 381 | "source": [
371 | 382 | "# 4. Set the model to predict\n",
372 |  | - "transe_model = gds.model.transe.create(\n",
373 |  | - "    G_test, \"emb\", {rel_label_to_predict: target_emb}\n",
374 |  | - ")"
  | 383 | + "transe_model = gds.model.transe.create(G_test, \"emb\", {rel_label_to_predict: target_emb})"
375 | 384 | ]
376 | 385 | },
377 | 386 | {
…
386 | 395 | "    target_node_filter=\"Entity\",\n",
387 | 396 | "    relationship_type=rel_label_to_predict,\n",
388 | 397 | "    top_k=3,\n",
389 |  | - "    concurrency=4\n",
  | 398 | + "    concurrency=4,\n",
390 | 399 | ")\n",
391 | 400 | "print(result)"
392 | 401 | ]
…
403 | 412 | "    target_node_filter=\"Entity\",\n",
404 | 413 | "    relationship_type=rel_label_to_predict,\n",
405 | 414 | "    top_k=3,\n",
406 |  | - "    concurrency=4\n",
  | 415 | + "    concurrency=4,\n",
407 | 416 | ")\n",
408 | 417 | "print(result)"
409 | 418 | ]
…
420 | 429 | "    target_node_filter=\"Entity\",\n",
421 | 430 | "    relationship_type=rel_label_to_predict,\n",
422 | 431 | "    top_k=3,\n",
423 |  | - "    concurrency=4\n",
  | 432 | + "    concurrency=4,\n",
424 | 433 | ")\n",
425 | 434 | "print(result)"
426 | 435 | ]
…
436 | 445 | "    source_node_filter=[id_to_nodeId[5], id_to_nodeId[10]],\n",
437 | 446 | "    target_node_filter=\"Entity\",\n",
438 | 447 | "    relationship_type=rel_label_to_predict,\n",
439 |  | - "    write_relationship_type=\"WRITTEN_2_\"+rel_label_to_predict,\n",
  | 448 | + "    write_relationship_type=\"WRITTEN_2_\" + rel_label_to_predict,\n",
440 | 449 | "    write_property=\"transe_score\",\n",
441 | 450 | "    top_k=3,\n",
442 |  | - "    concurrency=4\n",
  | 451 | + "    concurrency=4,\n",
443 | 452 | ")\n",
444 | 453 | "print(result)"
445 | 454 | ]
…
458 | 467 | "    mutate_relationship_type=\"MUT_WRITTEN_\" + rel_label_to_predict,\n",
459 | 468 | "    mutate_property=\"mut_transe_score\",\n",
460 | 469 | "    top_k=3,\n",
461 |  | - "    concurrency=4\n",
  | 470 | + "    concurrency=4,\n",
462 | 471 | ")\n",
463 | 472 | "print(result)"
464 | 473 | ]
…
469 | 478 | "metadata": {},
470 | 479 | "outputs": [],
471 | 480 | "source": [
472 |  | - "rr = gds.graph.relationshipProperties.stream(G_test, ['mut_transe_score'], \"MUT_WRITTEN_\" + rel_label_to_predict, separate_property_columns=True)\n",
  | 481 | + "rr = gds.graph.relationshipProperties.stream(\n",
  | 482 | + "    G_test, [\"mut_transe_score\"], \"MUT_WRITTEN_\" + rel_label_to_predict, separate_property_columns=True\n",
  | 483 | + ")\n",
473 | 484 | "print(rr)"
474 | 485 | ]
475 | 486 | },
…
484 | 495 | }
485 | 496 | ],
486 | 497 | "metadata": {
487 |  | - "kernelspec": {
488 |  | - "display_name": "Python 3",
489 |  | - "language": "python",
490 |  | - "name": "python3"
491 |  | - },
492 | 498 | "language_info": {
493 |  | - "codemirror_mode": {
494 |  | - "name": "ipython",
495 |  | - "version": 2
496 |  | - },
497 |  | - "file_extension": ".py",
498 |  | - "mimetype": "text/x-python",
499 |  | - "name": "python",
500 |  | - "nbconvert_exporter": "python",
501 |  | - "pygments_lexer": "ipython2",
502 |  | - "version": "2.7.6"
  | 499 | + "name": "python"
503 | 500 | }
504 | 501 | },
505 | 502 | "nbformat": 4,