Merge pull request #2 from hrshdhgd/cache-vector

hrshdhgd · web-flow · commit 6c317f4d456d · 2024-08-22T18:43:09.000-05:00
Use `pystow` to persist the ChromaDB vector store so that it is not re-created on every run.
diff --git a/.gitignore b/.gitignore
@@ -127,4 +127,3 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
-src/llm_change_agent/rag_docs/*.yaml
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,6 +17,7 @@ kgcl-schema = "^0.6.8"
 langchain-ollama = "^0.1.1"
 langchain-anthropic = "^0.1.22"
 langchain-chroma = "^0.1.3"
+pystow = "^0.5.4"
 
 [tool.poetry.group.dev.dependencies]
 pytest = {version = ">=8.3.2"}
diff --git a/src/llm_change_agent/constants.py b/src/llm_change_agent/constants.py
@@ -2,6 +2,7 @@
 
 from os import getenv
 
+import pystow
 from importlib_metadata import files
 
 OPENAI_KEY = str(getenv("OPENAI_API_KEY"))
@@ -24,3 +25,7 @@
     "https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/obophenotype_uberon/data_with_changes.yaml",
     "https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/pato-ontology_pato/data_with_changes.yaml",
 ]
+
+LLM_CHANGE_AGENT_MODULE = pystow.module("llm_change_agent")
+VECTOR_STORE = LLM_CHANGE_AGENT_MODULE.join("vector_store")
+VECTO_DB_PATH = VECTOR_STORE / "chroma.sqlite3"
diff --git a/src/llm_change_agent/rag_docs/.keep b/src/llm_change_agent/rag_docs/.keep
diff --git a/src/llm_change_agent/utils/llm_utils.py b/src/llm_change_agent/utils/llm_utils.py
@@ -1,15 +1,13 @@
 """Utility functions for the LLM Change Agent."""
 
-import os
-from pathlib import Path
 from typing import Union
 
-import requests
 import yaml
 from langchain.agents import AgentExecutor
 from langchain.agents.react.agent import create_react_agent
 from langchain.tools.retriever import create_retriever_tool
 from langchain_chroma import Chroma
+from langchain_community.document_loaders import WebBaseLoader
 from langchain_core.documents import Document
 from langchain_openai import OpenAIEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -23,13 +21,11 @@
     KGCL_SCHEMA,
     ONTODIFF_DOCS,
     OPENAI_KEY,
+    VECTO_DB_PATH,
+    VECTOR_STORE,
 )
 from llm_change_agent.templates.templates import get_issue_analyzer_template, grammar_explanation
 
-PROJ_DIR = Path(__file__).parents[1].resolve()
-RAG_DOCS_DIR = PROJ_DIR / "rag_docs"
-os.makedirs(RAG_DOCS_DIR, exist_ok=True)
-
 
 def get_openai_models():
     """Get the list of OpenAI models."""
@@ -156,33 +152,33 @@ def get_kgcl_grammar():
     return {"lark": lark_file, "explanation": grammar_notes}
 
 
-def get_diff_docs():
-    """Download the diff docs."""
-    for url in ONTODIFF_DOCS:
-        # Extract the document name from the URL
-        doc_name = url.split("/")[-2]
-        doc_path = RAG_DOCS_DIR / f"{doc_name}.yaml"
+# def get_diff_docs():
+#     """Download the diff docs."""
+#     for url in ONTODIFF_DOCS:
+#         # Extract the document name from the URL
+#         doc_name = url.split("/")[-2]
+#         doc_path = RAG_DOCS_DIR / f"{doc_name}.yaml"
 
-        # Check if the file already exists
-        if not doc_path.exists():
-            try:
-                # Download the content from the URL
-                response = requests.get(url, timeout=10)
-                response.raise_for_status()  # Raise an error for bad status codes
+#         # Check if the file already exists
+#         if not doc_path.exists():
+#             try:
+#                 # Download the content from the URL
+#                 response = requests.get(url, timeout=10)
+#                 response.raise_for_status()  # Raise an error for bad status codes
 
-                # Write the content to the file
-                with open(doc_path, "w") as doc_file:
-                    doc_file.write(response.text)
+#                 # Write the content to the file
+#                 with open(doc_path, "w") as doc_file:
+#                     doc_file.write(response.text)
 
-                print(f"Downloaded and saved: {doc_name}")
-                yield response.text
+#                 print(f"Downloaded and saved: {doc_name}")
+#                 yield response.text
 
-            except requests.RequestException as e:
-                print(f"Failed to download {url}: {e}")
-        else:
-            with open(doc_path, "r") as doc_file:
-                print(f"Reading from file: {doc_name}")
-                yield doc_file.read()
+#             except requests.RequestException as e:
+#                 print(f"Failed to download {url}: {e}")
+#         else:
+#             with open(doc_path, "r") as doc_file:
+#                 print(f"Reading from file: {doc_name}")
+#                 yield doc_file.read()
 
 
 def split_documents(document: Union[str, Document]):
@@ -203,12 +199,21 @@ def execute_agent(llm, prompt):
     # docs_list = (
     #     split_documents(str(schema)) + split_documents(grammar["lark"]) + split_documents(grammar["explanation"])
     # )
-    docs_list = split_documents(grammar["lark"]) + split_documents(grammar["explanation"])
-    # ! Comment the following 2 lines to speed up the execution.
-    # diff_doc_generator = get_diff_docs()
-    # docs_list = [split_doc for doc in diff_doc_generator for split_doc in split_documents(doc)]
+    grammar_docs_list = split_documents(grammar["lark"]) + split_documents(grammar["explanation"])
+    if VECTO_DB_PATH.exists():
+        vectorstore = Chroma(
+            embedding_function=OpenAIEmbeddings(show_progress_bar=True), persist_directory=str(VECTOR_STORE)
+        )
+    else:
+
+        list_of_doc_lists = [WebBaseLoader(url, show_progress=True).load() for url in ONTODIFF_DOCS]
+        diff_docs_list = [split_doc for docs in list_of_doc_lists for doc in docs for split_doc in split_documents(doc)]
+        docs_list = grammar_docs_list + diff_docs_list
+
+        vectorstore = Chroma.from_documents(
+            documents=docs_list, embedding=OpenAIEmbeddings(show_progress_bar=True), persist_directory=str(VECTOR_STORE)
+        )
 
-    vectorstore = Chroma.from_documents(documents=docs_list, embedding=OpenAIEmbeddings(show_progress_bar=True))
     retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
     tool = create_retriever_tool(retriever, "change_agent_retriever", "Change Agent Retriever")
     tools = [tool]

Original file line number	Diff line number	Diff line change
`@@ -127,4 +127,3 @@ dmypy.json`
`127`	`127`
`128`	`128`	`# Pyre type checker`
`129`	`129`	`.pyre/`
`130`		`-src/llm_change_agent/rag_docs/*.yaml`