Merge pull request #1 from hrshdhgd/rag-with-doiff-docs

hrshdhgd · web-flow · commit 20f025a1da84 · 2024-08-22T17:24:49.000-05:00
Added ontodiff yaml docs for RAG
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+src/llm_change_agent/rag_docs/*.yaml
diff --git a/poetry.lock b/poetry.lock
diff --git a/src/llm_change_agent/constants.py b/src/llm_change_agent/constants.py
@@ -15,3 +15,12 @@
 
 KGCL_SCHEMA = [file for file in files("kgcl-schema") if file.stem == "kgcl" and file.suffix == ".yaml"][0]
 KGCL_GRAMMAR = [file for file in files("kgcl-schema") if file.stem == "kgcl" and file.suffix == ".lark"][0]
+
+ONTODIFF_DOCS = [
+    "https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/EnvironmentOntology_envo/data_with_changes.yaml",
+    "https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/geneontology_go-ontology/data_with_changes.yaml",
+    "https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/monarch-initiative_mondo/data_with_changes.yaml",
+    "https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/obophenotype_cell-ontology/data_with_changes.yaml",
+    "https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/obophenotype_uberon/data_with_changes.yaml",
+    "https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/pato-ontology_pato/data_with_changes.yaml",
+]
diff --git a/src/llm_change_agent/rag_docs/.keep b/src/llm_change_agent/rag_docs/.keep
diff --git a/src/llm_change_agent/utils/llm_utils.py b/src/llm_change_agent/utils/llm_utils.py
@@ -1,5 +1,10 @@
 """Utility functions for the LLM Change Agent."""
 
+import os
+from pathlib import Path
+from typing import Union
+
+import requests
 import yaml
 from langchain.agents import AgentExecutor
 from langchain.agents.react.agent import create_react_agent
@@ -11,9 +16,20 @@
 from openai import OpenAI
 
 from llm_change_agent.config.llm_config import AnthropicConfig, CBORGConfig, LLMConfig, OllamaConfig, OpenAIConfig
-from llm_change_agent.constants import ANTHROPIC_KEY, CBORG_KEY, KGCL_GRAMMAR, KGCL_SCHEMA, OPENAI_KEY
+from llm_change_agent.constants import (
+    ANTHROPIC_KEY,
+    CBORG_KEY,
+    KGCL_GRAMMAR,
+    KGCL_SCHEMA,
+    ONTODIFF_DOCS,
+    OPENAI_KEY,
+)
 from llm_change_agent.templates.templates import get_issue_analyzer_template, grammar_explanation
 
+PROJ_DIR = Path(__file__).parents[1].resolve()
+RAG_DOCS_DIR = PROJ_DIR / "rag_docs"
+os.makedirs(RAG_DOCS_DIR, exist_ok=True)
+
 
 def get_openai_models():
     """Get the list of OpenAI models."""
@@ -140,9 +156,41 @@ def get_kgcl_grammar():
     return {"lark": lark_file, "explanation": grammar_notes}
 
 
-def split_documents(document: str):
+def get_diff_docs():
+    """Yield the text of each ontodiff doc, downloading and caching it in RAG_DOCS_DIR if not already present."""
+    for url in ONTODIFF_DOCS:
+        # Extract the document name from the URL
+        doc_name = url.split("/")[-2]
+        doc_path = RAG_DOCS_DIR / f"{doc_name}.yaml"
+
+        # Check if the file already exists
+        if not doc_path.exists():
+            try:
+                # Download the content from the URL
+                response = requests.get(url, timeout=10)
+                response.raise_for_status()  # Raise an error for bad status codes
+
+                # Write the content to the file
+                with open(doc_path, "w") as doc_file:
+                    doc_file.write(response.text)
+
+                print(f"Downloaded and saved: {doc_name}")
+                yield response.text
+
+            except requests.RequestException as e:
+                print(f"Failed to download {url}: {e}")
+        else:
+            with open(doc_path, "r") as doc_file:
+                print(f"Reading from file: {doc_name}")
+                yield doc_file.read()
+
+
+def split_documents(document: Union[str, Document]):
     """Split the document into a list of documents."""
-    doc_object = (Document(page_content=document),)
+    if isinstance(document, Document):
+        doc_object = (document,)
+    else:
+        doc_object = (Document(page_content=document),)
     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
     splits = splitter.split_documents(doc_object)
     return splits
@@ -155,9 +203,12 @@ def execute_agent(llm, prompt):
     # docs_list = (
     #     split_documents(str(schema)) + split_documents(grammar["lark"]) + split_documents(grammar["explanation"])
     # )
-
     docs_list = split_documents(grammar["lark"]) + split_documents(grammar["explanation"])
-    vectorstore = Chroma.from_documents(documents=docs_list, embedding=OpenAIEmbeddings())
+    # ! Uncomment the following 2 lines to embed the ontodiff docs instead (this slows down the execution).
+    # diff_doc_generator = get_diff_docs()
+    # docs_list = [split_doc for doc in diff_doc_generator for split_doc in split_documents(doc)]
+
+    vectorstore = Chroma.from_documents(documents=docs_list, embedding=OpenAIEmbeddings(show_progress_bar=True))
     retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
     tool = create_retriever_tool(retriever, "change_agent_retriever", "Change Agent Retriever")
     tools = [tool]
diff --git a/tox.ini b/tox.ini
@@ -39,7 +39,10 @@ deps =
 skip_install = true
 commands =
     black src/ tests/ --exclude "/(tests/input|tests/output)/"
-    ruff check --fix src/ tests/ --exclude tests/input --exclude tests/output --exclude src/llm_change_agent/templates/
+    ruff check --fix src/ tests/ --exclude tests/input \
+                                --exclude tests/output \
+                                --exclude src/llm_change_agent/templates/
+
 description = Run linters.
 
 # This is used for QC checks.
@@ -50,7 +53,10 @@ deps =
 skip_install = true
 commands =
     black --check --diff src/ tests/ --exclude "/(tests/input|tests/output)/"
-    ruff check src/ tests/ --exclude tests/input --exclude tests/output --exclude src/llm_change_agent/templates/
+    ruff check src/ tests/ --exclude tests/input \
+                        --exclude tests/output \
+                        --exclude src/llm_change_agent/templates/ 
+
 description = Run linters.
 
 [testenv:doclint]
@@ -67,15 +73,15 @@ skip_install = true
 deps =
     codespell
     tomli  # required for getting config from pyproject.toml
-commands = codespell src/ tests/ -S tests/input/,tests/output/
+commands = codespell src/ tests/ -S tests/input/,tests/output/,src/llm_change_agent/rag_docs/*
 
 [testenv:codespell-write]
 description = Run spell checker and write corrections.
 skip_install = true
 deps =
     codespell
     tomli
-commands = codespell src/ tests/ --write-changes -S tests/input/,tests/output/
+commands = codespell src/ tests/ --write-changes -S tests/input/,tests/output/,src/llm_change_agent/rag_docs/*
 
 [testenv:docstr-coverage]
 skip_install = true

Original file line number	Diff line number	Diff line change
`@@ -127,3 +127,4 @@ dmypy.json`
`127`	`127`
`128`	`128`	`# Pyre type checker`
`129`	`129`	`.pyre/`
	`130`	`+src/llm_change_agent/rag_docs/*.yaml`