Skip to content

Commit 6c317f4

Browse files
authored
Merge pull request #2 from hrshdhgd/cache-vector
Using `pystow` to persist ChromaDB so it's not created every time.
2 parents 20f025a + 14eff6e commit 6c317f4

File tree

6 files changed

+71
-37
lines changed

6 files changed

+71
-37
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -127,4 +127,3 @@ dmypy.json
127127

128128
# Pyre type checker
129129
.pyre/
130-
src/llm_change_agent/rag_docs/*.yaml

poetry.lock

Lines changed: 25 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ kgcl-schema = "^0.6.8"
1717
langchain-ollama = "^0.1.1"
1818
langchain-anthropic = "^0.1.22"
1919
langchain-chroma = "^0.1.3"
20+
pystow = "^0.5.4"
2021

2122
[tool.poetry.group.dev.dependencies]
2223
pytest = {version = ">=8.3.2"}

src/llm_change_agent/constants.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
from os import getenv
44

5+
import pystow
56
from importlib_metadata import files
67

78
OPENAI_KEY = str(getenv("OPENAI_API_KEY"))
@@ -24,3 +25,7 @@
2425
"https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/obophenotype_uberon/data_with_changes.yaml",
2526
"https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/pato-ontology_pato/data_with_changes.yaml",
2627
]
28+
29+
LLM_CHANGE_AGENT_MODULE = pystow.module("llm_change_agent")
30+
VECTOR_STORE = LLM_CHANGE_AGENT_MODULE.join("vector_store")
31+
VECTO_DB_PATH = VECTOR_STORE / "chroma.sqlite3"

src/llm_change_agent/rag_docs/.keep

Whitespace-only changes.

src/llm_change_agent/utils/llm_utils.py

Lines changed: 40 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,13 @@
11
"""Utility functions for the LLM Change Agent."""
22

3-
import os
4-
from pathlib import Path
53
from typing import Union
64

7-
import requests
85
import yaml
96
from langchain.agents import AgentExecutor
107
from langchain.agents.react.agent import create_react_agent
118
from langchain.tools.retriever import create_retriever_tool
129
from langchain_chroma import Chroma
10+
from langchain_community.document_loaders import WebBaseLoader
1311
from langchain_core.documents import Document
1412
from langchain_openai import OpenAIEmbeddings
1513
from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -23,13 +21,11 @@
2321
KGCL_SCHEMA,
2422
ONTODIFF_DOCS,
2523
OPENAI_KEY,
24+
VECTO_DB_PATH,
25+
VECTOR_STORE,
2626
)
2727
from llm_change_agent.templates.templates import get_issue_analyzer_template, grammar_explanation
2828

29-
PROJ_DIR = Path(__file__).parents[1].resolve()
30-
RAG_DOCS_DIR = PROJ_DIR / "rag_docs"
31-
os.makedirs(RAG_DOCS_DIR, exist_ok=True)
32-
3329

3430
def get_openai_models():
3531
"""Get the list of OpenAI models."""
@@ -156,33 +152,33 @@ def get_kgcl_grammar():
156152
return {"lark": lark_file, "explanation": grammar_notes}
157153

158154

159-
def get_diff_docs():
160-
"""Download the diff docs."""
161-
for url in ONTODIFF_DOCS:
162-
# Extract the document name from the URL
163-
doc_name = url.split("/")[-2]
164-
doc_path = RAG_DOCS_DIR / f"{doc_name}.yaml"
155+
# def get_diff_docs():
156+
# """Download the diff docs."""
157+
# for url in ONTODIFF_DOCS:
158+
# # Extract the document name from the URL
159+
# doc_name = url.split("/")[-2]
160+
# doc_path = RAG_DOCS_DIR / f"{doc_name}.yaml"
165161

166-
# Check if the file already exists
167-
if not doc_path.exists():
168-
try:
169-
# Download the content from the URL
170-
response = requests.get(url, timeout=10)
171-
response.raise_for_status() # Raise an error for bad status codes
162+
# # Check if the file already exists
163+
# if not doc_path.exists():
164+
# try:
165+
# # Download the content from the URL
166+
# response = requests.get(url, timeout=10)
167+
# response.raise_for_status() # Raise an error for bad status codes
172168

173-
# Write the content to the file
174-
with open(doc_path, "w") as doc_file:
175-
doc_file.write(response.text)
169+
# # Write the content to the file
170+
# with open(doc_path, "w") as doc_file:
171+
# doc_file.write(response.text)
176172

177-
print(f"Downloaded and saved: {doc_name}")
178-
yield response.text
173+
# print(f"Downloaded and saved: {doc_name}")
174+
# yield response.text
179175

180-
except requests.RequestException as e:
181-
print(f"Failed to download {url}: {e}")
182-
else:
183-
with open(doc_path, "r") as doc_file:
184-
print(f"Reading from file: {doc_name}")
185-
yield doc_file.read()
176+
# except requests.RequestException as e:
177+
# print(f"Failed to download {url}: {e}")
178+
# else:
179+
# with open(doc_path, "r") as doc_file:
180+
# print(f"Reading from file: {doc_name}")
181+
# yield doc_file.read()
186182

187183

188184
def split_documents(document: Union[str, Document]):
@@ -203,12 +199,21 @@ def execute_agent(llm, prompt):
203199
# docs_list = (
204200
# split_documents(str(schema)) + split_documents(grammar["lark"]) + split_documents(grammar["explanation"])
205201
# )
206-
docs_list = split_documents(grammar["lark"]) + split_documents(grammar["explanation"])
207-
# ! Comment the following 2 lines to speed up the execution.
208-
# diff_doc_generator = get_diff_docs()
209-
# docs_list = [split_doc for doc in diff_doc_generator for split_doc in split_documents(doc)]
202+
grammar_docs_list = split_documents(grammar["lark"]) + split_documents(grammar["explanation"])
203+
if VECTO_DB_PATH.exists():
204+
vectorstore = Chroma(
205+
embedding_function=OpenAIEmbeddings(show_progress_bar=True), persist_directory=str(VECTOR_STORE)
206+
)
207+
else:
208+
209+
list_of_doc_lists = [WebBaseLoader(url, show_progress=True).load() for url in ONTODIFF_DOCS]
210+
diff_docs_list = [split_doc for docs in list_of_doc_lists for doc in docs for split_doc in split_documents(doc)]
211+
docs_list = grammar_docs_list + diff_docs_list
212+
213+
vectorstore = Chroma.from_documents(
214+
documents=docs_list, embedding=OpenAIEmbeddings(show_progress_bar=True), persist_directory=str(VECTOR_STORE)
215+
)
210216

211-
vectorstore = Chroma.from_documents(documents=docs_list, embedding=OpenAIEmbeddings(show_progress_bar=True))
212217
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
213218
tool = create_retriever_tool(retriever, "change_agent_retriever", "Change Agent Retriever")
214219
tools = [tool]

0 commit comments

Comments
 (0)