Skip to content

Commit 20f025a

Browse files
authored
Merge pull request #1 from hrshdhgd/rag-with-doiff-docs
Added ontodiff yaml docs for RAG
2 parents 5e91c19 + 4f5a5e7 commit 20f025a

File tree

6 files changed

+270
-183
lines changed

6 files changed

+270
-183
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,4 @@ dmypy.json
127127

128128
# Pyre type checker
129129
.pyre/
130+
src/llm_change_agent/rag_docs/*.yaml

poetry.lock

Lines changed: 194 additions & 174 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/llm_change_agent/constants.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,12 @@
1515

1616
KGCL_SCHEMA = [file for file in files("kgcl-schema") if file.stem == "kgcl" and file.suffix == ".yaml"][0]
1717
KGCL_GRAMMAR = [file for file in files("kgcl-schema") if file.stem == "kgcl" and file.suffix == ".lark"][0]
18+
19+
ONTODIFF_DOCS = [
20+
"https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/EnvironmentOntology_envo/data_with_changes.yaml",
21+
"https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/geneontology_go-ontology/data_with_changes.yaml",
22+
"https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/monarch-initiative_mondo/data_with_changes.yaml",
23+
"https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/obophenotype_cell-ontology/data_with_changes.yaml",
24+
"https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/obophenotype_uberon/data_with_changes.yaml",
25+
"https://raw.githubusercontent.com/hrshdhgd/ontodiff-curator/main/pato-ontology_pato/data_with_changes.yaml",
26+
]

src/llm_change_agent/rag_docs/.keep

Whitespace-only changes.

src/llm_change_agent/utils/llm_utils.py

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
"""Utility functions for the LLM Change Agent."""
22

3+
import os
4+
from pathlib import Path
5+
from typing import Union
6+
7+
import requests
38
import yaml
49
from langchain.agents import AgentExecutor
510
from langchain.agents.react.agent import create_react_agent
@@ -11,9 +16,20 @@
1116
from openai import OpenAI
1217

1318
from llm_change_agent.config.llm_config import AnthropicConfig, CBORGConfig, LLMConfig, OllamaConfig, OpenAIConfig
14-
from llm_change_agent.constants import ANTHROPIC_KEY, CBORG_KEY, KGCL_GRAMMAR, KGCL_SCHEMA, OPENAI_KEY
19+
from llm_change_agent.constants import (
20+
ANTHROPIC_KEY,
21+
CBORG_KEY,
22+
KGCL_GRAMMAR,
23+
KGCL_SCHEMA,
24+
ONTODIFF_DOCS,
25+
OPENAI_KEY,
26+
)
1527
from llm_change_agent.templates.templates import get_issue_analyzer_template, grammar_explanation
1628

29+
PROJ_DIR = Path(__file__).parents[1].resolve()
30+
RAG_DOCS_DIR = PROJ_DIR / "rag_docs"
31+
os.makedirs(RAG_DOCS_DIR, exist_ok=True)
32+
1733

1834
def get_openai_models():
1935
"""Get the list of OpenAI models."""
@@ -140,9 +156,41 @@ def get_kgcl_grammar():
140156
return {"lark": lark_file, "explanation": grammar_notes}
141157

142158

143-
def split_documents(document: str):
159+
def get_diff_docs():
    """Yield the text of each ontodiff YAML document, downloading it on first use.

    Every URL in ``ONTODIFF_DOCS`` is cached as ``RAG_DOCS_DIR / "<name>.yaml"``,
    where ``<name>`` is the second-to-last path segment of the URL. If the cached
    file exists it is read from disk; otherwise the URL is fetched, saved and its
    text yielded. A failed download is reported and skipped, so the generator may
    yield fewer items than there are URLs.

    :return: Generator of document contents as strings.
    """
    for url in ONTODIFF_DOCS:
        # The last URL segment is the file name; the segment before it is used as the cache name.
        doc_name = url.split("/")[-2]
        doc_path = RAG_DOCS_DIR / f"{doc_name}.yaml"

        if doc_path.exists():
            # Explicit UTF-8 so reading matches how the file was written below.
            cached_text = doc_path.read_text(encoding="utf-8")
            print(f"Reading from file: {doc_name}")
            yield cached_text
            continue

        try:
            # Only the network call is guarded; file errors should surface to the caller.
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise an error for bad status codes
        except requests.RequestException as e:
            print(f"Failed to download {url}: {e}")
            continue

        # Explicit UTF-8 so the cache does not depend on the platform default encoding.
        doc_path.write_text(response.text, encoding="utf-8")
        print(f"Downloaded and saved: {doc_name}")
        yield response.text
186+
187+
188+
def split_documents(document: Union[str, Document]):
    """Chunk a raw string or an existing ``Document`` into overlapping splits.

    :param document: Text, or a ``Document``, to be chunked.
    :return: Splits produced by a ``RecursiveCharacterTextSplitter`` configured
        with ``chunk_size=1000`` and ``chunk_overlap=200``.
    """
    # Plain strings are wrapped so the splitter always receives Document objects.
    wrapped = document if isinstance(document, Document) else Document(page_content=document)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return text_splitter.split_documents((wrapped,))
@@ -155,9 +203,12 @@ def execute_agent(llm, prompt):
155203
# docs_list = (
156204
# split_documents(str(schema)) + split_documents(grammar["lark"]) + split_documents(grammar["explanation"])
157205
# )
158-
159206
docs_list = split_documents(grammar["lark"]) + split_documents(grammar["explanation"])
160-
vectorstore = Chroma.from_documents(documents=docs_list, embedding=OpenAIEmbeddings())
207+
# ! Uncomment the following 2 lines to build the vector store from the ontodiff docs instead (slower execution).
208+
# diff_doc_generator = get_diff_docs()
209+
# docs_list = [split_doc for doc in diff_doc_generator for split_doc in split_documents(doc)]
210+
211+
vectorstore = Chroma.from_documents(documents=docs_list, embedding=OpenAIEmbeddings(show_progress_bar=True))
161212
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
162213
tool = create_retriever_tool(retriever, "change_agent_retriever", "Change Agent Retriever")
163214
tools = [tool]

tox.ini

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ deps =
3939
skip_install = true
4040
commands =
4141
black src/ tests/ --exclude "/(tests/input|tests/output)/"
42-
ruff check --fix src/ tests/ --exclude tests/input --exclude tests/output --exclude src/llm_change_agent/templates/
42+
ruff check --fix src/ tests/ --exclude tests/input \
43+
--exclude tests/output \
44+
--exclude src/llm_change_agent/templates/
45+
4346
description = Run linters.
4447

4548
# This is used for QC checks.
@@ -50,7 +53,10 @@ deps =
5053
skip_install = true
5154
commands =
5255
black --check --diff src/ tests/ --exclude "/(tests/input|tests/output)/"
53-
ruff check src/ tests/ --exclude tests/input --exclude tests/output --exclude src/llm_change_agent/templates/
56+
ruff check src/ tests/ --exclude tests/input \
57+
--exclude tests/output \
58+
--exclude src/llm_change_agent/templates/
59+
5460
description = Run linters.
5561

5662
[testenv:doclint]
@@ -67,15 +73,15 @@ skip_install = true
6773
deps =
6874
codespell
6975
tomli # required for getting config from pyproject.toml
70-
commands = codespell src/ tests/ -S tests/input/,tests/output/
76+
commands = codespell src/ tests/ -S tests/input/,tests/output/,src/llm_change_agent/rag_docs/*
7177

7278
[testenv:codespell-write]
7379
description = Run spell checker and write corrections.
7480
skip_install = true
7581
deps =
7682
codespell
7783
tomli
78-
commands = codespell src/ tests/ --write-changes -S tests/input/,tests/output/
84+
commands = codespell src/ tests/ --write-changes -S tests/input/,tests/output/,src/llm_change_agent/rag_docs/*
7985

8086
[testenv:docstr-coverage]
8187
skip_install = true

0 commit comments

Comments
 (0)