Skip to content

DAGE-47: Add MtebWriter #200

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: dataset-generator
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions rre-dataset-generator/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ relevance_scale: "graded"
llm_configuration_file: "llm_config.yaml"

# Output format for the generated dataset
# Accepted values: quepid, rre
# Accepted values: quepid, rre, mteb
output_format: "quepid"

# For rre output format, you need other fields, e.g.:
Expand All @@ -67,8 +67,13 @@ output_format: "quepid"
# rre_query_placeholder: "$query"

# Path where the output dataset will be saved
# For rre: json format, for quepid: csv format
output_destination: "output/generated_dataset.csv"
# For rre, the output dataset will be saved as "ratings.json" inside output_destination
# For quepid, the output dataset will be saved as "quepid.csv" inside output_destination
# For mteb, three files will be saved inside output_destination: corpus.jsonl, queries.jsonl, and candidates.jsonl
#   corpus.jsonl contains <id,title,text> corpus records extracted from the search engine.
#   queries.jsonl contains <id,text> query records, LLM-generated and/or user-defined.
#   candidates.jsonl contains <query_id,doc_id,rating> candidate records.
output_destination: "output"

# (Optional) Whether to save LLM rating score explanation to file
# Default: false; set to true to save LLM rating explanation
Expand All @@ -77,3 +82,4 @@ save_llm_explanation: true
# (Optional**) File path where it contains <query, doc_id, rating, explanation> records.
# (**) When save_llm_explanation is set to True, this param needs to be present
llm_explanation_destination: "output/rating_explanation.json"

2 changes: 2 additions & 0 deletions rre-dataset-generator/dataset_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

# build factories
from src.llm.llm_provider_factory import LLMServiceFactory
from src.writers.mteb_writer import MtebWriter
from src.writers.writer_factory import WriterFactory
from src.search_engine.search_engine_factory import SearchEngineFactory

Expand Down Expand Up @@ -119,3 +120,4 @@ def add_cartesian_product_scores(config: Config, data_store: DataStore, llm_serv
if config.save_llm_explanation:
data_store.export_all_records_with_explanation(config.llm_explanation_destination)
log.info(f"Dataset with LLM explanation is saved into: {config.llm_explanation_destination}")

3 changes: 2 additions & 1 deletion rre-dataset-generator/src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class Config(BaseModel):
num_queries_needed: int = Field(..., gt=0, description="Total number of queries to generate.")
relevance_scale: Literal['binary', 'graded']
llm_configuration_file: FilePath = Field(..., description="Path to the LLM configuration file.")
output_format: Literal['quepid', 'rre']
output_format: Literal['quepid', 'rre', 'mteb']
output_destination: Path = Field(..., description="Path to save the output dataset.")
save_llm_explanation: Optional[bool] = False
llm_explanation_destination: Optional[Path] = Field(None, description="Path to save the LLM rating explanation")
Expand All @@ -34,6 +34,7 @@ class Config(BaseModel):
rre_query_template: FilePath = Field(None, description="Query template for rre evaluator.")
rre_query_placeholder: str = Field(None, description="Key-value pair to substitute in the rre query template.")


@field_validator('doc_fields')
def check_no_empty_fields(cls, v):
if any(not f.strip() for f in v):
Expand Down
4 changes: 2 additions & 2 deletions rre-dataset-generator/src/search_engine/data_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def get_document(self, doc_id: str) -> Optional[Document]:

def get_documents(self) -> List[Document]:
    """
    Return every stored Document as a list.
    """
    stored = self._documents.values()
    return list(stored)

Expand Down Expand Up @@ -206,7 +206,7 @@ def load_tmp_file_content(self) -> None:

def export_all_records_with_explanation(self, output_path: str | Path) -> None:
"""
Exports query-doc-rating-explanation tuples to a JSON file.
Exports query-doc_id-rating-explanation tuples to a JSON file.
"""
records = []
for query_context in self._queries_by_id.values():
Expand Down
17 changes: 13 additions & 4 deletions rre-dataset-generator/src/utils.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,32 @@
import argparse
import re
import html
from pathlib import Path

import re
from typing import Any

# Non-greedy pattern matching any HTML/XML-style tag, e.g. "<b>" or "</p>".
_TAG_REGEX = re.compile('<.*?>')


def parse_args():
    """
    Parse command-line arguments for the CLI.

    Returns:
        argparse.Namespace with:
            config_file (str): config file path [default: "config.yaml"]
            verbose (bool): enable debug logging [default: False]
    """
    # NOTE: the diff extraction left both the old and the corrected '-v'
    # add_argument lines in place (an unclosed call); only the corrected
    # line is kept here.
    parser = argparse.ArgumentParser(description='Parse arguments for CLI.')

    parser.add_argument('-c', '--config_file', type=str,
                        help='Config file path to use for the application [default: \"config.yaml\"]',
                        required=False, default="config.yaml")

    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Activate debug mode for logging [default: False]')

    return parser.parse_args()


def clean_text(text: str) -> str:
    """Strip HTML tags, collapse runs of whitespace, and unescape HTML entities."""
    no_tags = _TAG_REGEX.sub('', text).strip()
    collapsed = re.sub(r"\s{2,}", " ", no_tags)
    return html.unescape(collapsed)


def _to_string(value: Any) -> str:
if value is None:
return ""
if isinstance(value, (list, tuple)):
return " ".join(str(val) for val in value if val is not None)
return str(value)
89 changes: 89 additions & 0 deletions rre-dataset-generator/src/writers/mteb_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import json
import logging
import os
from pathlib import Path

from src.config import Config
from src.search_engine.data_store import DataStore
from src.utils import _to_string
from src.writers.abstract_writer import AbstractWriter

# Module-level logger named after this module, per logging convention.
log = logging.getLogger(__name__)


class MtebWriter(AbstractWriter):
    """
    Writes the generated dataset in MTEB format
    (https://github.com/embeddings-benchmark/mteb) as three JSONL files
    inside the output directory:

        corpus.jsonl     -- {"id", "title", "text"} documents from the search engine
        queries.jsonl    -- {"id", "text"} LLM-generated and/or user-defined queries
        candidates.jsonl -- {"query_id", "doc_id", "rating"} relevance judgments
    """

    @classmethod
    def build(cls, config: Config, data_store: DataStore):
        """Factory hook used by WriterFactory; `config` is accepted for
        interface uniformity but is not needed for MTEB output."""
        return cls(datastore=data_store)

    def _write_corpus(self, corpus_path: Path) -> None:
        """
        Writes corpus records extracted from the search engine to a JSONL file:
        {"id": <doc_id>, "title": <title>, "text": <description>}
        """
        with corpus_path.open("w", encoding="utf-8") as file:
            for doc in self.datastore.get_documents():
                # NOTE(review): only the "title" and "description" fields are
                # exported — presumably these match the configured doc_fields;
                # verify for configs that use other field names.
                row = {
                    "id": str(doc.id),
                    "title": _to_string(doc.fields.get("title")),
                    "text": _to_string(doc.fields.get("description")),
                }
                file.write(json.dumps(row, ensure_ascii=False) + "\n")

    def _write_queries(self, queries_path: Path) -> None:
        """
        Writes query records (LLM-generated and/or user-defined) to a JSONL file:
        {"id": <query_id>, "text": <query_text>}
        """
        with queries_path.open("w", encoding="utf-8") as file:
            for query_context in self.datastore.get_queries():
                row = {
                    "id": query_context.get_query_id(),
                    "text": query_context.get_query_text(),
                }
                file.write(json.dumps(row, ensure_ascii=False) + "\n")

    def _write_candidates(self, candidates_path: Path) -> None:
        """
        Writes rated (query, document) pairs to a JSONL file:
        {"query_id": <query_id>, "doc_id": <doc_id>, "rating": <rating_score>}

        Pairs without a rating score are skipped.
        """
        with candidates_path.open("w", encoding="utf-8") as file:
            for query_context in self.datastore.get_queries():
                query_id = query_context.get_query_id()
                for doc_id in query_context.get_doc_ids():
                    # Guard clause: skip unrated pairs.
                    if not query_context.has_rating_score(doc_id):
                        continue
                    row = {
                        "query_id": query_id,
                        "doc_id": doc_id,
                        "rating": query_context.get_rating_score(doc_id),
                    }
                    file.write(json.dumps(row, ensure_ascii=False) + "\n")

    def write(self, output_path: str | Path) -> None:
        """
        Write corpus.jsonl, queries.jsonl, and candidates.jsonl into
        `output_path`, creating the directory (and any parents) if needed.

        Raises:
            Any exception raised while writing; it is logged with a traceback
            and re-raised.
        """
        path = Path(output_path)
        # pathlib idiom; equivalent to os.makedirs(path, exist_ok=True)
        path.mkdir(parents=True, exist_ok=True)
        try:
            self._write_corpus(path / "corpus.jsonl")
            log.info("Corpus written successfully")

            self._write_queries(path / "queries.jsonl")
            log.info("Queries written successfully")

            self._write_candidates(path / "candidates.jsonl")
            log.info("Candidates written successfully")

        except Exception as e:
            log.exception("Failed to write MTEB files: %s", e)
            raise
4 changes: 2 additions & 2 deletions rre-dataset-generator/src/writers/quepid_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ def build(cls, config: Config, data_store: DataStore):

def write(self, output_path: str | Path) -> None:
"""
Writes queries and their scored documents to a CSV file in Quepid format.
Writes queries and their scored documents to quepid.csv file.
"""
output_path = Path(output_path)
output_path = Path(output_path) / "quepid.csv"
os.makedirs(output_path.parent, exist_ok=True)
with open(output_path, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
Expand Down
4 changes: 2 additions & 2 deletions rre-dataset-generator/src/writers/rre_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ def _build_json_doc_records(self) -> dict[str, Any]:

def write(self, output_path: str | Path) -> None:
"""
Writes queries and their ratings to json file in RRE format.
Writes queries and their ratings to ratings.json file in RRE format.
"""
output_path = Path(output_path)
output_path = Path(output_path) / "ratings.json"
os.makedirs(output_path.parent, exist_ok=True)
with open(output_path, 'w', newline='') as json_file:
log.debug("Started writing RRE formatted records to json file")
Expand Down
2 changes: 2 additions & 0 deletions rre-dataset-generator/src/writers/writer_factory.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .abstract_writer import AbstractWriter
from .mteb_writer import MtebWriter
from .quepid_writer import QuepidWriter
import logging
from src.search_engine.data_store import DataStore
Expand All @@ -12,6 +13,7 @@ class WriterFactory:
OUTPUT_FORMAT_REGISTRY = {
"quepid": QuepidWriter,
"rre": RreWriter,
"mteb": MtebWriter,
}

@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ num_queries_needed: 10
relevance_scale: "graded"
llm_configuration_file: "tests/unit/resources/llm_config.yaml"
output_format: "quepid"
output_destination: "output/generated_dataset.json"
output_destination: "output"

Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ num_queries_needed: 10
relevance_scale: "graded"
llm_configuration_file: "tests/unit/resources/llm_config.yaml"
output_format: "quepid"
output_destination: "output/generated_dataset.json"
output_destination: "output"
save_llm_explanation: true
llm_explanation_destination: "output/rating_explanation.json"
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ num_queries_needed: 10
relevance_scale: "graded"
llm_configuration_file: "tests/unit/resources/llm_config.yaml"
output_format: "quepid"
output_destination: "output/generated_dataset.json"
output_destination: "output"
15 changes: 15 additions & 0 deletions rre-dataset-generator/tests/unit/resources/mteb_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
query_template: "q=#$query##&fq=genre:horror&wt=json"
search_engine_type: "solr"
index_name: "testcore"
search_engine_collection_endpoint: "http://localhost:8983/solr/testcore/"
doc_number: 100
doc_fields:
- "title"
- "description"
queries: "tests/unit/resources/queries.txt"
generate_queries_from_documents: true
num_queries_needed: 10
relevance_scale: "graded"
llm_configuration_file: "tests/unit/resources/llm_config.yaml"
output_format: "mteb"
output_destination: "output"
3 changes: 2 additions & 1 deletion rre-dataset-generator/tests/unit/resources/rre_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ corpora_file: "tests/integration/solr-init/data/dataset.json"
id_field: "id"
rre_query_template: "tests/unit/resources/only_q.json"
rre_query_placeholder: "$query"
output_destination: "output/ratings.json"
output_destination: "output"

Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ num_queries_needed: 10
relevance_scale: "graded"
llm_configuration_file: "tests/unit/resources/llm_config.yaml"
output_format: "quepid"
output_destination: "output/generated_dataset.json"
output_destination: "output"


11 changes: 10 additions & 1 deletion rre-dataset-generator/tests/unit/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ def test_good_config_expect_all_parameters_read(config):
assert config.num_queries_needed == 10
assert config.relevance_scale == "graded"
assert config.llm_configuration_file == FilePath("tests/unit/resources/llm_config.yaml")
assert config.output_destination == Path("output/generated_dataset.json")
assert config.output_format == "quepid"
assert config.output_destination == Path("output")
assert config.save_llm_explanation is True
assert config.llm_explanation_destination == Path("output/rating_explanation.json")
assert config.index_name == "testcore"


def test_missing_optional_field_values():
path = "tests/unit/resources/missing_optional.yaml"
cfg = Config.load(path)
Expand Down Expand Up @@ -58,3 +60,10 @@ def test_file_not_found_raises_exception():
path = "tests/unit/resources/file_does_not_exist.yaml"
with pytest.raises(FileNotFoundError):
_ = Config.load(path)


def test_mteb_config_expect_successful_load():
    """The mteb sample config loads and selects the mteb output format."""
    cfg = Config.load("tests/unit/resources/mteb_config.yaml")
    assert cfg.output_format == "mteb"
    assert cfg.output_destination == Path("output")
Loading