Skip to content

DAGE-47: Add MtebWriter #200

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: dataset-generator
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions rre-dataset-generator/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ uv.lock
# Local Files and Directories
.output_dataset
output/
data/

# macOS metadata files
.DS_Store
Expand Down
7 changes: 7 additions & 0 deletions rre-dataset-generator/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,10 @@ save_llm_explanation: true
# (Optional**) Path of the file that will contain <query, doc_id, rating, explanation> records.
# (**) When save_llm_explanation is set to True, this param needs to be present
llm_explanation_destination: "output/rating_explanation.json"

# Directory path for MTEB output, into which the following three files will be written: corpus.jsonl, queries.jsonl, and candidates.jsonl
# corpus.jsonl contains <id,title,text> corpus records extracted from the search engine.
# queries.jsonl contains <id,text> query records that are LLM-generated and/or user-defined.
# candidates.jsonl contains <query_id,doc_id,rating> candidate records.
mteb_destination: "data"

5 changes: 5 additions & 0 deletions rre-dataset-generator/dataset_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

# build factories
from src.llm.llm_provider_factory import LLMServiceFactory
from src.writers.mteb_writer import MtebWriter
from src.writers.writer_factory import WriterFactory
from src.search_engine.search_engine_factory import SearchEngineFactory

Expand Down Expand Up @@ -96,6 +97,7 @@ def add_cartesian_product_scores(config: Config, data_store: DataStore, llm_serv
llm: BaseChatModel = LLMServiceFactory.build(LLMConfig.load(config.llm_configuration_file))
service: LLMService = LLMService(chat_model=llm)
writer: AbstractWriter = WriterFactory.build(config, data_store)
mteb_writer: MtebWriter = MtebWriter.build(config, data_store)

# pipeline starts
add_user_queries(config, data_store)
Expand All @@ -119,3 +121,6 @@ def add_cartesian_product_scores(config: Config, data_store: DataStore, llm_serv
if config.save_llm_explanation:
data_store.export_all_records_with_explanation(config.llm_explanation_destination)
log.info(f"Dataset with LLM explanation is saved into: {config.llm_explanation_destination}")

mteb_writer.write(config.mteb_destination)
log.info(f"MTEB candidates are saved into: {config.mteb_destination}")
1 change: 1 addition & 0 deletions rre-dataset-generator/src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class Config(BaseModel):
id_field: str = Field(None, description="ID field for the unique key.")
rre_query_template: FilePath = Field(None, description="Query template for rre evaluator.")
rre_query_placeholder: str = Field(None, description="Key-value pair to substitute in the rre query template.")
mteb_destination: Path = Field(..., description="File path for MTEB data")

@field_validator('doc_fields')
def check_no_empty_fields(cls, v):
Expand Down
4 changes: 2 additions & 2 deletions rre-dataset-generator/src/search_engine/data_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def get_document(self, doc_id: str) -> Optional[Document]:

def get_documents(self) -> List[Document]:
    """
    Return every stored Document as a new list.

    The returned list is a snapshot: mutating it does not affect
    the underlying document store.
    """
    return [*self._documents.values()]

Expand Down Expand Up @@ -206,7 +206,7 @@ def load_tmp_file_content(self) -> None:

def export_all_records_with_explanation(self, output_path: str | Path) -> None:
"""
Exports query-doc-rating-explanation tuples to a JSON file.
Exports query-doc_id-rating-explanation tuples to a JSON file.
"""
records = []
for query_context in self._queries_by_id.values():
Expand Down
17 changes: 13 additions & 4 deletions rre-dataset-generator/src/utils.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,32 @@
import argparse
import re
import html
from pathlib import Path

import re
from typing import Any

_TAG_REGEX = re.compile('<.*?>')


def parse_args():
    """Parse CLI arguments: optional config-file path and verbose flag."""
    arg_parser = argparse.ArgumentParser(description='Parse arguments for CLI.')

    # Path to the YAML configuration; falls back to the conventional name.
    arg_parser.add_argument(
        '-c', '--config_file', type=str, required=False, default="config.yaml",
        help='Config file path to use for the application [default: "config.yaml"]')

    # Debug-level logging toggle.
    arg_parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='Activate debug mode for logging [default: False]')

    return arg_parser.parse_args()


def clean_text(text: str) -> str:
    """
    Normalize raw field text: strip HTML tags, collapse runs of two or
    more whitespace characters into a single space, then unescape HTML
    entities (e.g. "&amp;" -> "&").
    """
    without_tags = _TAG_REGEX.sub('', text).strip()
    collapsed = re.sub(r"\s{2,}", " ", without_tags)
    return html.unescape(collapsed)


def _to_string(value: Any) -> str:
    """
    Coerce an arbitrary field value to a single string.

    None becomes ""; lists/tuples are space-joined (None items dropped);
    anything else is passed through str().
    """
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        parts = (str(item) for item in value if item is not None)
        return " ".join(parts)
    return str(value)
95 changes: 95 additions & 0 deletions rre-dataset-generator/src/writers/mteb_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import json
import logging
import os
from pathlib import Path

from src.config import Config
from src.search_engine.data_store import DataStore
from src.utils import _to_string
from src.writers.abstract_writer import AbstractWriter

log = logging.getLogger(__name__)


class MtebWriter(AbstractWriter):
    """
    Writes dataset artifacts in MTEB JSONL format: corpus, queries, and
    candidates. See https://github.com/embeddings-benchmark/mteb

    Corpus format:     id, title, text
    Queries format:    id, text
    Candidates format: query_id, doc_id, rating
    """

    @classmethod
    def build(cls, config: Config, data_store: DataStore) -> "MtebWriter":
        """Factory hook matching the WriterFactory convention; config is unused."""
        return cls(datastore=data_store)

    def _write_corpus(self, output_path: Path) -> None:
        """
        Writes corpus records extracted from the search engine to JSONL:
        {"id": <doc_id>, "title": <title>, "text": <description>}

        Assumes write() has already created the parent directory.
        """
        with output_path.open("w", encoding="utf-8") as file:
            for doc in self.datastore.get_documents():
                # NOTE(review): assumes documents expose "title" and
                # "description" fields — confirm against doc_fields config.
                row = {
                    "id": str(doc.id),
                    "title": _to_string(doc.fields.get("title")),
                    "text": _to_string(doc.fields.get("description")),
                }
                file.write(json.dumps(row, ensure_ascii=False) + "\n")

    def _write_queries(self, output_path: Path) -> None:
        """
        Writes LLM-generated and/or user-defined query records to JSONL:
        {"id": <query_id>, "text": <query_text>}

        Assumes write() has already created the parent directory.
        """
        with output_path.open("w", encoding="utf-8") as file:
            for query_context in self.datastore.get_queries():
                row = {
                    "id": query_context.get_query_id(),
                    "text": query_context.get_query_text(),
                }
                file.write(json.dumps(row, ensure_ascii=False) + "\n")

    def _write_candidates(self, output_path: Path) -> None:
        """
        Writes one candidate record per rated (query, doc) pair to JSONL:
        {"query_id": <query_id>, "doc_id": <doc_id>, "rating": <rating_score>}

        Documents without a rating score are skipped.
        Assumes write() has already created the parent directory.
        """
        with output_path.open("w", encoding="utf-8") as file:
            for query_context in self.datastore.get_queries():
                query_id = query_context.get_query_id()
                for doc_id in query_context.get_doc_ids():
                    if not query_context.has_rating_score(doc_id):
                        continue
                    row = {
                        "query_id": query_id,
                        "doc_id": doc_id,
                        "rating": query_context.get_rating_score(doc_id),
                    }
                    file.write(json.dumps(row, ensure_ascii=False) + "\n")

    def write(self, output_path: str | Path) -> None:
        """
        Write corpus.jsonl, queries.jsonl, and candidates.jsonl for MTEB
        under output_path, creating the directory if needed.

        Raises:
            Exception: re-raised after logging when any file fails to write.
        """
        path = Path(output_path)
        # Single directory-creation point; the private writers rely on it.
        path.mkdir(parents=True, exist_ok=True)
        try:
            self._write_corpus(path / "corpus.jsonl")
            log.info("Corpus written successfully")

            self._write_queries(path / "queries.jsonl")
            log.info("Queries written successfully")

            self._write_candidates(path / "candidates.jsonl")
            log.info("Candidates written successfully")
        except Exception:
            # log.exception already records the traceback and message.
            log.exception("Failed to write MTEB files")
            raise
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ relevance_scale: "graded"
llm_configuration_file: "tests/unit/resources/llm_config.yaml"
output_format: "quepid"
output_destination: "output/generated_dataset.json"
mteb_destination: "data"

3 changes: 3 additions & 0 deletions rre-dataset-generator/tests/unit/resources/good_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ output_format: "quepid"
output_destination: "output/generated_dataset.json"
save_llm_explanation: true
llm_explanation_destination: "output/rating_explanation.json"
mteb_destination: "data"


Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ relevance_scale: "graded"
llm_configuration_file: "tests/unit/resources/llm_config.yaml"
output_format: "quepid"
output_destination: "output/generated_dataset.json"
mteb_destination: "data"


Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,6 @@ id_field: "id"
corpora_file: "tests/integration/solr-init/data/dataset.json"
rre_query_template: "tests/unit/resources/only_q.json"
rre_query_placeholder: "$query"
mteb_destination: "data"


3 changes: 3 additions & 0 deletions rre-dataset-generator/tests/unit/resources/rre_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@ id_field: "id"
rre_query_template: "tests/unit/resources/only_q.json"
rre_query_placeholder: "$query"
output_destination: "output/ratings.json"
mteb_destination: "data"


Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ relevance_scale: "graded"
llm_configuration_file: "tests/unit/resources/llm_config.yaml"
output_format: "quepid"
output_destination: "output/generated_dataset.json"
mteb_destination: "data"


2 changes: 2 additions & 0 deletions rre-dataset-generator/tests/unit/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def test_good_config_expect_all_parameters_read(config):
assert config.save_llm_explanation is True
assert config.llm_explanation_destination == Path("output/rating_explanation.json")
assert config.index_name == "testcore"
assert config.mteb_destination == Path("data")


def test_missing_optional_field_values():
path = "tests/unit/resources/missing_optional.yaml"
Expand Down
96 changes: 96 additions & 0 deletions rre-dataset-generator/tests/unit/writers/test_mteb_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import json
from pathlib import Path

import pytest

from src.config import Config
from src.search_engine.data_store import DataStore
from src.writers.mteb_writer import MtebWriter


@pytest.fixture
def config():
    """Provides a Config parsed from the known-good test resource."""
    resource = "tests/unit/resources/good_config.yaml"
    return Config.load(resource)


@pytest.fixture
def populated_datastore() -> DataStore:
    """Builds a DataStore pre-loaded with three queries and mixed ratings."""
    store = DataStore()

    # First query: two documents, both rated.
    first_query_id = store.add_query("test query 1", "doc1")
    store.add_query("test query 1", "doc2")
    for rated_doc in ("doc1", "doc2"):
        store.add_rating_score(first_query_id, rated_doc, 1)

    # Second query: a single rated document.
    second_query_id = store.add_query("test query 2", "doc4")
    store.add_rating_score(second_query_id, "doc4", 2)

    # Third query: one document, no rating at all.
    store.add_query("test query 3", "doc5")

    return store


class TestMtebWriter:
    """Unit tests for MtebWriter's JSONL output files."""

    def test_write_expect_written_to_jsonl(self, config, populated_datastore, tmp_path: Path):
        # End-to-end check: write() must create all three MTEB files and
        # each must match its expected schema and the datastore contents.
        output_dir = tmp_path / "data"
        writer = MtebWriter(populated_datastore)

        writer.write(output_dir)

        corpus_file = output_dir / "corpus.jsonl"
        queries_file = output_dir / "queries.jsonl"
        candidates_file = output_dir / "candidates.jsonl"

        # All three files must exist even when some sections are empty.
        assert corpus_file.exists()
        assert queries_file.exists()
        assert candidates_file.exists()

        # --- corpus.jsonl: one row per stored document, schema {id, title, text} ---
        lines = corpus_file.read_text(encoding="utf-8").splitlines()
        rows = [json.loads(line) for line in lines if line.strip()]

        docs = populated_datastore.get_documents()
        assert len(rows) == len(docs)

        for row in rows:
            assert set(row.keys()) == {"id", "title", "text"}
            assert isinstance(row["id"], str)
            assert isinstance(row["title"], str)
            assert isinstance(row["text"], str)

        # --- queries.jsonl: one row per query, schema {id, text} ---
        lines = queries_file.read_text(encoding="utf-8").splitlines()
        rows = [json.loads(line) for line in lines if line.strip()]

        queries = populated_datastore.get_queries()
        assert len(rows) == len(queries)

        for row in rows:
            assert set(row.keys()) == {"id", "text"}
            assert isinstance(row["id"], str)
            assert isinstance(row["text"], str)

        # --- candidates.jsonl: only rated (query, doc) pairs are written ---
        lines = candidates_file.read_text(encoding="utf-8").splitlines()
        rows = [json.loads(line) for line in lines if line.strip()]

        # Rebuild the expected set directly from the datastore so the test
        # stays valid if the fixture data changes.
        expected = set()
        for query_context in populated_datastore.get_queries():
            query_id = query_context.get_query_id()
            for doc_id in query_context.get_doc_ids():
                if query_context.has_rating_score(doc_id):
                    expected.add((query_id, doc_id, query_context.get_rating_score(doc_id)))

        assert len(rows) == len(expected)

        for row in rows:
            assert set(row.keys()) == {"query_id", "doc_id", "rating"}
            assert isinstance(row["query_id"], str)
            assert isinstance(row["doc_id"], str)
            assert isinstance(row["rating"], int)

        # Exact content match, independent of row order.
        written = {(row["query_id"], row["doc_id"], row["rating"]) for row in rows}
        assert written == expected