From 959fa8df365603e6fb40e784099bc33e1d9fd51f Mon Sep 17 00:00:00 2001 From: Nazerke Seidan <n.seidan@sease.io> Date: Fri, 15 Aug 2025 15:24:04 +0500 Subject: [PATCH 1/5] DAGE-47: Add MtebWriter --- rre-dataset-generator/.gitignore | 1 + rre-dataset-generator/config.yaml | 9 ++ rre-dataset-generator/dataset_generator.py | 11 ++ rre-dataset-generator/src/config.py | 3 + .../src/search_engine/data_store.py | 4 +- rre-dataset-generator/src/utils.py | 17 ++- .../src/writers/mteb_writer.py | 79 +++++++++++++ .../resources/elasticsearch_good_config.yaml | 4 + .../tests/unit/resources/good_config.yaml | 4 + .../resources/good_config_opensearch.yaml | 4 + .../unit/resources/missing_optional.yaml | 4 + .../tests/unit/resources/rre_config.yaml | 4 + .../unit/resources/solr_good_config.yaml | 4 + .../tests/unit/test_config.py | 4 + .../tests/unit/writers/test_mteb_writer.py | 106 ++++++++++++++++++ 15 files changed, 252 insertions(+), 6 deletions(-) create mode 100644 rre-dataset-generator/src/writers/mteb_writer.py create mode 100644 rre-dataset-generator/tests/unit/writers/test_mteb_writer.py diff --git a/rre-dataset-generator/.gitignore b/rre-dataset-generator/.gitignore index cf37e4c3..f6099c05 100644 --- a/rre-dataset-generator/.gitignore +++ b/rre-dataset-generator/.gitignore @@ -13,6 +13,7 @@ uv.lock # Local Files and Directories .output_dataset output/ +data/ # macOS metadata files .DS_Store diff --git a/rre-dataset-generator/config.yaml b/rre-dataset-generator/config.yaml index f94cce54..5a9aad5d 100644 --- a/rre-dataset-generator/config.yaml +++ b/rre-dataset-generator/config.yaml @@ -77,3 +77,12 @@ save_llm_explanation: true # (Optional**) File path where it contains records. # (**) When save_llm_explanation is set to True, this param needs to be present llm_explanation_destination: "output/rating_explanation.json" + +# JSONL file path which contains <id,title,text> corpus records extracted from search engine. 
+mteb_corpus_destination: "data/corpus.jsonl" + +# JSONL file path which contains <id,text> query records LLM-generated and/or user-defined. +mteb_queries_destination: "data/queries.jsonl" + +# JSONL file path which contains <query_id,doc_id,rating> candidate records. +mteb_candidates_destination: "data/candidates.jsonl" \ No newline at end of file diff --git a/rre-dataset-generator/dataset_generator.py b/rre-dataset-generator/dataset_generator.py index 19ac0210..51155c08 100644 --- a/rre-dataset-generator/dataset_generator.py +++ b/rre-dataset-generator/dataset_generator.py @@ -18,6 +18,7 @@ # build factories from src.llm.llm_provider_factory import LLMServiceFactory +from src.writers.mteb_writer import MtebWriter from src.writers.writer_factory import WriterFactory from src.search_engine.search_engine_factory import SearchEngineFactory @@ -96,6 +97,7 @@ def add_cartesian_product_scores(config: Config, data_store: DataStore, llm_serv llm: BaseChatModel = LLMServiceFactory.build(LLMConfig.load(config.llm_configuration_file)) service: LLMService = LLMService(chat_model=llm) writer: AbstractWriter = WriterFactory.build(config, data_store) + mteb_writer: MtebWriter = MtebWriter.build(config, data_store) # pipeline starts add_user_queries(config, data_store) @@ -119,3 +121,12 @@ def add_cartesian_product_scores(config: Config, data_store: DataStore, llm_serv if config.save_llm_explanation: data_store.export_all_records_with_explanation(config.llm_explanation_destination) log.info(f"Dataset with LLM explanation is saved into: {config.llm_explanation_destination}") + + mteb_writer.write_corpus(config.mteb_corpus_destination) + log.info(f"MTEB corpus is saved into: {config.mteb_corpus_destination}") + + mteb_writer.write_queries(config.mteb_queries_destination) + log.info(f"MTEB queries are saved into: {config.mteb_queries_destination}") + + mteb_writer.write_candidates(config.mteb_candidates_destination) + log.info(f"MTEB candidates are saved into: {config.mteb_candidates_destination}") diff --git 
a/rre-dataset-generator/src/config.py b/rre-dataset-generator/src/config.py index 3d9838b7..6950b37d 100644 --- a/rre-dataset-generator/src/config.py +++ b/rre-dataset-generator/src/config.py @@ -33,6 +33,9 @@ class Config(BaseModel): id_field: str = Field(None, description="ID field for the unique key.") rre_query_template: FilePath = Field(None, description="Query template for rre evaluator.") rre_query_placeholder: str = Field(None, description="Key-value pair to substitute in the rre query template.") + mteb_corpus_destination: Path = Field(..., description="File path to save the MTEB corpus.") + mteb_queries_destination: Path = Field(..., description="File path to save the MTEB queries.") + mteb_candidates_destination: Path = Field(..., description="File path to save the MTEB candidates.") @field_validator('doc_fields') def check_no_empty_fields(cls, v): diff --git a/rre-dataset-generator/src/search_engine/data_store.py b/rre-dataset-generator/src/search_engine/data_store.py index a9d8f3d7..31d15a3b 100644 --- a/rre-dataset-generator/src/search_engine/data_store.py +++ b/rre-dataset-generator/src/search_engine/data_store.py @@ -68,7 +68,7 @@ def get_document(self, doc_id: str) -> Optional[Document]: def get_documents(self) -> List[Document]: """ - Returns a list of Document objects." + Returns a list of Document objects. """ return list(self._documents.values()) @@ -206,7 +206,7 @@ def load_tmp_file_content(self) -> None: def export_all_records_with_explanation(self, output_path: str | Path) -> None: """ - Exports query-doc-rating-explanation tuples to a JSON file. + Exports query-doc_id-rating-explanation tuples to a JSON file. 
""" records = [] for query_context in self._queries_by_id.values(): diff --git a/rre-dataset-generator/src/utils.py b/rre-dataset-generator/src/utils.py index da1b0186..3d9e10ce 100644 --- a/rre-dataset-generator/src/utils.py +++ b/rre-dataset-generator/src/utils.py @@ -1,11 +1,11 @@ import argparse -import re import html -from pathlib import Path - +import re +from typing import Any _TAG_REGEX = re.compile('<.*?>') + def parse_args(): parser = argparse.ArgumentParser(description='Parse arguments for CLI.') @@ -13,11 +13,20 @@ def parse_args(): help='Config file path to use for the application [default: \"config.yaml\"]', required=False, default="config.yaml") - parser.add_argument('-v', '--verbose',action='store_true', + parser.add_argument('-v', '--verbose', action='store_true', help='Activate debug mode for logging [default: False]') return parser.parse_args() + def clean_text(text: str) -> str: text_without_html = re.sub(_TAG_REGEX, '', text).strip() return html.unescape(re.sub(r"\s{2,}", " ", text_without_html)) + + +def _to_string(value: Any) -> str: + if value is None: + return "" + if isinstance(value, (list, tuple)): + return " ".join(str(val) for val in value if val is not None) + return str(value) diff --git a/rre-dataset-generator/src/writers/mteb_writer.py b/rre-dataset-generator/src/writers/mteb_writer.py new file mode 100644 index 00000000..d1f2e02a --- /dev/null +++ b/rre-dataset-generator/src/writers/mteb_writer.py @@ -0,0 +1,79 @@ +import json +import os +from pathlib import Path + +from src.config import Config +from src.search_engine.data_store import DataStore +from src.utils import _to_string +from src.writers.abstract_writer import AbstractWriter + + +class MtebWriter(AbstractWriter): + """ + MtebWriter: Write data namely corpus, queries, and candidates to JSONL file for MTEB + https://github.com/embeddings-benchmark/mteb + + Corpus format: id,title,text + Queries format: id,text + Candidates format: query_id,doc_id,rating + """ + + 
@classmethod + def build(cls, config: Config, data_store: DataStore): + return cls(datastore=data_store) + + def write_corpus(self, output_path: str | Path) -> None: + """ + Writes corpus records extracted from search engine to JSONL file: + {"id": <doc_id>, "title": <title>, "text": <description>} + """ + path = Path(output_path) + os.makedirs(path.parent, exist_ok=True) + with path.open("w", encoding="utf-8") as file: + for doc in self.datastore.get_documents(): + doc_id = str(doc.id) + fields = doc.fields + title = _to_string(fields.get("title")) + text = _to_string(fields.get("description")) + + row = {"id": doc_id, "title": title, "text": text} + file.write(json.dumps(row, ensure_ascii=False) + "\n") + + def write_queries(self, output_path: str | Path) -> None: + """ + Writes queries LLM-generated and/or user-defined records to JSONL file: + {"id": <query_id>, "text": <query_text>} + """ + path = Path(output_path) + os.makedirs(path.parent, exist_ok=True) + with path.open("w", encoding="utf-8") as file: + for query_context in self.datastore.get_queries(): + query_id = query_context.get_query_id() + query_text = query_context.get_query_text() + + row = {"id": query_id, "text": query_text} + file.write(json.dumps(row, ensure_ascii=False) + "\n") + + def write_candidates(self, output_path: str | Path) -> None: + """ + Writes candidates to JSONL file: + {"query_id": <query_id>, "doc_id": <doc_id>, "rating": <rating_score>} + """ + path = Path(output_path) + os.makedirs(path.parent, exist_ok=True) + with path.open("w", encoding="utf-8") as file: + for query_context in self.datastore.get_queries(): + query_id = query_context.get_query_id() + for doc_id in query_context.get_doc_ids(): + if query_context.has_rating_score(doc_id): + rating_score = query_context.get_rating_score(doc_id) + + row = {"query_id": query_id, "doc_id": doc_id, "rating": rating_score} + file.write(json.dumps(row, ensure_ascii=False) + "\n") + + def write(self, output_path: str | Path) -> None: + """ + Call 
these methods to write to JSONL files for MTEB: self.write_corpus(), self.write_queries() and + self.write_candidates() + """ + pass diff --git a/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml b/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml index 35f6b5ef..559da2fb 100644 --- a/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml @@ -20,3 +20,7 @@ relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" output_destination: "output/generated_dataset.json" +mteb_corpus_destination: "data/corpus.jsonl" +mteb_queries_destination: "data/queries.jsonl" +mteb_candidates_destination: "data/candidates.jsonl" + diff --git a/rre-dataset-generator/tests/unit/resources/good_config.yaml b/rre-dataset-generator/tests/unit/resources/good_config.yaml index 46422d0b..94705d98 100644 --- a/rre-dataset-generator/tests/unit/resources/good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/good_config.yaml @@ -21,3 +21,7 @@ output_format: "quepid" output_destination: "output/generated_dataset.json" save_llm_explanation: true llm_explanation_destination: "output/rating_explanation.json" +mteb_corpus_destination: "data/corpus.jsonl" +mteb_queries_destination: "data/queries.jsonl" +mteb_candidates_destination: "data/candidates.jsonl" + diff --git a/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml b/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml index 4bd1df36..fd87b2ad 100644 --- a/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml +++ b/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml @@ -23,3 +23,7 @@ relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" output_destination: "output/generated_dataset.json" 
+mteb_corpus_destination: "data/corpus.jsonl" +mteb_queries_destination: "data/queries.jsonl" +mteb_candidates_destination: "data/candidates.jsonl" + diff --git a/rre-dataset-generator/tests/unit/resources/missing_optional.yaml b/rre-dataset-generator/tests/unit/resources/missing_optional.yaml index eb8b6f59..9932eace 100644 --- a/rre-dataset-generator/tests/unit/resources/missing_optional.yaml +++ b/rre-dataset-generator/tests/unit/resources/missing_optional.yaml @@ -14,3 +14,7 @@ id_field: "id" corpora_file: "tests/integration/solr-init/data/dataset.json" rre_query_template: "tests/unit/resources/only_q.json" rre_query_placeholder: "$query" +mteb_corpus_destination: "data/corpus.jsonl" +mteb_queries_destination: "data/queries.jsonl" +mteb_candidates_destination: "data/candidates.jsonl" + diff --git a/rre-dataset-generator/tests/unit/resources/rre_config.yaml b/rre-dataset-generator/tests/unit/resources/rre_config.yaml index 98b4ecea..4b8f217b 100644 --- a/rre-dataset-generator/tests/unit/resources/rre_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/rre_config.yaml @@ -16,3 +16,7 @@ id_field: "id" rre_query_template: "tests/unit/resources/only_q.json" rre_query_placeholder: "$query" output_destination: "output/ratings.json" +mteb_corpus_destination: "data/corpus.jsonl" +mteb_queries_destination: "data/queries.jsonl" +mteb_candidates_destination: "data/candidates.jsonl" + diff --git a/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml b/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml index 650e4829..148c0bb3 100644 --- a/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml @@ -13,3 +13,7 @@ relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" output_destination: "output/generated_dataset.json" +mteb_corpus_destination: "data/corpus.jsonl" +mteb_queries_destination: 
"data/queries.jsonl" +mteb_candidates_destination: "data/candidates.jsonl" + diff --git a/rre-dataset-generator/tests/unit/test_config.py b/rre-dataset-generator/tests/unit/test_config.py index 9c3f96a9..2a9cf675 100644 --- a/rre-dataset-generator/tests/unit/test_config.py +++ b/rre-dataset-generator/tests/unit/test_config.py @@ -30,6 +30,10 @@ def test_good_config_expect_all_parameters_read(config): assert config.save_llm_explanation is True assert config.llm_explanation_destination == Path("output/rating_explanation.json") assert config.index_name == "testcore" + assert config.mteb_corpus_destination == Path("data/corpus.jsonl") + assert config.mteb_queries_destination == Path("data/queries.jsonl") + assert config.mteb_candidates_destination == Path("data/candidates.jsonl") + def test_missing_optional_field_values(): path = "tests/unit/resources/missing_optional.yaml" diff --git a/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py b/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py new file mode 100644 index 00000000..6285e156 --- /dev/null +++ b/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py @@ -0,0 +1,106 @@ +import json +from pathlib import Path + +import pytest + +from src.config import Config +from src.search_engine.data_store import DataStore +from src.writers.mteb_writer import MtebWriter + + +@pytest.fixture +def config(): + """Loads a valid config.""" + return Config.load("tests/unit/resources/good_config.yaml") + + +@pytest.fixture +def populated_datastore() -> DataStore: + """Returns a DataStore instance populated with test data.""" + datastore = DataStore() + + # Query 1: 2 rated docs + query_1_id = datastore.add_query("test query 1", "doc1") + datastore.add_query("test query 1", "doc2") + datastore.add_rating_score(query_1_id, "doc1", 1) + datastore.add_rating_score(query_1_id, "doc2", 1) + + # Query 2: 1 rated doc + query_2_id = datastore.add_query("test query 2", "doc4") + datastore.add_rating_score(query_2_id, 
"doc4", 2) + + # Query 3: No rated docs + datastore.add_query("test query 3", "doc5") + + return datastore + + +class TestMtebWriter: + + def test_write_corpus_expect_written_to_jsonl(self, config, populated_datastore, tmp_path: Path): + output_file = tmp_path/"corpus.jsonl" + writer = MtebWriter(populated_datastore) + + writer.write_corpus(str(output_file)) + + assert output_file.exists() + + lines = output_file.read_text(encoding="utf-8").splitlines() + rows = [json.loads(line) for line in lines if line.strip()] + + docs = populated_datastore.get_documents() + assert len(rows) == len(docs) + + for row in rows: + assert set(row.keys()) == {"id", "title", "text"} + assert isinstance(row["id"], str) + assert isinstance(row["title"], str) + assert isinstance(row["text"], str) + + def test_write_queries_expect_written_to_jsonl(self, config, populated_datastore, tmp_path: Path): + output_file = tmp_path / "queries.jsonl" + writer = MtebWriter(populated_datastore) + + writer.write_queries(output_file) + + assert output_file.exists() + + lines = output_file.read_text(encoding="utf-8").splitlines() + rows = [json.loads(line) for line in lines if line.strip()] + + queries = populated_datastore.get_queries() + assert len(rows) == len(queries) + + for row in rows: + assert set(row.keys()) == {"id", "text"} + assert isinstance(row["id"], str) + assert isinstance(row["text"], str) + + def test_write_candidates_expect_written_to_jsonl(self, config, populated_datastore, tmp_path: Path): + output_file = tmp_path / "candidates.jsonl" + writer = MtebWriter(populated_datastore) + + writer.write_candidates(output_file) + + assert output_file.exists() + + lines = output_file.read_text(encoding="utf-8").splitlines() + rows = [json.loads(line) for line in lines if line.strip()] + + expected = set() + for query_context in populated_datastore.get_queries(): + query_id = query_context.get_query_id() + for doc_id in query_context.get_doc_ids(): + if query_context.has_rating_score(doc_id): + 
expected.add((query_id, doc_id, query_context.get_rating_score(doc_id))) + + assert len(rows) == len(expected) + + for row in rows: + assert set(row.keys()) == {"query_id", "doc_id", "rating"} + assert isinstance(row["query_id"], str) + assert isinstance(row["doc_id"], str) + assert isinstance(row["rating"], int) + + written = {(row["query_id"], row["doc_id"], row["rating"]) for row in rows} + assert written == expected From ea2d38f107d39b954f9e7cecbd8af3d61d0e4366 Mon Sep 17 00:00:00 2001 From: Nazerke Seidan <n.seidan@sease.io> Date: Fri, 15 Aug 2025 15:31:21 +0500 Subject: [PATCH 2/5] Update --- rre-dataset-generator/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rre-dataset-generator/config.yaml b/rre-dataset-generator/config.yaml index 5a9aad5d..50108073 100644 --- a/rre-dataset-generator/config.yaml +++ b/rre-dataset-generator/config.yaml @@ -85,4 +85,4 @@ mteb_corpus_destination: "data/corpus.jsonl" mteb_queries_destination: "data/queries.jsonl" # JSONL file path which contains <query_id,doc_id,rating> candidate records. 
-mteb_candidates_destination: "data/candidates.jsonl" \ No newline at end of file +mteb_candidates_destination: "data/candidates.jsonl" From e68a9f60f0d7695220bd8127f1d7f3529ebfa99b Mon Sep 17 00:00:00 2001 From: Nazerke Seidan <n.seidan@sease.io> Date: Wed, 20 Aug 2025 13:26:27 +0500 Subject: [PATCH 3/5] Refactor --- rre-dataset-generator/config.yaml | 12 +++---- rre-dataset-generator/dataset_generator.py | 10 ++---- rre-dataset-generator/src/config.py | 4 +-- .../src/writers/mteb_writer.py | 28 +++++++++++---- .../resources/elasticsearch_good_config.yaml | 4 +-- .../tests/unit/resources/good_config.yaml | 5 ++- .../resources/good_config_opensearch.yaml | 5 ++- .../unit/resources/missing_optional.yaml | 5 ++- .../tests/unit/resources/rre_config.yaml | 5 ++- .../unit/resources/solr_good_config.yaml | 5 ++- .../tests/unit/test_config.py | 4 +-- .../tests/unit/writers/test_mteb_writer.py | 36 +++++++------------ 12 files changed, 55 insertions(+), 68 deletions(-) diff --git a/rre-dataset-generator/config.yaml b/rre-dataset-generator/config.yaml index 50108073..daf8e78c 100644 --- a/rre-dataset-generator/config.yaml +++ b/rre-dataset-generator/config.yaml @@ -78,11 +78,9 @@ save_llm_explanation: true # (**) When save_llm_explanation is set to True, this param needs to be present llm_explanation_destination: "output/rating_explanation.json" -# JSONL file path which contains <id,title,text> corpus records extracted from search engine. -mteb_corpus_destination: "data/corpus.jsonl" +# File path for MTEB, where the following three files will be written into: corpus.jsonl, queries.jsonl, and candidates.jsonl +# corpus.jsonl contains <id,title,text> corpus records extracted from search engine. +# queries.jsonl contains <id,text> query records LLM-generated and/or user-defined. +# candidates.jsonl contains <query_id,doc_id,rating> candidate records. +mteb_destination: "data" -# JSONL file path which contains <id,text> query records LLM-generated and/or user-defined. 
-mteb_queries_destination: "data/queries.jsonl" - -# JSONL file path which contains <query_id,doc_id,rating> candidate records. -mteb_candidates_destination: "data/candidates.jsonl" diff --git a/rre-dataset-generator/dataset_generator.py b/rre-dataset-generator/dataset_generator.py index 51155c08..200404f5 100644 --- a/rre-dataset-generator/dataset_generator.py +++ b/rre-dataset-generator/dataset_generator.py @@ -122,11 +122,5 @@ def add_cartesian_product_scores(config: Config, data_store: DataStore, llm_serv data_store.export_all_records_with_explanation(config.llm_explanation_destination) log.info(f"Dataset with LLM explanation is saved into: {config.llm_explanation_destination}") - mteb_writer.write_corpus(config.mteb_corpus_destination) - log.info(f"MTEB corpus is saved into: {config.mteb_corpus_destination}") - - mteb_writer.write_queries(config.mteb_queries_destination) - log.info(f"MTEB queries are saved into: {config.mteb_queries_destination}") - - mteb_writer.write_candidates(config.mteb_candidates_destination) - log.info(f"MTEB candidates are saved into: {config.mteb_candidates_destination}") + mteb_writer.write(config.mteb_destination) + log.info(f"MTEB candidates are saved into: {config.mteb_destination}") diff --git a/rre-dataset-generator/src/config.py b/rre-dataset-generator/src/config.py index 6950b37d..bcc23321 100644 --- a/rre-dataset-generator/src/config.py +++ b/rre-dataset-generator/src/config.py @@ -33,9 +33,7 @@ class Config(BaseModel): id_field: str = Field(None, description="ID field for the unique key.") rre_query_template: FilePath = Field(None, description="Query template for rre evaluator.") rre_query_placeholder: str = Field(None, description="Key-value pair to substitute in the rre query template.") - mteb_corpus_destination: Path = Field(..., description="File path to save the MTEB corpus.") - mteb_queries_destination: Path = Field(..., description="File path to save the MTEB queries.") - mteb_candidates_destination: Path = 
Field(..., description="File path to save the MTEB candidates.") + mteb_destination: Path = Field(..., description="File path for MTEB data") @field_validator('doc_fields') def check_no_empty_fields(cls, v): diff --git a/rre-dataset-generator/src/writers/mteb_writer.py b/rre-dataset-generator/src/writers/mteb_writer.py index d1f2e02a..b5a61323 100644 --- a/rre-dataset-generator/src/writers/mteb_writer.py +++ b/rre-dataset-generator/src/writers/mteb_writer.py @@ -1,4 +1,5 @@ import json +import logging import os from pathlib import Path @@ -7,6 +8,8 @@ from src.utils import _to_string from src.writers.abstract_writer import AbstractWriter +log = logging.getLogger(__name__) + class MtebWriter(AbstractWriter): """ @@ -22,7 +25,7 @@ class MtebWriter(AbstractWriter): def build(cls, config: Config, data_store: DataStore): return cls(datastore=data_store) - def write_corpus(self, output_path: str | Path) -> None: + def _write_corpus(self, output_path: str | Path) -> None: """ Writes corpus records extracted from search engine to JSONL file: {"id": <doc_id>, "title": <title>, "text": <description>} @@ -39,7 +42,7 @@ def write_corpus(self, output_path: str | Path) -> None: row = {"id": doc_id, "title": title, "text": text} file.write(json.dumps(row, ensure_ascii=False) + "\n") - def write_queries(self, output_path: str | Path) -> None: + def _write_queries(self, output_path: str | Path) -> None: """ Writes queries LLM-generated and/or user-defined records to JSONL file: {"id": <query_id>, "text": <query_text>} @@ -54,7 +57,7 @@ def write_queries(self, output_path: str | Path) -> None: row = {"id": query_id, "text": query_text} file.write(json.dumps(row, ensure_ascii=False) + "\n") - def write_candidates(self, output_path: str | Path) -> None: + def _write_candidates(self, output_path: str | Path) -> None: """ Writes candidates to JSONL file: {"query_id": <query_id>, "doc_id": <doc_id>, "rating": <rating_score>} @@ -73,7 +76,20 @@ def write_candidates(self, output_path: str 
| Path) -> None: def write(self, output_path: str | Path) -> None: """ - Call these methods to write to JSONL files for MTEB: self.write_corpus(), self.write_queries() and - self.write_candidates() + Write corpus, queries, and candidates JSONL files for MTEB. """ - pass + path = Path(output_path) + os.makedirs(path, exist_ok=True) + try: + self._write_corpus(path / "corpus.jsonl") + log.info("Corpus written successfully") + + self._write_queries(path / "queries.jsonl") + log.info("Queries written successfully") + + self._write_candidates(path / "candidates.jsonl") + log.info("Candidates written successfully") + + except Exception as e: + log.exception("Failed to write MTEB files: %s", e) + raise diff --git a/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml b/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml index 559da2fb..4e0b2320 100644 --- a/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml @@ -20,7 +20,5 @@ relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" output_destination: "output/generated_dataset.json" -mteb_corpus_destination: "data/corpus.jsonl" -mteb_queries_destination: "data/queries.jsonl" -mteb_candidates_destination: "data/candidates.jsonl" +mteb_destination: "data" diff --git a/rre-dataset-generator/tests/unit/resources/good_config.yaml b/rre-dataset-generator/tests/unit/resources/good_config.yaml index 94705d98..ef28d9c1 100644 --- a/rre-dataset-generator/tests/unit/resources/good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/good_config.yaml @@ -21,7 +21,6 @@ output_format: "quepid" output_destination: "output/generated_dataset.json" save_llm_explanation: true llm_explanation_destination: "output/rating_explanation.json" -mteb_corpus_destination: "data/corpus.jsonl" -mteb_queries_destination: "data/queries.jsonl" 
-mteb_candidates_destination: "data/candidates.jsonl" +mteb_destination: "data" + diff --git a/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml b/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml index fd87b2ad..034fc39b 100644 --- a/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml +++ b/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml @@ -23,7 +23,6 @@ relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" output_destination: "output/generated_dataset.json" -mteb_corpus_destination: "data/corpus.jsonl" -mteb_queries_destination: "data/queries.jsonl" -mteb_candidates_destination: "data/candidates.jsonl" +mteb_destination: "data" + diff --git a/rre-dataset-generator/tests/unit/resources/missing_optional.yaml b/rre-dataset-generator/tests/unit/resources/missing_optional.yaml index 9932eace..782f8173 100644 --- a/rre-dataset-generator/tests/unit/resources/missing_optional.yaml +++ b/rre-dataset-generator/tests/unit/resources/missing_optional.yaml @@ -14,7 +14,6 @@ id_field: "id" corpora_file: "tests/integration/solr-init/data/dataset.json" rre_query_template: "tests/unit/resources/only_q.json" rre_query_placeholder: "$query" -mteb_corpus_destination: "data/corpus.jsonl" -mteb_queries_destination: "data/queries.jsonl" -mteb_candidates_destination: "data/candidates.jsonl" +mteb_destination: "data" + diff --git a/rre-dataset-generator/tests/unit/resources/rre_config.yaml b/rre-dataset-generator/tests/unit/resources/rre_config.yaml index 4b8f217b..bd835108 100644 --- a/rre-dataset-generator/tests/unit/resources/rre_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/rre_config.yaml @@ -16,7 +16,6 @@ id_field: "id" rre_query_template: "tests/unit/resources/only_q.json" rre_query_placeholder: "$query" output_destination: "output/ratings.json" -mteb_corpus_destination: "data/corpus.jsonl" -mteb_queries_destination: 
"data/queries.jsonl" -mteb_candidates_destination: "data/candidates.jsonl" +mteb_destination: "data" + diff --git a/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml b/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml index 148c0bb3..fc82ec96 100644 --- a/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml @@ -13,7 +13,6 @@ relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" output_destination: "output/generated_dataset.json" -mteb_corpus_destination: "data/corpus.jsonl" -mteb_queries_destination: "data/queries.jsonl" -mteb_candidates_destination: "data/candidates.jsonl" +mteb_destination: "data" + diff --git a/rre-dataset-generator/tests/unit/test_config.py b/rre-dataset-generator/tests/unit/test_config.py index 2a9cf675..a93fee0b 100644 --- a/rre-dataset-generator/tests/unit/test_config.py +++ b/rre-dataset-generator/tests/unit/test_config.py @@ -30,9 +30,7 @@ def test_good_config_expect_all_parameters_read(config): assert config.save_llm_explanation is True assert config.llm_explanation_destination == Path("output/rating_explanation.json") assert config.index_name == "testcore" - assert config.mteb_corpus_destination == Path("data/corpus.jsonl") - assert config.mteb_queries_destination == Path("data/queries.jsonl") - assert config.mteb_candidates_destination == Path("data/candidates.jsonl") + assert config.mteb_destination == Path("data") def test_missing_optional_field_values(): diff --git a/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py b/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py index 6285e156..3ef2ba0d 100644 --- a/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py +++ b/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py @@ -37,15 +37,21 @@ def populated_datastore() -> DataStore: class TestMtebWriter: - def 
test_write_corpus_expect_written_to_jsonl(self, config, populated_datastore, tmp_path: Path): - output_file = tmp_path/"corpus.jsonl" + def test_write_expect_written_to_jsonl(self, config, populated_datastore, tmp_path: Path): + output_dir = tmp_path / "data" writer = MtebWriter(populated_datastore) - writer.write_corpus(str(output_file)) + writer.write(output_dir) - assert output_file.exists() + corpus_file = output_dir / "corpus.jsonl" + queries_file = output_dir / "queries.jsonl" + candidates_file = output_dir / "candidates.jsonl" - lines = output_file.read_text(encoding="utf-8").splitlines() + assert corpus_file.exists() + assert queries_file.exists() + assert candidates_file.exists() + + lines = corpus_file.read_text(encoding="utf-8").splitlines() rows = [json.loads(line) for line in lines if line.strip()] docs = populated_datastore.get_documents() @@ -57,15 +63,7 @@ def test_write_corpus_expect_written_to_jsonl(self, config, populated_datastore, assert isinstance(row["title"], str) assert isinstance(row["text"], str) - def test_write_queries_expect_written_to_jsonl(self, config, populated_datastore, tmp_path: Path): - output_file = tmp_path / "queries.jsonl" - writer = MtebWriter(populated_datastore) - - writer.write_queries(output_file) - - assert output_file.exists() - - lines = output_file.read_text(encoding="utf-8").splitlines() + lines = queries_file.read_text(encoding="utf-8").splitlines() rows = [json.loads(line) for line in lines if line.strip()] queries = populated_datastore.get_queries() @@ -76,15 +74,7 @@ def test_write_queries_expect_written_to_jsonl(self, config, populated_datastore assert isinstance(row["id"], str) assert isinstance(row["text"], str) - def test_write_candidates_expect_written_to_jsonl(self, config, populated_datastore, tmp_path: Path): - output_file = tmp_path / "candidates.jsonl" - writer = MtebWriter(populated_datastore) - - writer.write_candidates(output_file) - - assert output_file.exists() - - lines = 
output_file.read_text(encoding="utf-8").splitlines() + lines = candidates_file.read_text(encoding="utf-8").splitlines() rows = [json.loads(line) for line in lines if line.strip()] expected = set() From f8e20df7424e56a52e693faa9e7e2304fee7523a Mon Sep 17 00:00:00 2001 From: Nazerke Seidan <n.seidan@sease.io> Date: Fri, 22 Aug 2025 17:56:00 +0500 Subject: [PATCH 4/5] Respond to Daniele's feedback --- rre-dataset-generator/config.yaml | 17 +++++----- rre-dataset-generator/dataset_generator.py | 3 -- rre-dataset-generator/src/config.py | 4 +-- .../src/writers/mteb_writer.py | 18 ++++------ .../src/writers/quepid_writer.py | 4 +-- .../src/writers/rre_writer.py | 4 +-- .../src/writers/writer_factory.py | 2 ++ .../resources/elasticsearch_good_config.yaml | 4 +-- .../tests/unit/resources/good_config.yaml | 3 +- .../resources/good_config_opensearch.yaml | 3 +- .../unit/resources/missing_optional.yaml | 1 - .../tests/unit/resources/mteb_config.yaml | 15 +++++++++ .../tests/unit/resources/rre_config.yaml | 1 - .../unit/resources/solr_good_config.yaml | 1 - .../tests/unit/test_config.py | 11 +++++-- .../tests/unit/writers/test_mteb_writer.py | 4 +-- .../tests/unit/writers/test_quepid_writer.py | 33 +++++++++++++------ .../tests/unit/writers/test_rre_writer.py | 6 ++-- 18 files changed, 79 insertions(+), 55 deletions(-) create mode 100644 rre-dataset-generator/tests/unit/resources/mteb_config.yaml diff --git a/rre-dataset-generator/config.yaml b/rre-dataset-generator/config.yaml index daf8e78c..a881b4db 100644 --- a/rre-dataset-generator/config.yaml +++ b/rre-dataset-generator/config.yaml @@ -57,7 +57,7 @@ relevance_scale: "graded" llm_configuration_file: "llm_config.yaml" # Output format for the generated dataset -# Accepted values: quepid, rre +# Accepted values: quepid, rre, mteb output_format: "quepid" # For rre output format, you need other fields, e.g.: @@ -67,8 +67,13 @@ output_format: "quepid" # rre_query_placeholder: "$query" # Path where the output dataset will be 
saved -# For rre: json format, for quepid: csv format -output_destination: "output/generated_dataset.csv" +# For rre, output dataset will be saved into output_destination/"ratings.json" +# For quepid, output dataset will be saved into output_destination/"quepid.csv" +# For mteb, the following three files will be saved into output_destination: corpus.jsonl, queries.jsonl, and candidates.jsonl +# corpus.jsonl contains <id,title,text> corpus records extracted from search engine. +# queries.jsonl contains <id,text> query records LLM-generated and/or user-defined. +# candidates.jsonl contains <query_id,doc_id,rating> candidate records. +output_destination: "output" # (Optional) Whether to save LLM rating score explanation to file # Default: false; set to true to save LLM rating explanation @@ -78,9 +83,3 @@ save_llm_explanation: true # (**) When save_llm_explanation is set to True, this param needs to be present llm_explanation_destination: "output/rating_explanation.json" -# File path for MTEB, where the following three files will be written into: corpus.jsonl, queries.jsonl, and candidates.jsonl -# corpus.jsonl contains <id,title,text> corpus records extracted from search engine. -# queries.jsonl contains <id,text> query records LLM-generated and/or user-defined. -# candidates.jsonl contains <query_id,doc_id,rating> candidate records. 
-mteb_destination: "data" - diff --git a/rre-dataset-generator/dataset_generator.py b/rre-dataset-generator/dataset_generator.py index 200404f5..98f5a76d 100644 --- a/rre-dataset-generator/dataset_generator.py +++ b/rre-dataset-generator/dataset_generator.py @@ -97,7 +97,6 @@ def add_cartesian_product_scores(config: Config, data_store: DataStore, llm_serv llm: BaseChatModel = LLMServiceFactory.build(LLMConfig.load(config.llm_configuration_file)) service: LLMService = LLMService(chat_model=llm) writer: AbstractWriter = WriterFactory.build(config, data_store) - mteb_writer: MtebWriter = MtebWriter.build(config, data_store) # pipeline starts add_user_queries(config, data_store) @@ -122,5 +121,3 @@ def add_cartesian_product_scores(config: Config, data_store: DataStore, llm_serv data_store.export_all_records_with_explanation(config.llm_explanation_destination) log.info(f"Dataset with LLM explanation is saved into: {config.llm_explanation_destination}") - mteb_writer.write(config.mteb_destination) - log.info(f"MTEB candidates are saved into: {config.mteb_destination}") diff --git a/rre-dataset-generator/src/config.py b/rre-dataset-generator/src/config.py index bcc23321..4734c3d9 100644 --- a/rre-dataset-generator/src/config.py +++ b/rre-dataset-generator/src/config.py @@ -25,7 +25,7 @@ class Config(BaseModel): num_queries_needed: int = Field(..., gt=0, description="Total number of queries to generate.") relevance_scale: Literal['binary', 'graded'] llm_configuration_file: FilePath = Field(..., description="Path to the LLM configuration file.") - output_format: Literal['quepid', 'rre'] + output_format: Literal['quepid', 'rre', 'mteb'] output_destination: Path = Field(..., description="Path to save the output dataset.") save_llm_explanation: Optional[bool] = False llm_explanation_destination: Optional[Path] = Field(None, description="Path to save the LLM rating explanation") @@ -33,7 +33,7 @@ class Config(BaseModel): id_field: str = Field(None, description="ID field for the 
unique key.") rre_query_template: FilePath = Field(None, description="Query template for rre evaluator.") rre_query_placeholder: str = Field(None, description="Key-value pair to substitute in the rre query template.") - mteb_destination: Path = Field(..., description="File path for MTEB data") + @field_validator('doc_fields') def check_no_empty_fields(cls, v): diff --git a/rre-dataset-generator/src/writers/mteb_writer.py b/rre-dataset-generator/src/writers/mteb_writer.py index b5a61323..49513d0e 100644 --- a/rre-dataset-generator/src/writers/mteb_writer.py +++ b/rre-dataset-generator/src/writers/mteb_writer.py @@ -25,14 +25,12 @@ class MtebWriter(AbstractWriter): def build(cls, config: Config, data_store: DataStore): return cls(datastore=data_store) - def _write_corpus(self, output_path: str | Path) -> None: + def _write_corpus(self, corpus_path: Path) -> None: """ Writes corpus records extracted from search engine to JSONL file: {"id": <doc_id>, "title": <title>, "text": <description>} """ - path = Path(output_path) - os.makedirs(path.parent, exist_ok=True) - with path.open("w", encoding="utf-8") as file: + with corpus_path.open("w", encoding="utf-8") as file: for doc in self.datastore.get_documents(): doc_id = str(doc.id) fields = doc.fields @@ -42,14 +40,12 @@ def _write_corpus(self, output_path: str | Path) -> None: row = {"id": doc_id, "title": title, "text": text} file.write(json.dumps(row, ensure_ascii=False) + "\n") - def _write_queries(self, output_path: str | Path) -> None: + def _write_queries(self, queries_path: Path) -> None: """ Writes queries LLM-generated and/or user-defined records to JSONL file: {"id": <query_id>, "text": <query_text>} """ - path = Path(output_path) - os.makedirs(path.parent, exist_ok=True) - with path.open("w", encoding="utf-8") as file: + with queries_path.open("w", encoding="utf-8") as file: for query_context in self.datastore.get_queries(): query_id = query_context.get_query_id() query_text = query_context.get_query_text() @@ 
-57,14 +53,12 @@ def _write_queries(self, output_path: str | Path) -> None: row = {"id": query_id, "text": query_text} file.write(json.dumps(row, ensure_ascii=False) + "\n") - def _write_candidates(self, output_path: str | Path) -> None: + def _write_candidates(self, candidates_path: Path) -> None: """ Writes candidates to JSONL file: {"query_id": <query_id>, "doc_id": <doc_id>, "rating": <rating_score>} """ - path = Path(output_path) - os.makedirs(path.parent, exist_ok=True) - with path.open("w", encoding="utf-8") as file: + with candidates_path.open("w", encoding="utf-8") as file: for query_context in self.datastore.get_queries(): query_id = query_context.get_query_id() for doc_id in query_context.get_doc_ids(): diff --git a/rre-dataset-generator/src/writers/quepid_writer.py b/rre-dataset-generator/src/writers/quepid_writer.py index c5375dd8..63696b53 100644 --- a/rre-dataset-generator/src/writers/quepid_writer.py +++ b/rre-dataset-generator/src/writers/quepid_writer.py @@ -19,9 +19,9 @@ def build(cls, config: Config, data_store: DataStore): def write(self, output_path: str | Path) -> None: """ - Writes queries and their scored documents to a CSV file in Quepid format. + Writes queries and their scored documents to quepid.csv file. """ - output_path = Path(output_path) + output_path = Path(output_path) / "quepid.csv" os.makedirs(output_path.parent, exist_ok=True) with open(output_path, 'w', newline='') as csvfile: writer = csv.writer(csvfile) diff --git a/rre-dataset-generator/src/writers/rre_writer.py b/rre-dataset-generator/src/writers/rre_writer.py index 3e2a0d9f..adaa797d 100644 --- a/rre-dataset-generator/src/writers/rre_writer.py +++ b/rre-dataset-generator/src/writers/rre_writer.py @@ -74,9 +74,9 @@ def _build_json_doc_records(self) -> dict[str, Any]: def write(self, output_path: str | Path) -> None: """ - Writes queries and their ratings to json file in RRE format. + Writes queries and their ratings to ratings.json file in RRE format. 
""" - output_path = Path(output_path) + output_path = Path(output_path) / "ratings.json" os.makedirs(output_path.parent, exist_ok=True) with open(output_path, 'w', newline='') as json_file: log.debug("Started writing RRE formatted records to json file") diff --git a/rre-dataset-generator/src/writers/writer_factory.py b/rre-dataset-generator/src/writers/writer_factory.py index b8fc91ee..d381d79c 100644 --- a/rre-dataset-generator/src/writers/writer_factory.py +++ b/rre-dataset-generator/src/writers/writer_factory.py @@ -1,4 +1,5 @@ from .abstract_writer import AbstractWriter +from .mteb_writer import MtebWriter from .quepid_writer import QuepidWriter import logging from src.search_engine.data_store import DataStore @@ -12,6 +13,7 @@ class WriterFactory: OUTPUT_FORMAT_REGISTRY = { "quepid": QuepidWriter, "rre": RreWriter, + "mteb": MtebWriter, } @classmethod diff --git a/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml b/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml index 4e0b2320..6a3a3ab5 100644 --- a/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml @@ -19,6 +19,6 @@ num_queries_needed: 10 relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" -output_destination: "output/generated_dataset.json" -mteb_destination: "data" +output_destination: "output" + diff --git a/rre-dataset-generator/tests/unit/resources/good_config.yaml b/rre-dataset-generator/tests/unit/resources/good_config.yaml index ef28d9c1..08bbfd1c 100644 --- a/rre-dataset-generator/tests/unit/resources/good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/good_config.yaml @@ -18,9 +18,8 @@ num_queries_needed: 10 relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" -output_destination: "output/generated_dataset.json" 
+output_destination: "output" save_llm_explanation: true llm_explanation_destination: "output/rating_explanation.json" -mteb_destination: "data" diff --git a/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml b/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml index 034fc39b..3e9549cc 100644 --- a/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml +++ b/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml @@ -22,7 +22,6 @@ num_queries_needed: 10 relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" -output_destination: "output/generated_dataset.json" -mteb_destination: "data" +output_destination: "output" diff --git a/rre-dataset-generator/tests/unit/resources/missing_optional.yaml b/rre-dataset-generator/tests/unit/resources/missing_optional.yaml index 782f8173..46b7ff84 100644 --- a/rre-dataset-generator/tests/unit/resources/missing_optional.yaml +++ b/rre-dataset-generator/tests/unit/resources/missing_optional.yaml @@ -14,6 +14,5 @@ id_field: "id" corpora_file: "tests/integration/solr-init/data/dataset.json" rre_query_template: "tests/unit/resources/only_q.json" rre_query_placeholder: "$query" -mteb_destination: "data" diff --git a/rre-dataset-generator/tests/unit/resources/mteb_config.yaml b/rre-dataset-generator/tests/unit/resources/mteb_config.yaml new file mode 100644 index 00000000..a19616db --- /dev/null +++ b/rre-dataset-generator/tests/unit/resources/mteb_config.yaml @@ -0,0 +1,15 @@ +query_template: "q=#$query##&fq=genre:horror&wt=json" +search_engine_type: "solr" +index_name: "testcore" +search_engine_collection_endpoint: "http://localhost:8983/solr/testcore/" +doc_number: 100 +doc_fields: + - "title" + - "description" +queries: "tests/unit/resources/queries.txt" +generate_queries_from_documents: true +num_queries_needed: 10 +relevance_scale: "graded" +llm_configuration_file: "tests/unit/resources/llm_config.yaml" 
+output_format: "mteb" +output_destination: "output" \ No newline at end of file diff --git a/rre-dataset-generator/tests/unit/resources/rre_config.yaml b/rre-dataset-generator/tests/unit/resources/rre_config.yaml index bd835108..fb40373c 100644 --- a/rre-dataset-generator/tests/unit/resources/rre_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/rre_config.yaml @@ -16,6 +16,5 @@ id_field: "id" rre_query_template: "tests/unit/resources/only_q.json" rre_query_placeholder: "$query" output_destination: "output/ratings.json" -mteb_destination: "data" diff --git a/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml b/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml index fc82ec96..c707ac68 100644 --- a/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml @@ -13,6 +13,5 @@ relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" output_destination: "output/generated_dataset.json" -mteb_destination: "data" diff --git a/rre-dataset-generator/tests/unit/test_config.py b/rre-dataset-generator/tests/unit/test_config.py index a93fee0b..4172ccd8 100644 --- a/rre-dataset-generator/tests/unit/test_config.py +++ b/rre-dataset-generator/tests/unit/test_config.py @@ -26,11 +26,11 @@ def test_good_config_expect_all_parameters_read(config): assert config.num_queries_needed == 10 assert config.relevance_scale == "graded" assert config.llm_configuration_file == FilePath("tests/unit/resources/llm_config.yaml") - assert config.output_destination == Path("output/generated_dataset.json") + assert config.output_format == "quepid" + assert config.output_destination == Path("output") assert config.save_llm_explanation is True assert config.llm_explanation_destination == Path("output/rating_explanation.json") assert config.index_name == "testcore" - assert config.mteb_destination == Path("data") def 
test_missing_optional_field_values(): @@ -60,3 +60,10 @@ def test_file_not_found_raises_exception(): path = "tests/unit/resources/file_does_not_exist.yaml" with pytest.raises(FileNotFoundError): _ = Config.load(path) + + +def test_mteb_config_expect_successful_load(): + path = "tests/unit/resources/mteb_config.yaml" + mteb_config = Config.load(path) + assert mteb_config.output_format == "mteb" + assert mteb_config.output_destination == Path("output") diff --git a/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py b/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py index 3ef2ba0d..514774ad 100644 --- a/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py +++ b/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py @@ -11,7 +11,7 @@ @pytest.fixture def config(): """Loads a valid config.""" - return Config.load("tests/unit/resources/good_config.yaml") + return Config.load("tests/unit/resources/mteb_config.yaml") @pytest.fixture @@ -38,7 +38,7 @@ def populated_datastore() -> DataStore: class TestMtebWriter: def test_write_expect_written_to_jsonl(self, config, populated_datastore, tmp_path: Path): - output_dir = tmp_path / "data" + output_dir = tmp_path writer = MtebWriter(populated_datastore) writer.write(output_dir) diff --git a/rre-dataset-generator/tests/unit/writers/test_quepid_writer.py b/rre-dataset-generator/tests/unit/writers/test_quepid_writer.py index 8e5740b6..d978a38f 100644 --- a/rre-dataset-generator/tests/unit/writers/test_quepid_writer.py +++ b/rre-dataset-generator/tests/unit/writers/test_quepid_writer.py @@ -46,11 +46,12 @@ def unrated_datastore() -> DataStore: class TestQuepidWriter: def test_write_expect_file_successfully_written(self, populated_datastore, tmp_path: Path): """Tests that the QuepidWriter correctly writes a CSV file.""" - output_file = tmp_path / "output.csv" + output_dir = tmp_path writer = QuepidWriter(populated_datastore) - writer.write(str(output_file)) + writer.write(str(output_dir)) + 
output_file = Path(output_dir) / "quepid.csv" assert output_file.exists() with open(output_file, 'r', newline='') as csvfile: @@ -68,9 +69,12 @@ def test_write_expect_file_successfully_written(self, populated_datastore, tmp_p def test_write_with_empty_datastore_expect_output_file_written_with_only_header(self, empty_datastore, tmp_path: Path): """Tests writing from an empty datastore.""" - output_file = tmp_path / "output.csv" + output_dir = tmp_path writer = QuepidWriter(empty_datastore) - writer.write(str(output_file)) + writer.write(str(output_dir)) + + output_file = Path(output_dir) / "quepid.csv" + assert output_file.exists() with open(output_file, 'r', newline='') as csvfile: reader = csv.reader(csvfile) @@ -82,9 +86,12 @@ def test_write_with_empty_datastore_expect_output_file_written_with_only_header( def test_write_with_no_rated_documents_expect_empty_file(self, unrated_datastore, tmp_path: Path): """Tests writing when no documents have been rated.""" - output_file = tmp_path / "output.csv" + output_dir = tmp_path writer = QuepidWriter(unrated_datastore) - writer.write(str(output_file)) + writer.write(str(output_dir)) + + output_file = Path(output_dir) / "quepid.csv" + assert output_file.exists() with open(output_file, 'r', newline='') as csvfile: reader = csv.reader(csvfile) @@ -102,9 +109,12 @@ def test_write_with_special_characters_expect_file_successfully_written(self, tm query_id = datastore.add_query(query_text, doc_id) datastore.add_rating_score(query_id, doc_id, 1) - output_file = tmp_path / "output.csv" + output_dir = tmp_path writer = QuepidWriter(datastore) - writer.write(str(output_file)) + writer.write(str(output_dir)) + + output_file = Path(output_dir) / "quepid.csv" + assert output_file.exists() with open(output_file, 'r', newline='') as csvfile: reader = csv.reader(csvfile) @@ -123,9 +133,12 @@ def test_write_with_zero_rating_expect_zero_rating_written(self, tmp_path: Path) query_id = datastore.add_query(query_text, doc_id) 
datastore.add_rating_score(query_id, doc_id, 0) - output_file = tmp_path / "output.csv" + output_dir = tmp_path writer = QuepidWriter(datastore) - writer.write(str(output_file)) + writer.write(str(output_dir)) + + output_file = Path(output_dir) / "quepid.csv" + assert output_file.exists() with open(output_file, 'r', newline='') as csvfile: reader = csv.reader(csvfile) diff --git a/rre-dataset-generator/tests/unit/writers/test_rre_writer.py b/rre-dataset-generator/tests/unit/writers/test_rre_writer.py index cefd373f..a8fc29ac 100644 --- a/rre-dataset-generator/tests/unit/writers/test_rre_writer.py +++ b/rre-dataset-generator/tests/unit/writers/test_rre_writer.py @@ -37,14 +37,16 @@ def populated_datastore() -> DataStore: class TestRreWriter: def test_rre_file_successfully_written(self, rre_config, populated_datastore, tmp_path: Path): - output_file = tmp_path/"ratings.json" + output_dir = tmp_path writer = RreWriter(populated_datastore, index=rre_config.index_name, corpora_file=rre_config.corpora_file, id_field=rre_config.id_field, query_template=rre_config.rre_query_template, query_placeholder=rre_config.rre_query_placeholder) - writer.write(str(output_file)) + writer.write(str(output_dir)) + + output_file = Path(output_dir) / "ratings.json" assert output_file.exists() From ea65b90d7bca649e3a97da893c9f092aff50dfc9 Mon Sep 17 00:00:00 2001 From: Nazerke Seidan <n.seidan@sease.io> Date: Fri, 22 Aug 2025 18:02:01 +0500 Subject: [PATCH 5/5] Minor change --- rre-dataset-generator/.gitignore | 1 - .../tests/unit/resources/elasticsearch_good_config.yaml | 1 - rre-dataset-generator/tests/unit/resources/good_config.yaml | 2 -- .../tests/unit/resources/good_config_opensearch.yaml | 2 -- .../tests/unit/resources/missing_optional.yaml | 2 -- rre-dataset-generator/tests/unit/resources/mteb_config.yaml | 2 +- rre-dataset-generator/tests/unit/resources/rre_config.yaml | 3 +-- .../tests/unit/resources/solr_good_config.yaml | 2 +- 8 files changed, 3 insertions(+), 12 deletions(-) 
diff --git a/rre-dataset-generator/.gitignore b/rre-dataset-generator/.gitignore index f6099c05..cf37e4c3 100644 --- a/rre-dataset-generator/.gitignore +++ b/rre-dataset-generator/.gitignore @@ -13,7 +13,6 @@ uv.lock # Local Files and Directories .output_dataset output/ -data/ # macOS metadata files .DS_Store diff --git a/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml b/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml index 6a3a3ab5..358a9458 100644 --- a/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml @@ -21,4 +21,3 @@ llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" output_destination: "output" - diff --git a/rre-dataset-generator/tests/unit/resources/good_config.yaml b/rre-dataset-generator/tests/unit/resources/good_config.yaml index 08bbfd1c..d2d2a682 100644 --- a/rre-dataset-generator/tests/unit/resources/good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/good_config.yaml @@ -21,5 +21,3 @@ output_format: "quepid" output_destination: "output" save_llm_explanation: true llm_explanation_destination: "output/rating_explanation.json" - - diff --git a/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml b/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml index 3e9549cc..9a88980d 100644 --- a/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml +++ b/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml @@ -23,5 +23,3 @@ relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" output_destination: "output" - - diff --git a/rre-dataset-generator/tests/unit/resources/missing_optional.yaml b/rre-dataset-generator/tests/unit/resources/missing_optional.yaml index 46b7ff84..eb8b6f59 100644 --- 
a/rre-dataset-generator/tests/unit/resources/missing_optional.yaml +++ b/rre-dataset-generator/tests/unit/resources/missing_optional.yaml @@ -14,5 +14,3 @@ id_field: "id" corpora_file: "tests/integration/solr-init/data/dataset.json" rre_query_template: "tests/unit/resources/only_q.json" rre_query_placeholder: "$query" - - diff --git a/rre-dataset-generator/tests/unit/resources/mteb_config.yaml b/rre-dataset-generator/tests/unit/resources/mteb_config.yaml index a19616db..c54fa5c9 100644 --- a/rre-dataset-generator/tests/unit/resources/mteb_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/mteb_config.yaml @@ -12,4 +12,4 @@ num_queries_needed: 10 relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "mteb" -output_destination: "output" \ No newline at end of file +output_destination: "output" diff --git a/rre-dataset-generator/tests/unit/resources/rre_config.yaml b/rre-dataset-generator/tests/unit/resources/rre_config.yaml index fb40373c..fe8d1c2b 100644 --- a/rre-dataset-generator/tests/unit/resources/rre_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/rre_config.yaml @@ -15,6 +15,5 @@ corpora_file: "tests/integration/solr-init/data/dataset.json" id_field: "id" rre_query_template: "tests/unit/resources/only_q.json" rre_query_placeholder: "$query" -output_destination: "output/ratings.json" - +output_destination: "output" diff --git a/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml b/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml index c707ac68..398e5f2d 100644 --- a/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml @@ -12,6 +12,6 @@ num_queries_needed: 10 relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" -output_destination: "output/generated_dataset.json" +output_destination: "output"