diff --git a/rre-dataset-generator/config.yaml b/rre-dataset-generator/config.yaml index f94cce54..a881b4db 100644 --- a/rre-dataset-generator/config.yaml +++ b/rre-dataset-generator/config.yaml @@ -57,7 +57,7 @@ relevance_scale: "graded" llm_configuration_file: "llm_config.yaml" # Output format for the generated dataset -# Accepted values: quepid, rre +# Accepted values: quepid, rre, mteb output_format: "quepid" # For rre output format, you need other fields, e.g.: @@ -67,8 +67,13 @@ output_format: "quepid" # rre_query_placeholder: "$query" # Path where the output dataset will be saved -# For rre: json format, for quepid: csv format -output_destination: "output/generated_dataset.csv" +# For rre, output dataset will be saved into output_destination/"ratings.json" +# For quepid, output dataset will be saved into output_destination/"quepid.csv" +# For mteb, the following three files will be saved into output_destination: corpus.jsonl, queries.jsonl, and candidates.jsonl +# corpus.jsonl contains corpus records extracted from the search engine. +# queries.jsonl contains LLM-generated and/or user-defined query records. +# candidates.jsonl contains candidate records. +output_destination: "output" # (Optional) Whether to save LLM rating score explanation to file # Default: false; set to true to save LLM rating explanation @@ -77,3 +82,4 @@ save_llm_explanation: true # (Optional**) File path where it contains records.
# (**) When save_llm_explanation is set to True, this param needs to be present llm_explanation_destination: "output/rating_explanation.json" + diff --git a/rre-dataset-generator/dataset_generator.py b/rre-dataset-generator/dataset_generator.py index 19ac0210..98f5a76d 100644 --- a/rre-dataset-generator/dataset_generator.py +++ b/rre-dataset-generator/dataset_generator.py @@ -18,6 +18,7 @@ # build factories from src.llm.llm_provider_factory import LLMServiceFactory +from src.writers.mteb_writer import MtebWriter from src.writers.writer_factory import WriterFactory from src.search_engine.search_engine_factory import SearchEngineFactory @@ -119,3 +120,4 @@ def add_cartesian_product_scores(config: Config, data_store: DataStore, llm_serv if config.save_llm_explanation: data_store.export_all_records_with_explanation(config.llm_explanation_destination) log.info(f"Dataset with LLM explanation is saved into: {config.llm_explanation_destination}") + diff --git a/rre-dataset-generator/src/config.py b/rre-dataset-generator/src/config.py index 3d9838b7..4734c3d9 100644 --- a/rre-dataset-generator/src/config.py +++ b/rre-dataset-generator/src/config.py @@ -25,7 +25,7 @@ class Config(BaseModel): num_queries_needed: int = Field(..., gt=0, description="Total number of queries to generate.") relevance_scale: Literal['binary', 'graded'] llm_configuration_file: FilePath = Field(..., description="Path to the LLM configuration file.") - output_format: Literal['quepid', 'rre'] + output_format: Literal['quepid', 'rre', 'mteb'] output_destination: Path = Field(..., description="Path to save the output dataset.") save_llm_explanation: Optional[bool] = False llm_explanation_destination: Optional[Path] = Field(None, description="Path to save the LLM rating explanation") @@ -34,6 +34,7 @@ class Config(BaseModel): rre_query_template: FilePath = Field(None, description="Query template for rre evaluator.") rre_query_placeholder: str = Field(None, description="Key-value pair to substitute in the 
rre query template.") + @field_validator('doc_fields') def check_no_empty_fields(cls, v): if any(not f.strip() for f in v): diff --git a/rre-dataset-generator/src/search_engine/data_store.py b/rre-dataset-generator/src/search_engine/data_store.py index a9d8f3d7..31d15a3b 100644 --- a/rre-dataset-generator/src/search_engine/data_store.py +++ b/rre-dataset-generator/src/search_engine/data_store.py @@ -68,7 +68,7 @@ def get_document(self, doc_id: str) -> Optional[Document]: def get_documents(self) -> List[Document]: """ - Returns a list of Document objects." + Returns a list of Document objects. """ return list(self._documents.values()) @@ -206,7 +206,7 @@ def load_tmp_file_content(self) -> None: def export_all_records_with_explanation(self, output_path: str | Path) -> None: """ - Exports query-doc-rating-explanation tuples to a JSON file. + Exports query-doc_id-rating-explanation tuples to a JSON file. """ records = [] for query_context in self._queries_by_id.values(): diff --git a/rre-dataset-generator/src/utils.py b/rre-dataset-generator/src/utils.py index da1b0186..3d9e10ce 100644 --- a/rre-dataset-generator/src/utils.py +++ b/rre-dataset-generator/src/utils.py @@ -1,11 +1,11 @@ import argparse -import re import html -from pathlib import Path - +import re +from typing import Any _TAG_REGEX = re.compile('<.*?>') + def parse_args(): parser = argparse.ArgumentParser(description='Parse arguments for CLI.') @@ -13,11 +13,20 @@ def parse_args(): help='Config file path to use for the application [default: \"config.yaml\"]', required=False, default="config.yaml") - parser.add_argument('-v', '--verbose',action='store_true', + parser.add_argument('-v', '--verbose', action='store_true', help='Activate debug mode for logging [default: False]') return parser.parse_args() + def clean_text(text: str) -> str: text_without_html = re.sub(_TAG_REGEX, '', text).strip() return html.unescape(re.sub(r"\s{2,}", " ", text_without_html)) + + +def _to_string(value: Any) -> str: + if value 
is None: + return "" + if isinstance(value, (list, tuple)): + return " ".join(str(val) for val in value if val is not None) + return str(value) diff --git a/rre-dataset-generator/src/writers/mteb_writer.py b/rre-dataset-generator/src/writers/mteb_writer.py new file mode 100644 index 00000000..49513d0e --- /dev/null +++ b/rre-dataset-generator/src/writers/mteb_writer.py @@ -0,0 +1,89 @@ +import json +import logging +import os +from pathlib import Path + +from src.config import Config +from src.search_engine.data_store import DataStore +from src.utils import _to_string +from src.writers.abstract_writer import AbstractWriter + +log = logging.getLogger(__name__) + + +class MtebWriter(AbstractWriter): + """ + MtebWriter: Write data namely corpus, queries, and candidates to JSONL file for MTEB + https://github.com/embeddings-benchmark/mteb + + Corpus format: id,title,text + Queries format: id,text + Candidates format: query_id,doc_id,rating + """ + + @classmethod + def build(cls, config: Config, data_store: DataStore): + return cls(datastore=data_store) + + def _write_corpus(self, corpus_path: Path) -> None: + """ + Writes corpus records extracted from search engine to JSONL file: + {"id": , "title": , "text": <description>} + """ + with corpus_path.open("w", encoding="utf-8") as file: + for doc in self.datastore.get_documents(): + doc_id = str(doc.id) + fields = doc.fields + title = _to_string(fields.get("title")) + text = _to_string(fields.get("description")) + + row = {"id": doc_id, "title": title, "text": text} + file.write(json.dumps(row, ensure_ascii=False) + "\n") + + def _write_queries(self, queries_path: Path) -> None: + """ + Writes queries LLM-generated and/or user-defined records to JSONL file: + {"id": <query_id>, "text": <query_text>} + """ + with queries_path.open("w", encoding="utf-8") as file: + for query_context in self.datastore.get_queries(): + query_id = query_context.get_query_id() + query_text = query_context.get_query_text() + + row = {"id": 
query_id, "text": query_text} + file.write(json.dumps(row, ensure_ascii=False) + "\n") + + def _write_candidates(self, candidates_path: Path) -> None: + """ + Writes candidates to JSONL file: + {"query_id": <query_id>, "doc_id": <doc_id>, "rating": <rating_score>} + """ + with candidates_path.open("w", encoding="utf-8") as file: + for query_context in self.datastore.get_queries(): + query_id = query_context.get_query_id() + for doc_id in query_context.get_doc_ids(): + if query_context.has_rating_score(doc_id): + rating_score = query_context.get_rating_score(doc_id) + + row = {"query_id": query_id, "doc_id": doc_id, "rating": rating_score} + file.write(json.dumps(row, ensure_ascii=False) + "\n") + + def write(self, output_path: str | Path) -> None: + """ + Write corpus, queries, and candidates JSONL files for MTEB. + """ + path = Path(output_path) + os.makedirs(path, exist_ok=True) + try: + self._write_corpus(path / "corpus.jsonl") + log.info("Corpus written successfully") + + self._write_queries(path / "queries.jsonl") + log.info("Queries written successfully") + + self._write_candidates(path / "candidates.jsonl") + log.info("Candidates written successfully") + + except Exception as e: + log.exception("Failed to write MTEB files: %s", e) + raise diff --git a/rre-dataset-generator/src/writers/quepid_writer.py b/rre-dataset-generator/src/writers/quepid_writer.py index c5375dd8..63696b53 100644 --- a/rre-dataset-generator/src/writers/quepid_writer.py +++ b/rre-dataset-generator/src/writers/quepid_writer.py @@ -19,9 +19,9 @@ def build(cls, config: Config, data_store: DataStore): def write(self, output_path: str | Path) -> None: """ - Writes queries and their scored documents to a CSV file in Quepid format. + Writes queries and their scored documents to quepid.csv file. 
""" - output_path = Path(output_path) + output_path = Path(output_path) / "quepid.csv" os.makedirs(output_path.parent, exist_ok=True) with open(output_path, 'w', newline='') as csvfile: writer = csv.writer(csvfile) diff --git a/rre-dataset-generator/src/writers/rre_writer.py b/rre-dataset-generator/src/writers/rre_writer.py index 3e2a0d9f..adaa797d 100644 --- a/rre-dataset-generator/src/writers/rre_writer.py +++ b/rre-dataset-generator/src/writers/rre_writer.py @@ -74,9 +74,9 @@ def _build_json_doc_records(self) -> dict[str, Any]: def write(self, output_path: str | Path) -> None: """ - Writes queries and their ratings to json file in RRE format. + Writes queries and their ratings to ratings.json file in RRE format. """ - output_path = Path(output_path) + output_path = Path(output_path) / "ratings.json" os.makedirs(output_path.parent, exist_ok=True) with open(output_path, 'w', newline='') as json_file: log.debug("Started writing RRE formatted records to json file") diff --git a/rre-dataset-generator/src/writers/writer_factory.py b/rre-dataset-generator/src/writers/writer_factory.py index b8fc91ee..d381d79c 100644 --- a/rre-dataset-generator/src/writers/writer_factory.py +++ b/rre-dataset-generator/src/writers/writer_factory.py @@ -1,4 +1,5 @@ from .abstract_writer import AbstractWriter +from .mteb_writer import MtebWriter from .quepid_writer import QuepidWriter import logging from src.search_engine.data_store import DataStore @@ -12,6 +13,7 @@ class WriterFactory: OUTPUT_FORMAT_REGISTRY = { "quepid": QuepidWriter, "rre": RreWriter, + "mteb": MtebWriter, } @classmethod diff --git a/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml b/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml index 35f6b5ef..358a9458 100644 --- a/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/elasticsearch_good_config.yaml @@ -19,4 +19,5 @@ num_queries_needed: 10 
relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" -output_destination: "output/generated_dataset.json" +output_destination: "output" + diff --git a/rre-dataset-generator/tests/unit/resources/good_config.yaml b/rre-dataset-generator/tests/unit/resources/good_config.yaml index 46422d0b..d2d2a682 100644 --- a/rre-dataset-generator/tests/unit/resources/good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/good_config.yaml @@ -18,6 +18,6 @@ num_queries_needed: 10 relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" -output_destination: "output/generated_dataset.json" +output_destination: "output" save_llm_explanation: true llm_explanation_destination: "output/rating_explanation.json" diff --git a/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml b/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml index 4bd1df36..9a88980d 100644 --- a/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml +++ b/rre-dataset-generator/tests/unit/resources/good_config_opensearch.yaml @@ -22,4 +22,4 @@ num_queries_needed: 10 relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" -output_destination: "output/generated_dataset.json" +output_destination: "output" diff --git a/rre-dataset-generator/tests/unit/resources/mteb_config.yaml b/rre-dataset-generator/tests/unit/resources/mteb_config.yaml new file mode 100644 index 00000000..c54fa5c9 --- /dev/null +++ b/rre-dataset-generator/tests/unit/resources/mteb_config.yaml @@ -0,0 +1,15 @@ +query_template: "q=#$query##&fq=genre:horror&wt=json" +search_engine_type: "solr" +index_name: "testcore" +search_engine_collection_endpoint: "http://localhost:8983/solr/testcore/" +doc_number: 100 +doc_fields: + - "title" + - "description" +queries: "tests/unit/resources/queries.txt" 
+generate_queries_from_documents: true +num_queries_needed: 10 +relevance_scale: "graded" +llm_configuration_file: "tests/unit/resources/llm_config.yaml" +output_format: "mteb" +output_destination: "output" diff --git a/rre-dataset-generator/tests/unit/resources/rre_config.yaml b/rre-dataset-generator/tests/unit/resources/rre_config.yaml index 98b4ecea..fe8d1c2b 100644 --- a/rre-dataset-generator/tests/unit/resources/rre_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/rre_config.yaml @@ -15,4 +15,5 @@ corpora_file: "tests/integration/solr-init/data/dataset.json" id_field: "id" rre_query_template: "tests/unit/resources/only_q.json" rre_query_placeholder: "$query" -output_destination: "output/ratings.json" +output_destination: "output" + diff --git a/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml b/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml index 650e4829..398e5f2d 100644 --- a/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml +++ b/rre-dataset-generator/tests/unit/resources/solr_good_config.yaml @@ -12,4 +12,6 @@ num_queries_needed: 10 relevance_scale: "graded" llm_configuration_file: "tests/unit/resources/llm_config.yaml" output_format: "quepid" -output_destination: "output/generated_dataset.json" +output_destination: "output" + + diff --git a/rre-dataset-generator/tests/unit/test_config.py b/rre-dataset-generator/tests/unit/test_config.py index 9c3f96a9..4172ccd8 100644 --- a/rre-dataset-generator/tests/unit/test_config.py +++ b/rre-dataset-generator/tests/unit/test_config.py @@ -26,11 +26,13 @@ def test_good_config_expect_all_parameters_read(config): assert config.num_queries_needed == 10 assert config.relevance_scale == "graded" assert config.llm_configuration_file == FilePath("tests/unit/resources/llm_config.yaml") - assert config.output_destination == Path("output/generated_dataset.json") + assert config.output_format == "quepid" + assert config.output_destination == Path("output") assert 
config.save_llm_explanation is True assert config.llm_explanation_destination == Path("output/rating_explanation.json") assert config.index_name == "testcore" + def test_missing_optional_field_values(): path = "tests/unit/resources/missing_optional.yaml" cfg = Config.load(path) @@ -58,3 +60,10 @@ def test_file_not_found_raises_exception(): path = "tests/unit/resources/file_does_not_exist.yaml" with pytest.raises(FileNotFoundError): _ = Config.load(path) + + +def test_mteb_config_expect_successful_load(): + path = "tests/unit/resources/mteb_config.yaml" + mteb_config = Config.load(path) + assert mteb_config.output_format == "mteb" + assert mteb_config.output_destination == Path("output") diff --git a/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py b/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py new file mode 100644 index 00000000..514774ad --- /dev/null +++ b/rre-dataset-generator/tests/unit/writers/test_mteb_writer.py @@ -0,0 +1,96 @@ +import json +from pathlib import Path + +import pytest + +from src.config import Config +from src.search_engine.data_store import DataStore +from src.writers.mteb_writer import MtebWriter + + +@pytest.fixture +def config(): + """Loads a valid config.""" + return Config.load("tests/unit/resources/mteb_config.yaml") + + +@pytest.fixture +def populated_datastore() -> DataStore: + """Returns a DataStore instance populated with test data.""" + datastore = DataStore() + + # Query 1: 2 rated docs + query_1_id = datastore.add_query("test query 1", "doc1") + datastore.add_query("test query 1", "doc2") + datastore.add_rating_score(query_1_id, "doc1", 1) + datastore.add_rating_score(query_1_id, "doc2", 1) + + # Query 2: 1 rated doc + query_2_id = datastore.add_query("test query 2", "doc4") + datastore.add_rating_score(query_2_id, "doc4", 2) + + # Query 3: No rated docs + datastore.add_query("test query 3", "doc5") + + return datastore + + +class TestMtebWriter: + + def test_write_expect_written_to_jsonl(self, config, 
populated_datastore, tmp_path: Path): + output_dir = tmp_path + writer = MtebWriter(populated_datastore) + + writer.write(output_dir) + + corpus_file = output_dir / "corpus.jsonl" + queries_file = output_dir / "queries.jsonl" + candidates_file = output_dir / "candidates.jsonl" + + assert corpus_file.exists() + assert queries_file.exists() + assert candidates_file.exists() + + lines = corpus_file.read_text(encoding="utf-8").splitlines() + rows = [json.loads(line) for line in lines if line.strip()] + + docs = populated_datastore.get_documents() + assert len(rows) == len(docs) + + for row in rows: + assert set(row.keys()) == {"id", "title", "text"} + assert isinstance(row["id"], str) + assert isinstance(row["title"], str) + assert isinstance(row["text"], str) + + lines = queries_file.read_text(encoding="utf-8").splitlines() + rows = [json.loads(line) for line in lines if line.strip()] + + queries = populated_datastore.get_queries() + assert len(rows) == len(queries) + + for row in rows: + assert set(row.keys()) == {"id", "text"} + assert isinstance(row["id"], str) + assert isinstance(row["text"], str) + + lines = candidates_file.read_text(encoding="utf-8").splitlines() + rows = [json.loads(line) for line in lines if line.strip()] + + expected = set() + for query_context in populated_datastore.get_queries(): + query_id = query_context.get_query_id() + for doc_id in query_context.get_doc_ids(): + if query_context.has_rating_score(doc_id): + expected.add((query_id, doc_id, query_context.get_rating_score(doc_id))) + + assert len(rows) == len(expected) + + for row in rows: + assert set(row.keys()) == {"query_id", "doc_id", "rating"} + assert isinstance(row["query_id"], str) + assert isinstance(row["doc_id"], str) + assert isinstance(row["rating"], int) + + written = {(row["query_id"], row["doc_id"], row["rating"]) for row in rows} + assert written == expected diff --git a/rre-dataset-generator/tests/unit/writers/test_quepid_writer.py 
b/rre-dataset-generator/tests/unit/writers/test_quepid_writer.py index 8e5740b6..d978a38f 100644 --- a/rre-dataset-generator/tests/unit/writers/test_quepid_writer.py +++ b/rre-dataset-generator/tests/unit/writers/test_quepid_writer.py @@ -46,11 +46,12 @@ def unrated_datastore() -> DataStore: class TestQuepidWriter: def test_write_expect_file_successfully_written(self, populated_datastore, tmp_path: Path): """Tests that the QuepidWriter correctly writes a CSV file.""" - output_file = tmp_path / "output.csv" + output_dir = tmp_path writer = QuepidWriter(populated_datastore) - writer.write(str(output_file)) + writer.write(str(output_dir)) + output_file = Path(output_dir) / "quepid.csv" assert output_file.exists() with open(output_file, 'r', newline='') as csvfile: @@ -68,9 +69,12 @@ def test_write_expect_file_successfully_written(self, populated_datastore, tmp_p def test_write_with_empty_datastore_expect_output_file_written_with_only_header(self, empty_datastore, tmp_path: Path): """Tests writing from an empty datastore.""" - output_file = tmp_path / "output.csv" + output_dir = tmp_path writer = QuepidWriter(empty_datastore) - writer.write(str(output_file)) + writer.write(str(output_dir)) + + output_file = Path(output_dir) / "quepid.csv" + assert output_file.exists() with open(output_file, 'r', newline='') as csvfile: reader = csv.reader(csvfile) @@ -82,9 +86,12 @@ def test_write_with_empty_datastore_expect_output_file_written_with_only_header( def test_write_with_no_rated_documents_expect_empty_file(self, unrated_datastore, tmp_path: Path): """Tests writing when no documents have been rated.""" - output_file = tmp_path / "output.csv" + output_dir = tmp_path writer = QuepidWriter(unrated_datastore) - writer.write(str(output_file)) + writer.write(str(output_dir)) + + output_file = Path(output_dir) / "quepid.csv" + assert output_file.exists() with open(output_file, 'r', newline='') as csvfile: reader = csv.reader(csvfile) @@ -102,9 +109,12 @@ def 
test_write_with_special_characters_expect_file_successfully_written(self, tm query_id = datastore.add_query(query_text, doc_id) datastore.add_rating_score(query_id, doc_id, 1) - output_file = tmp_path / "output.csv" + output_dir = tmp_path writer = QuepidWriter(datastore) - writer.write(str(output_file)) + writer.write(str(output_dir)) + + output_file = Path(output_dir) / "quepid.csv" + assert output_file.exists() with open(output_file, 'r', newline='') as csvfile: reader = csv.reader(csvfile) @@ -123,9 +133,12 @@ def test_write_with_zero_rating_expect_zero_rating_written(self, tmp_path: Path) query_id = datastore.add_query(query_text, doc_id) datastore.add_rating_score(query_id, doc_id, 0) - output_file = tmp_path / "output.csv" + output_dir = tmp_path writer = QuepidWriter(datastore) - writer.write(str(output_file)) + writer.write(str(output_dir)) + + output_file = Path(output_dir) / "quepid.csv" + assert output_file.exists() with open(output_file, 'r', newline='') as csvfile: reader = csv.reader(csvfile) diff --git a/rre-dataset-generator/tests/unit/writers/test_rre_writer.py b/rre-dataset-generator/tests/unit/writers/test_rre_writer.py index cefd373f..a8fc29ac 100644 --- a/rre-dataset-generator/tests/unit/writers/test_rre_writer.py +++ b/rre-dataset-generator/tests/unit/writers/test_rre_writer.py @@ -37,14 +37,16 @@ def populated_datastore() -> DataStore: class TestRreWriter: def test_rre_file_successfully_written(self, rre_config, populated_datastore, tmp_path: Path): - output_file = tmp_path/"ratings.json" + output_dir = tmp_path writer = RreWriter(populated_datastore, index=rre_config.index_name, corpora_file=rre_config.corpora_file, id_field=rre_config.id_field, query_template=rre_config.rre_query_template, query_placeholder=rre_config.rre_query_placeholder) - writer.write(str(output_file)) + writer.write(str(output_dir)) + + output_file = Path(output_dir) / "ratings.json" assert output_file.exists()