From 3d8f2dba7cf686da717b44d837ff8b107112c9cd Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 15:21:29 -0800 Subject: [PATCH 01/10] Add two new UTs: loading from JSON and CSV. --- judgeval/data/datasets/dataset.py | 3 ++ tests/data/datasets/test_dataset.py | 65 +++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/judgeval/data/datasets/dataset.py b/judgeval/data/datasets/dataset.py index 6e779451..eaf4d7b7 100644 --- a/judgeval/data/datasets/dataset.py +++ b/judgeval/data/datasets/dataset.py @@ -206,6 +206,9 @@ def add_from_csv( We want to collect the examples and ground truths separately which can be determined by the "example" column. If the value is True, then it is an example, otherwise it is a ground truth. + + We also assume that if there are multiple retrieval contexts or contexts, they are separated by semicolons. + This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters. """ examples, ground_truths = [], [] diff --git a/tests/data/datasets/test_dataset.py b/tests/data/datasets/test_dataset.py index 32a307fa..177c6db4 100644 --- a/tests/data/datasets/test_dataset.py +++ b/tests/data/datasets/test_dataset.py @@ -175,3 +175,68 @@ def test_str_representation(dataset, sample_example, sample_ground_truth): assert "EvalDataset" in str_rep assert "ground_truths" in str_rep assert "examples" in str_rep + +# new UTs for dataset UX testing + +def test_load_from_json(): + ex1 = Example( + input="test input", + actual_output="test output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1", "tool2"], + name="test example" + ) + + gt1 = GroundTruthExample( + input="test input", + expected_output="expected output", + context=["context1"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1"], + comments="test comment", + source_file="test.py" + ) + + dataset = EvalDataset() + + dataset.add_from_json("tests/data/datasets/sample_data/dataset.json") + assert dataset.ground_truths == [gt1] + assert dataset.examples == [ex1] + + +def test_load_from_csv(): + ex1 = Example( + input="test input", + actual_output="test output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1", "tool2"], + name="test example" + ) + + gt1 = GroundTruthExample( + input="test input", + expected_output="expected output", + context=["context1"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1"], + comments="test comment", + source_file="test.py" + ) + + dataset = EvalDataset() + + dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv") + assert dataset.ground_truths == [gt1] + assert dataset.examples == [ex1] From 0e422847c6040ba3c63b919f87c434131f6ec210 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 15:22:14 -0800 Subject: [PATCH 02/10] Add sample test data for UTs loading from JSON/CSV --> dataset. 
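
For context, a minimal sketch of how the new fixture files are exercised by the tests added in PATCH 01 (the EvalDataset import path is an assumption based on the package layout; the file paths and attribute names mirror the test code):

    from judgeval.data.datasets import EvalDataset  # import path assumed

    dataset = EvalDataset()
    dataset.add_from_json("tests/data/datasets/sample_data/dataset.json")
    dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv")
    # dataset.examples and dataset.ground_truths now hold the parsed fixture entries.
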
--- judgeval/data/datasets/20241111_175859.csv | 4 -- judgeval/data/datasets/20241111_175859.json | 44 ---------------- judgeval/data/datasets/sample.csv | 4 -- tests/data/datasets/sample_data/dataset.csv | 3 ++ tests/data/datasets/sample_data/dataset.json | 53 ++++++++++++++++++++ 5 files changed, 56 insertions(+), 52 deletions(-) delete mode 100644 judgeval/data/datasets/20241111_175859.csv delete mode 100644 judgeval/data/datasets/20241111_175859.json delete mode 100644 judgeval/data/datasets/sample.csv create mode 100644 tests/data/datasets/sample_data/dataset.csv create mode 100644 tests/data/datasets/sample_data/dataset.json diff --git a/judgeval/data/datasets/20241111_175859.csv b/judgeval/data/datasets/20241111_175859.csv deleted file mode 100644 index 5ebd7667..00000000 --- a/judgeval/data/datasets/20241111_175859.csv +++ /dev/null @@ -1,4 +0,0 @@ -input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example -"Hello, do you sell beans?",Yes! We sell beans 'n cheese.,"Crikey! Of course, with a jacket potato too.",,Spudbros is a British business that sells baked potatoes.,{},,,Spudbros_1,,,True -Does beans go on first?,That's your choice! Either beans or cheese is a good option.,Mate... you can't be serious. Of course it's cheese first.,,Spudbros typically creates their baked potatoes by adding a layer of three-cheese blend before topping it with beans.,{},,,Spudbros_2,,,True -Can I have a spooky spud?,,"Nah mate, Halloween's past, ain't it?",,,{},,,,love me a spooky spud!,,False diff --git a/judgeval/data/datasets/20241111_175859.json b/judgeval/data/datasets/20241111_175859.json deleted file mode 100644 index 1f960881..00000000 --- a/judgeval/data/datasets/20241111_175859.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "ground_truths": [ - { - "input": "Can I have a spooky spud?", - "actual_output": null, - "expected_output": "Nah mate, Halloween's past, ain't it?", - "context": [], - "retrieval_context": [], - "additional_metadata": {}, - "comments": "love me a spooky spud!", - "tools_called": [], - "expected_tools": [], - "source_file": null - } - ], - "examples": [ - { - "input": "Hello, do you sell beans?", - "actual_output": "Yes! We sell beans 'n cheese.", - "expected_output": "Crikey! Of course, with a jacket potato too.", - "context": [], - "retrieval_context": [ - "Spudbros is a British business that sells baked potatoes." - ], - "additional_metadata": {}, - "tools_called": [], - "expected_tools": [], - "name": "Spudbros_1" - }, - { - "input": "Does beans go on first?", - "actual_output": "That's your choice! Either beans or cheese is a good option.", - "expected_output": "Mate... you can't be serious. Of course it's cheese first.", - "context": [], - "retrieval_context": [ - "Spudbros typically creates their baked potatoes by adding a layer of three-cheese blend before topping it with beans." - ], - "additional_metadata": {}, - "tools_called": [], - "expected_tools": [], - "name": "Spudbros_2" - } - ] -} \ No newline at end of file diff --git a/judgeval/data/datasets/sample.csv b/judgeval/data/datasets/sample.csv deleted file mode 100644 index 6db7c41b..00000000 --- a/judgeval/data/datasets/sample.csv +++ /dev/null @@ -1,4 +0,0 @@ -input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example -"Hello, do you sell beans?",Yes! We sell beans 'n cheese.,"Crikey! 
Of course, with a jacket potato too.",,Spudbros is a British business that sells baked potatoes.,,,,Spudbros_1,,,TRUE -Does beans go on first?,That's your choice! Either beans or cheese is a good option.,Mate... you can't be serious. Of course it's cheese first.,,Spudbros typically creates their baked potatoes by adding a layer of three-cheese blend before topping it with beans.,,,,Spudbros_2,,,TRUE -Can I have a spooky spud?,,"Nah mate, Halloween's past, ain't it?",,,,,,,love me a spooky spud!,,FALSE \ No newline at end of file diff --git a/tests/data/datasets/sample_data/dataset.csv b/tests/data/datasets/sample_data/dataset.csv new file mode 100644 index 00000000..d4f8a72f --- /dev/null +++ b/tests/data/datasets/sample_data/dataset.csv @@ -0,0 +1,3 @@ +input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example +test input,test output,expected output,context1;context2,retrieval1,{'key': 'value'},tool1,tool1;tool2,test example,,,True +test input,,expected output,context1,retrieval1,{'key': 'value'},tool1,tool1,,test comment,test.py,False diff --git a/tests/data/datasets/sample_data/dataset.json b/tests/data/datasets/sample_data/dataset.json new file mode 100644 index 00000000..e95afcfb --- /dev/null +++ b/tests/data/datasets/sample_data/dataset.json @@ -0,0 +1,53 @@ +{ + "ground_truths": [ + { + "input": "test input", + "actual_output": null, + "expected_output": "expected output", + "context": [ + "context1" + ], + "retrieval_context": [ + "retrieval1" + ], + "additional_metadata": { + "key": "value" + }, + "comments": "test comment", + "tools_called": [ + "tool1" + ], + "expected_tools": [ + "tool1" + ], + "source_file": "test.py" + } + ], + "examples": [ + { + "input": "test input", + "actual_output": "test output", + "expected_output": "expected output", + "context": [ + "context1", + "context2" + ], + "retrieval_context": [ + "retrieval1" + ], + "additional_metadata": { + "key": "value" + }, + "tools_called": [ + "tool1" + ], + "expected_tools": [ + "tool1", + "tool2" + ], + "name": "test example", + "example_id": null, + "timestamp": "20241230_145155" + } + ] +} \ No newline at end of file From 45593934f366a9a5580999f51a3511acbb77b8f2 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 16:35:23 -0800 Subject: [PATCH 03/10] Add trace ID to GroundTruthExample and integrate into Dataset helpers --- judgeval/data/datasets/dataset.py | 13 +++++++++---- judgeval/data/datasets/ground_truth.py | 7 +++++-- judgeval/data/example.py | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/judgeval/data/datasets/dataset.py b/judgeval/data/datasets/dataset.py index eaf4d7b7..9089ecee 100644 --- a/judgeval/data/datasets/dataset.py +++ b/judgeval/data/datasets/dataset.py @@ -195,13 +195,14 @@ def add_from_csv( "Please install pandas to use this method. 'pip install pandas'" ) - df = pd.read_csv(file_path) + df = pd.read_csv(file_path, dtype={'trace_id': str}) """ Expect the CSV to have headers "input", "actual_output", "expected_output", "context", \ "retrieval_context", "additional_metadata", "tools_called", \ - "expected_tools", "name", "comments", "source_file", "example" + "expected_tools", "name", "comments", "source_file", "example", \ + "trace_id" We want to collect the examples and ground truths separately which can be determined by the "example" column. 
If the value is True, then it is an @@ -222,8 +223,8 @@ def add_from_csv( "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(), "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [], "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [], + "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None } - if row["example"]: data["name"] = row["name"] if pd.notna(row["name"]) else None # every Example has `input` and `actual_output` fields @@ -233,6 +234,7 @@ def add_from_csv( else: raise ValueError("Every example must have an 'input' and 'actual_output' field.") else: + # GroundTruthExample has `comments` and `source_file` fields data["comments"] = row["comments"] if pd.notna(row["comments"]) else None data["source_file"] = row["source_file"] if pd.notna(row["source_file"]) else None # every GroundTruthExample has `input` field @@ -284,7 +286,8 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s writer.writerow([ "input", "actual_output", "expected_output", "context", \ "retrieval_context", "additional_metadata", "tools_called", \ - "expected_tools", "name", "comments", "source_file", "example" + "expected_tools", "name", "comments", "source_file", "example", \ + "trace_id" ]) for e in self.examples: writer.writerow( @@ -301,6 +304,7 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s None, # Example does not have comments None, # Example does not have source file True, # Adding an Example + e.trace_id ] ) @@ -319,6 +323,7 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s g.comments, g.source_file, False, # Adding a GroundTruthExample, not an Example + g.trace_id ] ) else: diff --git a/judgeval/data/datasets/ground_truth.py b/judgeval/data/datasets/ground_truth.py index c100919e..902869ad 100644 --- a/judgeval/data/datasets/ground_truth.py +++ b/judgeval/data/datasets/ground_truth.py @@ -19,6 +19,7 @@ class GroundTruthExample(BaseModel): tools_called: Optional[List[str]] = None expected_tools: Optional[List[str]] = None source_file: Optional[str] = None + trace_id: Optional[str] = None def to_dict(self): return { @@ -32,6 +33,7 @@ def to_dict(self): "tools_called": self.tools_called, "expected_tools": self.expected_tools, "source_file": self.source_file, + "trace_id": self.trace_id, } def __str__(self): @@ -46,6 +48,7 @@ def __str__(self): f"comments={self.comments}, " f"tools_called={self.tools_called}, " f"expected_tools={self.expected_tools}, " - f"source_file={self.source_file}" + f"source_file={self.source_file}, " + f"trace_id={self.trace_id}" f")" - ) + ) \ No newline at end of file diff --git a/judgeval/data/example.py b/judgeval/data/example.py index 38238f7a..74b541d5 100644 --- a/judgeval/data/example.py +++ b/judgeval/data/example.py @@ -56,6 +56,7 @@ def to_dict(self): "name": self.name, "example_id": self.example_id, "timestamp": self.timestamp, + "trace_id": self.trace_id } def __str__(self): From 75fe8e22d10da57c696f462fc0ca5f8e205b8686 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 16:36:02 -0800 Subject: [PATCH 04/10] Update UT for dataset loading from JSON/CSV with trace IDs --- tests/data/datasets/sample_data/dataset.csv | 6 +- tests/data/datasets/sample_data/dataset.json | 6 +- tests/data/datasets/test_dataset.py | 59 ++++++++++++++++++-- 3 files changed, 60 insertions(+), 11 deletions(-) diff --git 
a/tests/data/datasets/sample_data/dataset.csv b/tests/data/datasets/sample_data/dataset.csv index d4f8a72f..51b58db1 100644 --- a/tests/data/datasets/sample_data/dataset.csv +++ b/tests/data/datasets/sample_data/dataset.csv @@ -1,3 +1,3 @@ -input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example -test input,test output,expected output,context1;context2,retrieval1,{'key': 'value'},tool1,tool1;tool2,test example,,,True -test input,,expected output,context1,retrieval1,{'key': 'value'},tool1,tool1,,test comment,test.py,False +input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example,trace_id +test input,test output,expected output,context1;context2,retrieval1,{'key': 'value'},tool1,tool1;tool2,test example,,,True,123 +test input,,expected output,context1,retrieval1,{'key': 'value'},tool1,tool1,,test comment,test.py,False,094121 diff --git a/tests/data/datasets/sample_data/dataset.json b/tests/data/datasets/sample_data/dataset.json index e95afcfb..8a7b3e49 100644 --- a/tests/data/datasets/sample_data/dataset.json +++ b/tests/data/datasets/sample_data/dataset.json @@ -20,7 +20,8 @@ "expected_tools": [ "tool1" ], - "source_file": "test.py" + "source_file": "test.py", + "trace_id": "094121" } ], "examples": [ @@ -47,7 +48,8 @@ ], "name": "test example", "example_id": null, - "timestamp": "20241230_145155" + "timestamp": "20241230_160117", + "trace_id": "123" } ] } \ No newline at end of file diff --git a/tests/data/datasets/test_dataset.py b/tests/data/datasets/test_dataset.py index 177c6db4..0ece857c 100644 --- a/tests/data/datasets/test_dataset.py +++ b/tests/data/datasets/test_dataset.py @@ -128,7 +128,8 @@ def test_add_from_csv(mock_read_csv, dataset): 'name': ['name1', None], 'comments': [None, 'comment2'], 'source_file': [None, 'file2'], - 'example': [True, False] + 'example': [True, False], + 'trace_id': [None, '123'] }) mock_read_csv.return_value = mock_df @@ -188,7 +189,8 @@ def test_load_from_json(): additional_metadata={"key": "value"}, tools_called=["tool1"], expected_tools=["tool1", "tool2"], - name="test example" + name="test example", + trace_id="123" ) gt1 = GroundTruthExample( @@ -200,14 +202,26 @@ def test_load_from_json(): tools_called=["tool1"], expected_tools=["tool1"], comments="test comment", - source_file="test.py" + source_file="test.py", + trace_id="094121" ) dataset = EvalDataset() dataset.add_from_json("tests/data/datasets/sample_data/dataset.json") assert dataset.ground_truths == [gt1] - assert dataset.examples == [ex1] + assert len(dataset.examples) == 1 + loaded_example = dataset.examples[0] + assert loaded_example.input == ex1.input + assert loaded_example.actual_output == ex1.actual_output + assert loaded_example.expected_output == ex1.expected_output + assert loaded_example.context == ex1.context + assert loaded_example.retrieval_context == ex1.retrieval_context + assert loaded_example.additional_metadata == ex1.additional_metadata + assert loaded_example.tools_called == ex1.tools_called + assert loaded_example.expected_tools == ex1.expected_tools + assert loaded_example.name == ex1.name + assert loaded_example.trace_id == ex1.trace_id def test_load_from_csv(): @@ -220,7 +234,8 @@ def test_load_from_csv(): additional_metadata={"key": "value"}, tools_called=["tool1"], expected_tools=["tool1", "tool2"], - name="test example" + name="test example", + trace_id="123" ) gt1 = GroundTruthExample( @@ 
-232,7 +247,8 @@ def test_load_from_csv(): tools_called=["tool1"], expected_tools=["tool1"], comments="test comment", - source_file="test.py" + source_file="test.py", + trace_id="094121" ) dataset = EvalDataset() @@ -240,3 +256,34 @@ def test_load_from_csv(): dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv") assert dataset.ground_truths == [gt1] assert dataset.examples == [ex1] + + +if __name__ == "__main__": + ex1 = Example( + input="test input", + actual_output="test output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1", "tool2"], + name="test example", + trace_id="123" + ) + + gt1 = GroundTruthExample( + input="test input", + expected_output="expected output", + context=["context1"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1"], + comments="test comment", + source_file="test.py", + trace_id="094121" + ) + dataset = EvalDataset(ground_truths=[gt1], examples=[ex1]) + dataset.save_as("json", "tests/data/datasets/sample_data/", "dataset") + dataset.save_as("csv", "tests/data/datasets/sample_data/", "dataset") From 5bed97f51624aafd9ee9e623010fb2fb57aa4c24 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 16:37:08 -0800 Subject: [PATCH 05/10] Refactor; remove bottom main code and add documentation. --- tests/data/datasets/test_dataset.py | 33 ++--------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/tests/data/datasets/test_dataset.py b/tests/data/datasets/test_dataset.py index 0ece857c..0db5a4d8 100644 --- a/tests/data/datasets/test_dataset.py +++ b/tests/data/datasets/test_dataset.py @@ -210,6 +210,8 @@ def test_load_from_json(): dataset.add_from_json("tests/data/datasets/sample_data/dataset.json") assert dataset.ground_truths == [gt1] + + # We can't do the same comparison as above because the timestamps are different assert len(dataset.examples) == 1 loaded_example = dataset.examples[0] assert loaded_example.input == ex1.input @@ -256,34 +258,3 @@ def test_load_from_csv(): dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv") assert dataset.ground_truths == [gt1] assert dataset.examples == [ex1] - - -if __name__ == "__main__": - ex1 = Example( - input="test input", - actual_output="test output", - expected_output="expected output", - context=["context1", "context2"], - retrieval_context=["retrieval1"], - additional_metadata={"key": "value"}, - tools_called=["tool1"], - expected_tools=["tool1", "tool2"], - name="test example", - trace_id="123" - ) - - gt1 = GroundTruthExample( - input="test input", - expected_output="expected output", - context=["context1"], - retrieval_context=["retrieval1"], - additional_metadata={"key": "value"}, - tools_called=["tool1"], - expected_tools=["tool1"], - comments="test comment", - source_file="test.py", - trace_id="094121" - ) - dataset = EvalDataset(ground_truths=[gt1], examples=[ex1]) - dataset.save_as("json", "tests/data/datasets/sample_data/", "dataset") - dataset.save_as("csv", "tests/data/datasets/sample_data/", "dataset") From 07f06f39c276dcb989bfa938ee9035fc7e39d2f2 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 16:54:36 -0800 Subject: [PATCH 06/10] Add documentation string for JSON helper func to specify example JSON format. 
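
As a quick illustration of the documented format, a hedged sketch of writing a compatible file by hand (only a minimal subset of fields is shown; whether sparsely populated entries load cleanly depends on the model defaults, so treat this as a sketch rather than a guarantee):

    import json

    payload = {
        "ground_truths": [{"input": "test input", "expected_output": "expected output"}],
        "examples": [{"input": "test input", "actual_output": "test output"}],
    }
    with open("sample_dataset.json", "w") as f:
        json.dump(payload, f, indent=4)  # top-level keys mirror the docstring added below
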
--- judgeval/data/datasets/dataset.py | 59 ++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/judgeval/data/datasets/dataset.py b/judgeval/data/datasets/dataset.py index 9089ecee..8279e43f 100644 --- a/judgeval/data/datasets/dataset.py +++ b/judgeval/data/datasets/dataset.py @@ -158,7 +158,64 @@ def add_from_json(self, file_path: str) -> None: Adds examples and ground truths from a JSON file. The format of the JSON file is expected to be a dictionary with two keys: "examples" and "ground_truths". - The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth. + The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth. + + The JSON file is expected to have the following format: + { + "ground_truths": [ + { + "input": "test input", + "actual_output": null, + "expected_output": "expected output", + "context": [ + "context1" + ], + "retrieval_context": [ + "retrieval1" + ], + "additional_metadata": { + "key": "value" + }, + "comments": "test comment", + "tools_called": [ + "tool1" + ], + "expected_tools": [ + "tool1" + ], + "source_file": "test.py", + "trace_id": "094121" + } + ], + "examples": [ + { + "input": "test input", + "actual_output": "test output", + "expected_output": "expected output", + "context": [ + "context1", + "context2" + ], + "retrieval_context": [ + "retrieval1" + ], + "additional_metadata": { + "key": "value" + }, + "tools_called": [ + "tool1" + ], + "expected_tools": [ + "tool1", + "tool2" + ], + "name": "test example", + "example_id": null, + "timestamp": "20241230_160117", + "trace_id": "123" + } + ] + } """ try: with open(file_path, "r") as file: From b20a058f51b1265caddffb55ccf7bfe9efffa303 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 16:59:30 -0800 Subject: [PATCH 07/10] Update GroundTruthExample UTs to reflect TraceID being added to the class --- tests/data/datasets/test_ground_truth.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/data/datasets/test_ground_truth.py b/tests/data/datasets/test_ground_truth.py index 58f58c52..bf486d18 100644 --- a/tests/data/datasets/test_ground_truth.py +++ b/tests/data/datasets/test_ground_truth.py @@ -56,6 +56,7 @@ def test_ground_truth_example_to_dict(): "tools_called": None, "expected_tools": None, "source_file": None, + "trace_id": None } assert example.to_dict() == expected_dict @@ -79,7 +80,8 @@ def test_ground_truth_example_str_representation(): "comments=None, " "tools_called=None, " "expected_tools=None, " - "source_file=None)" + "source_file=None, " + "trace_id=None)" ) assert str(example) == expected_str From bc372aba7d69562967e96b6d5882bcfe0779ac34 Mon Sep 17 00:00:00 2001 From: JCamyre Date: Mon, 30 Dec 2024 17:18:19 -0800 Subject: [PATCH 08/10] Add comment clarifying why we specify 'trace_id' as a str when reading csv's using pandas. --- judgeval/data/datasets/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/judgeval/data/datasets/dataset.py b/judgeval/data/datasets/dataset.py index 8279e43f..abaebbc4 100644 --- a/judgeval/data/datasets/dataset.py +++ b/judgeval/data/datasets/dataset.py @@ -252,6 +252,7 @@ def add_from_csv( "Please install pandas to use this method. 
'pip install pandas'" ) + # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior) df = pd.read_csv(file_path, dtype={'trace_id': str}) """ Expect the CSV to have headers From 532240019dde040132acf3ae6dcdee48899f0b01 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 17:56:54 -0800 Subject: [PATCH 09/10] Edit UT to pass GH requirements >:) --- tests/scorers/test_prompt_scorer.py | 56 +++++++++++++++-------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/tests/scorers/test_prompt_scorer.py b/tests/scorers/test_prompt_scorer.py index 4bdf3e89..8675d71e 100644 --- a/tests/scorers/test_prompt_scorer.py +++ b/tests/scorers/test_prompt_scorer.py @@ -27,6 +27,10 @@ def mock_model(): # Simple implementation of PromptScorer for testing class SampleScorer(PromptScorer): + def __init__(self, mock_model, *args, **kwargs): + super().__init__(*args, **kwargs) + self.model = mock_model + def build_measure_prompt(self, example: Example) -> List[dict]: return [ {"role": "system", "content": "Test system prompt"}, @@ -44,19 +48,19 @@ def success_check(self, **kwargs) -> bool: # Tests for PromptScorer class TestPromptScorer: - def test_init(self): - scorer = SampleScorer("test_scorer") + def test_init(self, mock_model): + scorer = SampleScorer(name="test_scorer", mock_model=mock_model) assert scorer.name == "test_scorer" assert scorer.threshold == 0.5 assert scorer.include_reason is True assert scorer.async_mode is True - def test_init_strict_mode(self): - scorer = SampleScorer("test_scorer", strict_mode=True) + def test_init_strict_mode(self, mock_model): + scorer = SampleScorer(name="test_scorer", mock_model=mock_model, strict_mode=True) assert scorer.threshold == 1 - def test_enforce_prompt_format(self): - scorer = SampleScorer("test_scorer") + def test_enforce_prompt_format(self, mock_model): + scorer = SampleScorer(name="test_scorer", mock_model=mock_model) prompt = [{"role": "system", "content": "Base prompt"}] schema = {"score": float, "reason": str} @@ -65,23 +69,21 @@ def test_enforce_prompt_format(self): assert '"score": (float)' in formatted[0]["content"] assert '"reason": (str)' in formatted[0]["content"] - def test_enforce_prompt_format_invalid_input(self): - scorer = SampleScorer("test_scorer") + def test_enforce_prompt_format_invalid_input(self, mock_model): + scorer = SampleScorer(name="test_scorer", mock_model=mock_model) with pytest.raises(TypeError): scorer.enforce_prompt_format("invalid", {}) @pytest.mark.asyncio async def test_a_score_example(self, example, mock_model): - scorer = SampleScorer("test_scorer") - scorer.model = mock_model + scorer = SampleScorer(name="test_scorer", mock_model=mock_model) result = await scorer.a_score_example(example, _show_indicator=False) assert result == 0.8 assert scorer.reason == "Test reason" def test_score_example_sync(self, example, mock_model): - scorer = SampleScorer("test_scorer", async_mode=False) - scorer.model = mock_model + scorer = SampleScorer(name="test_scorer", mock_model=mock_model, async_mode=False) result = scorer.score_example(example, _show_indicator=False) assert result == 0.8 @@ -102,18 +104,18 @@ def classifier_options(self): def test_classifier_init(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( - "test_classifier", - classifier_conversation, - classifier_options + name="test_classifier", + conversation=classifier_conversation, + options=classifier_options ) assert scorer.conversation == classifier_conversation assert 
scorer.options == classifier_options def test_build_measure_prompt(self, example, classifier_conversation, classifier_options): scorer = ClassifierScorer( - "test_classifier", - classifier_conversation, - classifier_options + name="test_classifier", + conversation=classifier_conversation, + options=classifier_options ) prompt = scorer.build_measure_prompt(example) @@ -121,9 +123,9 @@ def test_build_measure_prompt(self, example, classifier_conversation, classifier def test_process_response(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( - "test_classifier", - classifier_conversation, - classifier_options + name="test_classifier", + conversation=classifier_conversation, + options=classifier_options ) response = {"choice": "positive", "reason": "Test reason"} @@ -133,9 +135,9 @@ def test_process_response(self, classifier_conversation, classifier_options): def test_process_response_invalid_choice(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( - "test_classifier", - classifier_conversation, - classifier_options + name="test_classifier", + conversation=classifier_conversation, + options=classifier_options ) response = {"choice": "invalid", "reason": "Test reason"} @@ -144,9 +146,9 @@ def test_process_response_invalid_choice(self, classifier_conversation, classifi def test_success_check(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( - "test_classifier", - classifier_conversation, - classifier_options + name="test_classifier", + conversation=classifier_conversation, + options=classifier_options ) scorer.score = 1.0 From caeaa0b8051fa0468ea97516914cf0cb5d42d79f Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Wed, 1 Jan 2025 19:14:35 -0800 Subject: [PATCH 10/10] Fix broken PromptScorer/Classifier Scorer UTs. Pydantic attribute issues are resolved and now UTs pass. 
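
Background, as a self-contained sketch of the Pydantic behavior this commit works around (the class below is illustrative and not part of judgeval): pydantic models reject assignment to attributes that were never declared as fields, so anything set after construction, such as the mocked model or the cached response, has to be declared on the class.

    from typing import Any, Optional
    from pydantic import BaseModel, Field

    class DemoScorer(BaseModel):
        name: str
        model: Any = Field(default=None)  # declared so it can be assigned after construction
        response: Optional[dict] = None   # likewise for fields set while scoring

    s = DemoScorer(name="demo")
    s.model = object()   # fine: the field is declared
    # s.undeclared = 1   # raises ValueError because the field was never declared
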
--- judgeval/scorers/prompt_scorer.py | 5 +++++ tests/scorers/test_prompt_scorer.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/judgeval/scorers/prompt_scorer.py b/judgeval/scorers/prompt_scorer.py index 7971500a..b1829afe 100644 --- a/judgeval/scorers/prompt_scorer.py +++ b/judgeval/scorers/prompt_scorer.py @@ -47,6 +47,10 @@ class PromptScorer(CustomScorer, BaseModel): score_type: str threshold: float = Field(default=0.5) using_native_model: bool = Field(default=True) + + # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD + response: Optional[dict] = None + result: Optional[float] = None def __init__( self, @@ -295,6 +299,7 @@ def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapp BaseModel.__init__( self, name=name, + slug=slug, score_type=name, conversation=conversation, options=options, diff --git a/tests/scorers/test_prompt_scorer.py b/tests/scorers/test_prompt_scorer.py index 8675d71e..e5e7e9ed 100644 --- a/tests/scorers/test_prompt_scorer.py +++ b/tests/scorers/test_prompt_scorer.py @@ -1,6 +1,7 @@ import pytest +from pydantic import BaseModel, Field from unittest.mock import MagicMock, AsyncMock -from typing import List, Dict +from typing import List, Dict, Any from judgeval.data import Example from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer @@ -27,6 +28,9 @@ def mock_model(): # Simple implementation of PromptScorer for testing class SampleScorer(PromptScorer): + + model: Any = Field(default=None) + def __init__(self, mock_model, *args, **kwargs): super().__init__(*args, **kwargs) self.model = mock_model @@ -105,6 +109,7 @@ def classifier_options(self): def test_classifier_init(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( name="test_classifier", + slug="test_classifier_slug", conversation=classifier_conversation, options=classifier_options ) @@ -114,6 +119,7 @@ def test_classifier_init(self, classifier_conversation, classifier_options): def test_build_measure_prompt(self, example, classifier_conversation, classifier_options): scorer = ClassifierScorer( name="test_classifier", + slug="test_classifier_slug", conversation=classifier_conversation, options=classifier_options ) @@ -124,6 +130,7 @@ def test_build_measure_prompt(self, example, classifier_conversation, classifier def test_process_response(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( name="test_classifier", + slug="test_classifier_slug", conversation=classifier_conversation, options=classifier_options ) @@ -136,6 +143,7 @@ def test_process_response(self, classifier_conversation, classifier_options): def test_process_response_invalid_choice(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( name="test_classifier", + slug="test_classifier_slug", conversation=classifier_conversation, options=classifier_options ) @@ -147,6 +155,7 @@ def test_process_response_invalid_choice(self, classifier_conversation, classifi def test_success_check(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( name="test_classifier", + slug="test_classifier_slug", conversation=classifier_conversation, options=classifier_options )
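
For reference, with the keyword-argument changes in PATCH 09/10, constructing a ClassifierScorer directly looks roughly like this hedged sketch (argument names follow the updated tests; the conversation and options values are placeholders, not fixtures from this repo):

    from judgeval.scorers.prompt_scorer import ClassifierScorer

    scorer = ClassifierScorer(
        name="test_classifier",
        slug="test_classifier_slug",
        conversation=[{"role": "system", "content": "Classify the response as positive or negative."}],
        options={"positive": 1.0, "negative": 0.0},
    )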