From 3d8f2dba7cf686da717b44d837ff8b107112c9cd Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 15:21:29 -0800 Subject: [PATCH 01/10] Add two new UTs: loading from JSON and CSV. --- judgeval/data/datasets/dataset.py | 3 ++ tests/data/datasets/test_dataset.py | 65 +++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/judgeval/data/datasets/dataset.py b/judgeval/data/datasets/dataset.py index 6e779451..eaf4d7b7 100644 --- a/judgeval/data/datasets/dataset.py +++ b/judgeval/data/datasets/dataset.py @@ -206,6 +206,9 @@ def add_from_csv( We want to collect the examples and ground truths separately which can be determined by the "example" column. If the value is True, then it is an example, otherwise it is a ground truth. + + We also assume that if there are multiple retrieval contexts or contexts, they are separated by semicolons. + This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters. """ examples, ground_truths = [], [] diff --git a/tests/data/datasets/test_dataset.py b/tests/data/datasets/test_dataset.py index 32a307fa..177c6db4 100644 --- a/tests/data/datasets/test_dataset.py +++ b/tests/data/datasets/test_dataset.py @@ -175,3 +175,68 @@ def test_str_representation(dataset, sample_example, sample_ground_truth): assert "EvalDataset" in str_rep assert "ground_truths" in str_rep assert "examples" in str_rep + +# new UTs for dataset UX testing + +def test_load_from_json(): + ex1 = Example( + input="test input", + actual_output="test output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1", "tool2"], + name="test example" + ) + + gt1 = GroundTruthExample( + input="test input", + expected_output="expected output", + context=["context1"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1"], + comments="test comment", + source_file="test.py" + ) + + dataset = EvalDataset() + + dataset.add_from_json("tests/data/datasets/sample_data/dataset.json") + assert dataset.ground_truths == [gt1] + assert dataset.examples == [ex1] + + +def test_load_from_csv(): + ex1 = Example( + input="test input", + actual_output="test output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1", "tool2"], + name="test example" + ) + + gt1 = GroundTruthExample( + input="test input", + expected_output="expected output", + context=["context1"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1"], + comments="test comment", + source_file="test.py" + ) + + dataset = EvalDataset() + + dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv") + assert dataset.ground_truths == [gt1] + assert dataset.examples == [ex1] From 0e422847c6040ba3c63b919f87c434131f6ec210 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 15:22:14 -0800 Subject: [PATCH 02/10] Add sample test data for UTs loading from JSON/CSV --> dataset. 
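
For context, a minimal sketch of how the new fixture files are exercised by the tests added in PATCH 01 (the EvalDataset import path is an assumption based on the package layout; the file paths and attribute names mirror the test code):

    from judgeval.data.datasets import EvalDataset  # import path assumed

    dataset = EvalDataset()
    dataset.add_from_json("tests/data/datasets/sample_data/dataset.json")
    dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv")
    # dataset.examples and dataset.ground_truths now hold the parsed fixture entries.
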
--- judgeval/data/datasets/20241111_175859.csv | 4 -- judgeval/data/datasets/20241111_175859.json | 44 ---------------- judgeval/data/datasets/sample.csv | 4 -- tests/data/datasets/sample_data/dataset.csv | 3 ++ tests/data/datasets/sample_data/dataset.json | 53 ++++++++++++++++++++ 5 files changed, 56 insertions(+), 52 deletions(-) delete mode 100644 judgeval/data/datasets/20241111_175859.csv delete mode 100644 judgeval/data/datasets/20241111_175859.json delete mode 100644 judgeval/data/datasets/sample.csv create mode 100644 tests/data/datasets/sample_data/dataset.csv create mode 100644 tests/data/datasets/sample_data/dataset.json diff --git a/judgeval/data/datasets/20241111_175859.csv b/judgeval/data/datasets/20241111_175859.csv deleted file mode 100644 index 5ebd7667..00000000 --- a/judgeval/data/datasets/20241111_175859.csv +++ /dev/null @@ -1,4 +0,0 @@ -input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example -"Hello, do you sell beans?",Yes! We sell beans 'n cheese.,"Crikey! Of course, with a jacket potato too.",,Spudbros is a British business that sells baked potatoes.,{},,,Spudbros_1,,,True -Does beans go on first?,That's your choice! Either beans or cheese is a good option.,Mate... you can't be serious. Of course it's cheese first.,,Spudbros typically creates their baked potatoes by adding a layer of three-cheese blend before topping it with beans.,{},,,Spudbros_2,,,True -Can I have a spooky spud?,,"Nah mate, Halloween's past, ain't it?",,,{},,,,love me a spooky spud!,,False diff --git a/judgeval/data/datasets/20241111_175859.json b/judgeval/data/datasets/20241111_175859.json deleted file mode 100644 index 1f960881..00000000 --- a/judgeval/data/datasets/20241111_175859.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "ground_truths": [ - { - "input": "Can I have a spooky spud?", - "actual_output": null, - "expected_output": "Nah mate, Halloween's past, ain't it?", - "context": [], - "retrieval_context": [], - "additional_metadata": {}, - "comments": "love me a spooky spud!", - "tools_called": [], - "expected_tools": [], - "source_file": null - } - ], - "examples": [ - { - "input": "Hello, do you sell beans?", - "actual_output": "Yes! We sell beans 'n cheese.", - "expected_output": "Crikey! Of course, with a jacket potato too.", - "context": [], - "retrieval_context": [ - "Spudbros is a British business that sells baked potatoes." - ], - "additional_metadata": {}, - "tools_called": [], - "expected_tools": [], - "name": "Spudbros_1" - }, - { - "input": "Does beans go on first?", - "actual_output": "That's your choice! Either beans or cheese is a good option.", - "expected_output": "Mate... you can't be serious. Of course it's cheese first.", - "context": [], - "retrieval_context": [ - "Spudbros typically creates their baked potatoes by adding a layer of three-cheese blend before topping it with beans." - ], - "additional_metadata": {}, - "tools_called": [], - "expected_tools": [], - "name": "Spudbros_2" - } - ] -} \ No newline at end of file diff --git a/judgeval/data/datasets/sample.csv b/judgeval/data/datasets/sample.csv deleted file mode 100644 index 6db7c41b..00000000 --- a/judgeval/data/datasets/sample.csv +++ /dev/null @@ -1,4 +0,0 @@ -input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example -"Hello, do you sell beans?",Yes! We sell beans 'n cheese.,"Crikey! 
Of course, with a jacket potato too.",,Spudbros is a British business that sells baked potatoes.,,,,Spudbros_1,,,TRUE -Does beans go on first?,That's your choice! Either beans or cheese is a good option.,Mate... you can't be serious. Of course it's cheese first.,,Spudbros typically creates their baked potatoes by adding a layer of three-cheese blend before topping it with beans.,,,,Spudbros_2,,,TRUE -Can I have a spooky spud?,,"Nah mate, Halloween's past, ain't it?",,,,,,,love me a spooky spud!,,FALSE \ No newline at end of file diff --git a/tests/data/datasets/sample_data/dataset.csv b/tests/data/datasets/sample_data/dataset.csv new file mode 100644 index 00000000..d4f8a72f --- /dev/null +++ b/tests/data/datasets/sample_data/dataset.csv @@ -0,0 +1,3 @@ +input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example +test input,test output,expected output,context1;context2,retrieval1,{'key': 'value'},tool1,tool1;tool2,test example,,,True +test input,,expected output,context1,retrieval1,{'key': 'value'},tool1,tool1,,test comment,test.py,False diff --git a/tests/data/datasets/sample_data/dataset.json b/tests/data/datasets/sample_data/dataset.json new file mode 100644 index 00000000..e95afcfb --- /dev/null +++ b/tests/data/datasets/sample_data/dataset.json @@ -0,0 +1,53 @@ +{ + "ground_truths": [ + { + "input": "test input", + "actual_output": null, + "expected_output": "expected output", + "context": [ + "context1" + ], + "retrieval_context": [ + "retrieval1" + ], + "additional_metadata": { + "key": "value" + }, + "comments": "test comment", + "tools_called": [ + "tool1" + ], + "expected_tools": [ + "tool1" + ], + "source_file": "test.py" + } + ], + "examples": [ + { + "input": "test input", + "actual_output": "test output", + "expected_output": "expected output", + "context": [ + "context1", + "context2" + ], + "retrieval_context": [ + "retrieval1" + ], + "additional_metadata": { + "key": "value" + }, + "tools_called": [ + "tool1" + ], + "expected_tools": [ + "tool1", + "tool2" + ], + "name": "test example", + "example_id": null, + "timestamp": "20241230_145155" + } + ] +} \ No newline at end of file From 45593934f366a9a5580999f51a3511acbb77b8f2 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 16:35:23 -0800 Subject: [PATCH 03/10] Add trace ID to GroundTruthExample and integrate into Dataset helpers --- judgeval/data/datasets/dataset.py | 13 +++++++++---- judgeval/data/datasets/ground_truth.py | 7 +++++-- judgeval/data/example.py | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/judgeval/data/datasets/dataset.py b/judgeval/data/datasets/dataset.py index eaf4d7b7..9089ecee 100644 --- a/judgeval/data/datasets/dataset.py +++ b/judgeval/data/datasets/dataset.py @@ -195,13 +195,14 @@ def add_from_csv( "Please install pandas to use this method. 'pip install pandas'" ) - df = pd.read_csv(file_path) + df = pd.read_csv(file_path, dtype={'trace_id': str}) """ Expect the CSV to have headers "input", "actual_output", "expected_output", "context", \ "retrieval_context", "additional_metadata", "tools_called", \ - "expected_tools", "name", "comments", "source_file", "example" + "expected_tools", "name", "comments", "source_file", "example", \ + "trace_id" We want to collect the examples and ground truths separately which can be determined by the "example" column. 
If the value is True, then it is an @@ -222,8 +223,8 @@ def add_from_csv( "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(), "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [], "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [], + "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None } - if row["example"]: data["name"] = row["name"] if pd.notna(row["name"]) else None # every Example has `input` and `actual_output` fields @@ -233,6 +234,7 @@ def add_from_csv( else: raise ValueError("Every example must have an 'input' and 'actual_output' field.") else: + # GroundTruthExample has `comments` and `source_file` fields data["comments"] = row["comments"] if pd.notna(row["comments"]) else None data["source_file"] = row["source_file"] if pd.notna(row["source_file"]) else None # every GroundTruthExample has `input` field @@ -284,7 +286,8 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s writer.writerow([ "input", "actual_output", "expected_output", "context", \ "retrieval_context", "additional_metadata", "tools_called", \ - "expected_tools", "name", "comments", "source_file", "example" + "expected_tools", "name", "comments", "source_file", "example", \ + "trace_id" ]) for e in self.examples: writer.writerow( @@ -301,6 +304,7 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s None, # Example does not have comments None, # Example does not have source file True, # Adding an Example + e.trace_id ] ) @@ -319,6 +323,7 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s g.comments, g.source_file, False, # Adding a GroundTruthExample, not an Example + g.trace_id ] ) else: diff --git a/judgeval/data/datasets/ground_truth.py b/judgeval/data/datasets/ground_truth.py index c100919e..902869ad 100644 --- a/judgeval/data/datasets/ground_truth.py +++ b/judgeval/data/datasets/ground_truth.py @@ -19,6 +19,7 @@ class GroundTruthExample(BaseModel): tools_called: Optional[List[str]] = None expected_tools: Optional[List[str]] = None source_file: Optional[str] = None + trace_id: Optional[str] = None def to_dict(self): return { @@ -32,6 +33,7 @@ def to_dict(self): "tools_called": self.tools_called, "expected_tools": self.expected_tools, "source_file": self.source_file, + "trace_id": self.trace_id, } def __str__(self): @@ -46,6 +48,7 @@ def __str__(self): f"comments={self.comments}, " f"tools_called={self.tools_called}, " f"expected_tools={self.expected_tools}, " - f"source_file={self.source_file}" + f"source_file={self.source_file}, " + f"trace_id={self.trace_id}" f")" - ) + ) \ No newline at end of file diff --git a/judgeval/data/example.py b/judgeval/data/example.py index 38238f7a..74b541d5 100644 --- a/judgeval/data/example.py +++ b/judgeval/data/example.py @@ -56,6 +56,7 @@ def to_dict(self): "name": self.name, "example_id": self.example_id, "timestamp": self.timestamp, + "trace_id": self.trace_id } def __str__(self): From 75fe8e22d10da57c696f462fc0ca5f8e205b8686 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 16:36:02 -0800 Subject: [PATCH 04/10] Update UT for dataset loading from JSON/CSV with trace IDs --- tests/data/datasets/sample_data/dataset.csv | 6 +- tests/data/datasets/sample_data/dataset.json | 6 +- tests/data/datasets/test_dataset.py | 59 ++++++++++++++++++-- 3 files changed, 60 insertions(+), 11 deletions(-) diff --git 
a/tests/data/datasets/sample_data/dataset.csv b/tests/data/datasets/sample_data/dataset.csv index d4f8a72f..51b58db1 100644 --- a/tests/data/datasets/sample_data/dataset.csv +++ b/tests/data/datasets/sample_data/dataset.csv @@ -1,3 +1,3 @@ -input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example -test input,test output,expected output,context1;context2,retrieval1,{'key': 'value'},tool1,tool1;tool2,test example,,,True -test input,,expected output,context1,retrieval1,{'key': 'value'},tool1,tool1,,test comment,test.py,False +input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example,trace_id +test input,test output,expected output,context1;context2,retrieval1,{'key': 'value'},tool1,tool1;tool2,test example,,,True,123 +test input,,expected output,context1,retrieval1,{'key': 'value'},tool1,tool1,,test comment,test.py,False,094121 diff --git a/tests/data/datasets/sample_data/dataset.json b/tests/data/datasets/sample_data/dataset.json index e95afcfb..8a7b3e49 100644 --- a/tests/data/datasets/sample_data/dataset.json +++ b/tests/data/datasets/sample_data/dataset.json @@ -20,7 +20,8 @@ "expected_tools": [ "tool1" ], - "source_file": "test.py" + "source_file": "test.py", + "trace_id": "094121" } ], "examples": [ @@ -47,7 +48,8 @@ ], "name": "test example", "example_id": null, - "timestamp": "20241230_145155" + "timestamp": "20241230_160117", + "trace_id": "123" } ] } \ No newline at end of file diff --git a/tests/data/datasets/test_dataset.py b/tests/data/datasets/test_dataset.py index 177c6db4..0ece857c 100644 --- a/tests/data/datasets/test_dataset.py +++ b/tests/data/datasets/test_dataset.py @@ -128,7 +128,8 @@ def test_add_from_csv(mock_read_csv, dataset): 'name': ['name1', None], 'comments': [None, 'comment2'], 'source_file': [None, 'file2'], - 'example': [True, False] + 'example': [True, False], + 'trace_id': [None, '123'] }) mock_read_csv.return_value = mock_df @@ -188,7 +189,8 @@ def test_load_from_json(): additional_metadata={"key": "value"}, tools_called=["tool1"], expected_tools=["tool1", "tool2"], - name="test example" + name="test example", + trace_id="123" ) gt1 = GroundTruthExample( @@ -200,14 +202,26 @@ def test_load_from_json(): tools_called=["tool1"], expected_tools=["tool1"], comments="test comment", - source_file="test.py" + source_file="test.py", + trace_id="094121" ) dataset = EvalDataset() dataset.add_from_json("tests/data/datasets/sample_data/dataset.json") assert dataset.ground_truths == [gt1] - assert dataset.examples == [ex1] + assert len(dataset.examples) == 1 + loaded_example = dataset.examples[0] + assert loaded_example.input == ex1.input + assert loaded_example.actual_output == ex1.actual_output + assert loaded_example.expected_output == ex1.expected_output + assert loaded_example.context == ex1.context + assert loaded_example.retrieval_context == ex1.retrieval_context + assert loaded_example.additional_metadata == ex1.additional_metadata + assert loaded_example.tools_called == ex1.tools_called + assert loaded_example.expected_tools == ex1.expected_tools + assert loaded_example.name == ex1.name + assert loaded_example.trace_id == ex1.trace_id def test_load_from_csv(): @@ -220,7 +234,8 @@ def test_load_from_csv(): additional_metadata={"key": "value"}, tools_called=["tool1"], expected_tools=["tool1", "tool2"], - name="test example" + name="test example", + trace_id="123" ) gt1 = GroundTruthExample( @@ 
-232,7 +247,8 @@ def test_load_from_csv(): tools_called=["tool1"], expected_tools=["tool1"], comments="test comment", - source_file="test.py" + source_file="test.py", + trace_id="094121" ) dataset = EvalDataset() @@ -240,3 +256,34 @@ def test_load_from_csv(): dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv") assert dataset.ground_truths == [gt1] assert dataset.examples == [ex1] + + +if __name__ == "__main__": + ex1 = Example( + input="test input", + actual_output="test output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1", "tool2"], + name="test example", + trace_id="123" + ) + + gt1 = GroundTruthExample( + input="test input", + expected_output="expected output", + context=["context1"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1"], + comments="test comment", + source_file="test.py", + trace_id="094121" + ) + dataset = EvalDataset(ground_truths=[gt1], examples=[ex1]) + dataset.save_as("json", "tests/data/datasets/sample_data/", "dataset") + dataset.save_as("csv", "tests/data/datasets/sample_data/", "dataset") From 5bed97f51624aafd9ee9e623010fb2fb57aa4c24 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 16:37:08 -0800 Subject: [PATCH 05/10] Refactor; remove bottom main code and add documentation. --- tests/data/datasets/test_dataset.py | 33 ++--------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/tests/data/datasets/test_dataset.py b/tests/data/datasets/test_dataset.py index 0ece857c..0db5a4d8 100644 --- a/tests/data/datasets/test_dataset.py +++ b/tests/data/datasets/test_dataset.py @@ -210,6 +210,8 @@ def test_load_from_json(): dataset.add_from_json("tests/data/datasets/sample_data/dataset.json") assert dataset.ground_truths == [gt1] + + # We can't do the same comparison as above because the timestamps are different assert len(dataset.examples) == 1 loaded_example = dataset.examples[0] assert loaded_example.input == ex1.input @@ -256,34 +258,3 @@ def test_load_from_csv(): dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv") assert dataset.ground_truths == [gt1] assert dataset.examples == [ex1] - - -if __name__ == "__main__": - ex1 = Example( - input="test input", - actual_output="test output", - expected_output="expected output", - context=["context1", "context2"], - retrieval_context=["retrieval1"], - additional_metadata={"key": "value"}, - tools_called=["tool1"], - expected_tools=["tool1", "tool2"], - name="test example", - trace_id="123" - ) - - gt1 = GroundTruthExample( - input="test input", - expected_output="expected output", - context=["context1"], - retrieval_context=["retrieval1"], - additional_metadata={"key": "value"}, - tools_called=["tool1"], - expected_tools=["tool1"], - comments="test comment", - source_file="test.py", - trace_id="094121" - ) - dataset = EvalDataset(ground_truths=[gt1], examples=[ex1]) - dataset.save_as("json", "tests/data/datasets/sample_data/", "dataset") - dataset.save_as("csv", "tests/data/datasets/sample_data/", "dataset") From 07f06f39c276dcb989bfa938ee9035fc7e39d2f2 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 16:54:36 -0800 Subject: [PATCH 06/10] Add documentation string for JSON helper func to specify example JSON format. 
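
As a quick illustration of the documented format, a hedged sketch of writing a compatible file by hand (only a minimal subset of fields is shown; whether sparsely populated entries load cleanly depends on the model defaults, so treat this as a sketch rather than a guarantee):

    import json

    payload = {
        "ground_truths": [{"input": "test input", "expected_output": "expected output"}],
        "examples": [{"input": "test input", "actual_output": "test output"}],
    }
    with open("sample_dataset.json", "w") as f:
        json.dump(payload, f, indent=4)  # top-level keys mirror the docstring added below
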
--- judgeval/data/datasets/dataset.py | 59 ++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/judgeval/data/datasets/dataset.py b/judgeval/data/datasets/dataset.py index 9089ecee..8279e43f 100644 --- a/judgeval/data/datasets/dataset.py +++ b/judgeval/data/datasets/dataset.py @@ -158,7 +158,64 @@ def add_from_json(self, file_path: str) -> None: Adds examples and ground truths from a JSON file. The format of the JSON file is expected to be a dictionary with two keys: "examples" and "ground_truths". - The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth. + The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth. + + The JSON file is expected to have the following format: + { + "ground_truths": [ + { + "input": "test input", + "actual_output": null, + "expected_output": "expected output", + "context": [ + "context1" + ], + "retrieval_context": [ + "retrieval1" + ], + "additional_metadata": { + "key": "value" + }, + "comments": "test comment", + "tools_called": [ + "tool1" + ], + "expected_tools": [ + "tool1" + ], + "source_file": "test.py", + "trace_id": "094121" + } + ], + "examples": [ + { + "input": "test input", + "actual_output": "test output", + "expected_output": "expected output", + "context": [ + "context1", + "context2" + ], + "retrieval_context": [ + "retrieval1" + ], + "additional_metadata": { + "key": "value" + }, + "tools_called": [ + "tool1" + ], + "expected_tools": [ + "tool1", + "tool2" + ], + "name": "test example", + "example_id": null, + "timestamp": "20241230_160117", + "trace_id": "123" + } + ] + } """ try: with open(file_path, "r") as file: From b20a058f51b1265caddffb55ccf7bfe9efffa303 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 16:59:30 -0800 Subject: [PATCH 07/10] Update GroundTruthExample UTs to reflect TraceID being added to the class --- tests/data/datasets/test_ground_truth.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/data/datasets/test_ground_truth.py b/tests/data/datasets/test_ground_truth.py index 58f58c52..bf486d18 100644 --- a/tests/data/datasets/test_ground_truth.py +++ b/tests/data/datasets/test_ground_truth.py @@ -56,6 +56,7 @@ def test_ground_truth_example_to_dict(): "tools_called": None, "expected_tools": None, "source_file": None, + "trace_id": None } assert example.to_dict() == expected_dict @@ -79,7 +80,8 @@ def test_ground_truth_example_str_representation(): "comments=None, " "tools_called=None, " "expected_tools=None, " - "source_file=None)" + "source_file=None, " + "trace_id=None)" ) assert str(example) == expected_str From bc372aba7d69562967e96b6d5882bcfe0779ac34 Mon Sep 17 00:00:00 2001 From: JCamyre Date: Mon, 30 Dec 2024 17:18:19 -0800 Subject: [PATCH 08/10] Add comment clarifying why we specify 'trace_id' as a str when reading csv's using pandas. --- judgeval/data/datasets/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/judgeval/data/datasets/dataset.py b/judgeval/data/datasets/dataset.py index 8279e43f..abaebbc4 100644 --- a/judgeval/data/datasets/dataset.py +++ b/judgeval/data/datasets/dataset.py @@ -252,6 +252,7 @@ def add_from_csv( "Please install pandas to use this method. 
'pip install pandas'" ) + # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior) df = pd.read_csv(file_path, dtype={'trace_id': str}) """ Expect the CSV to have headers From 532240019dde040132acf3ae6dcdee48899f0b01 Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Mon, 30 Dec 2024 17:56:54 -0800 Subject: [PATCH 09/10] Edit UT to pass GH requirements >:) --- tests/scorers/test_prompt_scorer.py | 56 +++++++++++++++-------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/tests/scorers/test_prompt_scorer.py b/tests/scorers/test_prompt_scorer.py index 4bdf3e89..8675d71e 100644 --- a/tests/scorers/test_prompt_scorer.py +++ b/tests/scorers/test_prompt_scorer.py @@ -27,6 +27,10 @@ def mock_model(): # Simple implementation of PromptScorer for testing class SampleScorer(PromptScorer): + def __init__(self, mock_model, *args, **kwargs): + super().__init__(*args, **kwargs) + self.model = mock_model + def build_measure_prompt(self, example: Example) -> List[dict]: return [ {"role": "system", "content": "Test system prompt"}, @@ -44,19 +48,19 @@ def success_check(self, **kwargs) -> bool: # Tests for PromptScorer class TestPromptScorer: - def test_init(self): - scorer = SampleScorer("test_scorer") + def test_init(self, mock_model): + scorer = SampleScorer(name="test_scorer", mock_model=mock_model) assert scorer.name == "test_scorer" assert scorer.threshold == 0.5 assert scorer.include_reason is True assert scorer.async_mode is True - def test_init_strict_mode(self): - scorer = SampleScorer("test_scorer", strict_mode=True) + def test_init_strict_mode(self, mock_model): + scorer = SampleScorer(name="test_scorer", mock_model=mock_model, strict_mode=True) assert scorer.threshold == 1 - def test_enforce_prompt_format(self): - scorer = SampleScorer("test_scorer") + def test_enforce_prompt_format(self, mock_model): + scorer = SampleScorer(name="test_scorer", mock_model=mock_model) prompt = [{"role": "system", "content": "Base prompt"}] schema = {"score": float, "reason": str} @@ -65,23 +69,21 @@ def test_enforce_prompt_format(self): assert '"score": (float)' in formatted[0]["content"] assert '"reason": (str)' in formatted[0]["content"] - def test_enforce_prompt_format_invalid_input(self): - scorer = SampleScorer("test_scorer") + def test_enforce_prompt_format_invalid_input(self, mock_model): + scorer = SampleScorer(name="test_scorer", mock_model=mock_model) with pytest.raises(TypeError): scorer.enforce_prompt_format("invalid", {}) @pytest.mark.asyncio async def test_a_score_example(self, example, mock_model): - scorer = SampleScorer("test_scorer") - scorer.model = mock_model + scorer = SampleScorer(name="test_scorer", mock_model=mock_model) result = await scorer.a_score_example(example, _show_indicator=False) assert result == 0.8 assert scorer.reason == "Test reason" def test_score_example_sync(self, example, mock_model): - scorer = SampleScorer("test_scorer", async_mode=False) - scorer.model = mock_model + scorer = SampleScorer(name="test_scorer", mock_model=mock_model, async_mode=False) result = scorer.score_example(example, _show_indicator=False) assert result == 0.8 @@ -102,18 +104,18 @@ def classifier_options(self): def test_classifier_init(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( - "test_classifier", - classifier_conversation, - classifier_options + name="test_classifier", + conversation=classifier_conversation, + options=classifier_options ) assert scorer.conversation == classifier_conversation assert 
scorer.options == classifier_options def test_build_measure_prompt(self, example, classifier_conversation, classifier_options): scorer = ClassifierScorer( - "test_classifier", - classifier_conversation, - classifier_options + name="test_classifier", + conversation=classifier_conversation, + options=classifier_options ) prompt = scorer.build_measure_prompt(example) @@ -121,9 +123,9 @@ def test_build_measure_prompt(self, example, classifier_conversation, classifier def test_process_response(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( - "test_classifier", - classifier_conversation, - classifier_options + name="test_classifier", + conversation=classifier_conversation, + options=classifier_options ) response = {"choice": "positive", "reason": "Test reason"} @@ -133,9 +135,9 @@ def test_process_response(self, classifier_conversation, classifier_options): def test_process_response_invalid_choice(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( - "test_classifier", - classifier_conversation, - classifier_options + name="test_classifier", + conversation=classifier_conversation, + options=classifier_options ) response = {"choice": "invalid", "reason": "Test reason"} @@ -144,9 +146,9 @@ def test_process_response_invalid_choice(self, classifier_conversation, classifi def test_success_check(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( - "test_classifier", - classifier_conversation, - classifier_options + name="test_classifier", + conversation=classifier_conversation, + options=classifier_options ) scorer.score = 1.0 From caeaa0b8051fa0468ea97516914cf0cb5d42d79f Mon Sep 17 00:00:00 2001 From: SecroLoL Date: Wed, 1 Jan 2025 19:14:35 -0800 Subject: [PATCH 10/10] Fix broken PromptScorer/Classifier Scorer UTs. Pydantic attribute issues are resolved and now UTs pass. 
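
Background, as a self-contained sketch of the Pydantic behavior this commit works around (the class below is illustrative and not part of judgeval): pydantic models reject assignment to attributes that were never declared as fields, so anything set after construction, such as the mocked model or the cached response, has to be declared on the class.

    from typing import Any, Optional
    from pydantic import BaseModel, Field

    class DemoScorer(BaseModel):
        name: str
        model: Any = Field(default=None)  # declared so it can be assigned after construction
        response: Optional[dict] = None   # likewise for fields set while scoring

    s = DemoScorer(name="demo")
    s.model = object()   # fine: the field is declared
    # s.undeclared = 1   # raises ValueError because the field was never declared
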
--- judgeval/scorers/prompt_scorer.py | 5 +++++ tests/scorers/test_prompt_scorer.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/judgeval/scorers/prompt_scorer.py b/judgeval/scorers/prompt_scorer.py index 7971500a..b1829afe 100644 --- a/judgeval/scorers/prompt_scorer.py +++ b/judgeval/scorers/prompt_scorer.py @@ -47,6 +47,10 @@ class PromptScorer(CustomScorer, BaseModel): score_type: str threshold: float = Field(default=0.5) using_native_model: bool = Field(default=True) + + # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD + response: Optional[dict] = None + result: Optional[float] = None def __init__( self, @@ -295,6 +299,7 @@ def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapp BaseModel.__init__( self, name=name, + slug=slug, score_type=name, conversation=conversation, options=options, diff --git a/tests/scorers/test_prompt_scorer.py b/tests/scorers/test_prompt_scorer.py index 8675d71e..e5e7e9ed 100644 --- a/tests/scorers/test_prompt_scorer.py +++ b/tests/scorers/test_prompt_scorer.py @@ -1,6 +1,7 @@ import pytest +from pydantic import BaseModel, Field from unittest.mock import MagicMock, AsyncMock -from typing import List, Dict +from typing import List, Dict, Any from judgeval.data import Example from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer @@ -27,6 +28,9 @@ def mock_model(): # Simple implementation of PromptScorer for testing class SampleScorer(PromptScorer): + + model: Any = Field(default=None) + def __init__(self, mock_model, *args, **kwargs): super().__init__(*args, **kwargs) self.model = mock_model @@ -105,6 +109,7 @@ def classifier_options(self): def test_classifier_init(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( name="test_classifier", + slug="test_classifier_slug", conversation=classifier_conversation, options=classifier_options ) @@ -114,6 +119,7 @@ def test_classifier_init(self, classifier_conversation, classifier_options): def test_build_measure_prompt(self, example, classifier_conversation, classifier_options): scorer = ClassifierScorer( name="test_classifier", + slug="test_classifier_slug", conversation=classifier_conversation, options=classifier_options ) @@ -124,6 +130,7 @@ def test_build_measure_prompt(self, example, classifier_conversation, classifier def test_process_response(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( name="test_classifier", + slug="test_classifier_slug", conversation=classifier_conversation, options=classifier_options ) @@ -136,6 +143,7 @@ def test_process_response(self, classifier_conversation, classifier_options): def test_process_response_invalid_choice(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( name="test_classifier", + slug="test_classifier_slug", conversation=classifier_conversation, options=classifier_options ) @@ -147,6 +155,7 @@ def test_process_response_invalid_choice(self, classifier_conversation, classifi def test_success_check(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( name="test_classifier", + slug="test_classifier_slug", conversation=classifier_conversation, options=classifier_options )
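
For reference, with the keyword-argument changes in PATCH 09/10, constructing a ClassifierScorer directly looks roughly like this hedged sketch (argument names follow the updated tests; the conversation and options values are placeholders, not fixtures from this repo):

    from judgeval.scorers.prompt_scorer import ClassifierScorer

    scorer = ClassifierScorer(
        name="test_classifier",
        slug="test_classifier_slug",
        conversation=[{"role": "system", "content": "Classify the response as positive or negative."}],
        options={"positive": 1.0, "negative": 0.0},
    )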