
Add trace ID to datasets, update UTs accordingly #31

Merged: 11 commits merged on Jan 2, 2025
Changes from 7 commits
4 changes: 0 additions & 4 deletions judgeval/data/datasets/20241111_175859.csv

This file was deleted.

44 changes: 0 additions & 44 deletions judgeval/data/datasets/20241111_175859.json

This file was deleted.

75 changes: 70 additions & 5 deletions judgeval/data/datasets/dataset.py
@@ -158,7 +158,64 @@ def add_from_json(self, file_path: str) -> None:
Adds examples and ground truths from a JSON file.

The format of the JSON file is expected to be a dictionary with two keys: "examples" and "ground_truths".
The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.

The JSON file is expected to have the following format:
{
"ground_truths": [
Collaborator: What does "ground truths" mean in relation to the examples? I thought the examples have the ground truth (with the actual_output field).

Collaborator: Thanks

{
"input": "test input",
"actual_output": null,
"expected_output": "expected output",
"context": [
"context1"
],
"retrieval_context": [
"retrieval1"
],
"additional_metadata": {
"key": "value"
},
"comments": "test comment",
"tools_called": [
"tool1"
],
"expected_tools": [
"tool1"
],
"source_file": "test.py",
"trace_id": "094121"
}
],
"examples": [
{
"input": "test input",
"actual_output": "test output",
"expected_output": "expected output",
"context": [
"context1",
"context2"
],
"retrieval_context": [
"retrieval1"
],
"additional_metadata": {
"key": "value"
},
"tools_called": [
"tool1"
],
"expected_tools": [
"tool1",
"tool2"
],
"name": "test example",
"example_id": null,
"timestamp": "20241230_160117",
"trace_id": "123"
}
]
}
"""
try:
with open(file_path, "r") as file:
@@ -195,17 +252,21 @@ def add_from_csv(
"Please install pandas to use this method. 'pip install pandas'"
)

df = pd.read_csv(file_path)
df = pd.read_csv(file_path, dtype={'trace_id': str})
Collaborator: What else can dtype be? A trace_id or trace_file?

Collaborator: Pandas naturally reads numbers in data files as ints, not strings, which can lead to unexpected behavior.
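A minimal sketch of the behavior being avoided here; the column name trace_id matches this diff, while the sample data is made up:

```python
import io

import pandas as pd

csv_text = "input,trace_id\ntest input,094121\n"

# Without an explicit dtype, pandas infers an integer column,
# so the leading zero is silently dropped: 094121 -> 94121.
inferred = pd.read_csv(io.StringIO(csv_text))
print(inferred["trace_id"].iloc[0], inferred["trace_id"].dtype)   # 94121 int64

# Pinning the column to str keeps the identifier exactly as written.
as_str = pd.read_csv(io.StringIO(csv_text), dtype={"trace_id": str})
print(as_str["trace_id"].iloc[0], as_str["trace_id"].dtype)       # 094121 object
```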

"""
Expect the CSV to have headers

"input", "actual_output", "expected_output", "context", \
"retrieval_context", "additional_metadata", "tools_called", \
"expected_tools", "name", "comments", "source_file", "example"
"expected_tools", "name", "comments", "source_file", "example", \
"trace_id"

We want to collect the examples and ground truths separately which can
be determined by the "example" column. If the value is True, then it is an
example, otherwise it is a ground truth.

We also assume that if there are multiple retrieval contexts or contexts, they are separated by semicolons.
This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters.
"""
examples, ground_truths = [], []

@@ -219,8 +280,8 @@ def add_from_csv(
"additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
"tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
"expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
"trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
}

if row["example"]:
data["name"] = row["name"] if pd.notna(row["name"]) else None
# every Example has `input` and `actual_output` fields
@@ -230,6 +291,7 @@ def add_from_csv(
else:
raise ValueError("Every example must have an 'input' and 'actual_output' field.")
else:
# GroundTruthExample has `comments` and `source_file` fields
data["comments"] = row["comments"] if pd.notna(row["comments"]) else None
data["source_file"] = row["source_file"] if pd.notna(row["source_file"]) else None
# every GroundTruthExample has `input` field
@@ -281,7 +343,8 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s
writer.writerow([
"input", "actual_output", "expected_output", "context", \
"retrieval_context", "additional_metadata", "tools_called", \
"expected_tools", "name", "comments", "source_file", "example"
"expected_tools", "name", "comments", "source_file", "example", \
"trace_id"
])
for e in self.examples:
writer.writerow(
@@ -298,6 +361,7 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s
None, # Example does not have comments
None, # Example does not have source file
True, # Adding an Example
e.trace_id
]
)

@@ -316,6 +380,7 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s
g.comments,
g.source_file,
False, # Adding a GroundTruthExample, not an Example
g.trace_id
]
)
else:
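For context, a hedged usage sketch of the loaders touched in this file; the class and method names come from this diff, but the import path and the save_as keyword defaults are assumptions:

```python
from judgeval.data.datasets.dataset import EvalDataset  # import path assumed from this diff

dataset = EvalDataset()

# Rows with example=True become Examples, the rest become GroundTruthExamples;
# multiple context / retrieval_context values are semicolon-separated, and
# trace_id is now read back as a string.
dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv")

# The JSON loader expects the {"ground_truths": [...], "examples": [...]} layout
# shown in the docstring above.
dataset.add_from_json("tests/data/datasets/sample_data/dataset.json")

# Round-trip: the new "trace_id" column/field is written back out on save.
dataset.save_as("csv", dir_path="/tmp", save_name="dataset_with_trace_ids")
```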
7 changes: 5 additions & 2 deletions judgeval/data/datasets/ground_truth.py
@@ -19,6 +19,7 @@ class GroundTruthExample(BaseModel):
tools_called: Optional[List[str]] = None
expected_tools: Optional[List[str]] = None
source_file: Optional[str] = None
trace_id: Optional[str] = None

def to_dict(self):
return {
@@ -32,6 +33,7 @@ def to_dict(self):
"tools_called": self.tools_called,
"expected_tools": self.expected_tools,
"source_file": self.source_file,
"trace_id": self.trace_id,
}

def __str__(self):
@@ -46,6 +48,7 @@ def __str__(self):
f"comments={self.comments}, "
f"tools_called={self.tools_called}, "
f"expected_tools={self.expected_tools}, "
f"source_file={self.source_file}"
f"source_file={self.source_file}, "
f"trace_id={self.trace_id}"
f")"
)
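A small sanity-check sketch for the new field; the import path is assumed, and the other Optional fields default to None per this diff:

```python
from judgeval.data.datasets.ground_truth import GroundTruthExample  # path assumed

gt = GroundTruthExample(
    input="test input",
    expected_output="expected output",
    source_file="test.py",
    trace_id="094121",  # Optional[str], so leading zeros are preserved
)

assert gt.to_dict()["trace_id"] == "094121"
assert "trace_id=094121" in str(gt)  # exercised by the __str__ change above
```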
4 changes: 0 additions & 4 deletions judgeval/data/datasets/sample.csv

This file was deleted.

1 change: 1 addition & 0 deletions judgeval/data/example.py
@@ -56,6 +56,7 @@ def to_dict(self):
"name": self.name,
"example_id": self.example_id,
"timestamp": self.timestamp,
"trace_id": self.trace_id
}

def __str__(self):
3 changes: 3 additions & 0 deletions tests/data/datasets/sample_data/dataset.csv
@@ -0,0 +1,3 @@
input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example,trace_id
test input,test output,expected output,context1;context2,retrieval1,{'key': 'value'},tool1,tool1;tool2,test example,,,True,123
test input,,expected output,context1,retrieval1,{'key': 'value'},tool1,tool1,,test comment,test.py,False,094121
55 changes: 55 additions & 0 deletions tests/data/datasets/sample_data/dataset.json
@@ -0,0 +1,55 @@
{
"ground_truths": [
{
"input": "test input",
"actual_output": null,
Collaborator: Is the difference between ground_truths and examples that ground_truths don't have an actual_output (this is for the LLM system to produce)?

Collaborator: Yep.

"expected_output": "expected output",
"context": [
"context1"
],
"retrieval_context": [
"retrieval1"
],
"additional_metadata": {
"key": "value"
},
"comments": "test comment",
"tools_called": [
"tool1"
],
"expected_tools": [
"tool1"
],
"source_file": "test.py",
"trace_id": "094121"
}
],
"examples": [
{
"input": "test input",
"actual_output": "test output",
"expected_output": "expected output",
"context": [
"context1",
"context2"
],
"retrieval_context": [
"retrieval1"
],
"additional_metadata": {
"key": "value"
},
"tools_called": [
"tool1"
],
"expected_tools": [
"tool1",
"tool2"
],
"name": "test example",
"example_id": null,
"timestamp": "20241230_160117",
"trace_id": "123"
}
]
}
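To make the distinction from the review threads concrete, a hedged illustration; import paths and default-optional fields are assumed from this diff and the tests below:

```python
from judgeval.data.example import Example                            # paths assumed
from judgeval.data.datasets.ground_truth import GroundTruthExample

# A ground truth has no actual_output yet; the system under evaluation
# is expected to produce it. An Example records the observed output.
gt = GroundTruthExample(input="test input", expected_output="expected output",
                        trace_id="094121")
ex = Example(input="test input", actual_output="test output",
             expected_output="expected output", trace_id="123")

assert gt.to_dict()["actual_output"] is None
assert ex.to_dict()["actual_output"] == "test output"
```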
85 changes: 84 additions & 1 deletion tests/data/datasets/test_dataset.py
@@ -128,7 +128,8 @@ def test_add_from_csv(mock_read_csv, dataset):
'name': ['name1', None],
'comments': [None, 'comment2'],
'source_file': [None, 'file2'],
'example': [True, False]
'example': [True, False],
'trace_id': [None, '123']
})
mock_read_csv.return_value = mock_df

@@ -175,3 +176,85 @@ def test_str_representation(dataset, sample_example, sample_ground_truth):
assert "EvalDataset" in str_rep
assert "ground_truths" in str_rep
assert "examples" in str_rep

# new UTs for dataset UX testing

def test_load_from_json():
ex1 = Example(
input="test input",
actual_output="test output",
expected_output="expected output",
context=["context1", "context2"],
retrieval_context=["retrieval1"],
additional_metadata={"key": "value"},
tools_called=["tool1"],
expected_tools=["tool1", "tool2"],
name="test example",
trace_id="123"
)

gt1 = GroundTruthExample(
input="test input",
expected_output="expected output",
context=["context1"],
retrieval_context=["retrieval1"],
additional_metadata={"key": "value"},
tools_called=["tool1"],
expected_tools=["tool1"],
comments="test comment",
source_file="test.py",
trace_id="094121"
)

dataset = EvalDataset()

dataset.add_from_json("tests/data/datasets/sample_data/dataset.json")
assert dataset.ground_truths == [gt1]

# We can't do the same comparison as above because the timestamps are different
assert len(dataset.examples) == 1
loaded_example = dataset.examples[0]
assert loaded_example.input == ex1.input
assert loaded_example.actual_output == ex1.actual_output
assert loaded_example.expected_output == ex1.expected_output
assert loaded_example.context == ex1.context
assert loaded_example.retrieval_context == ex1.retrieval_context
assert loaded_example.additional_metadata == ex1.additional_metadata
assert loaded_example.tools_called == ex1.tools_called
assert loaded_example.expected_tools == ex1.expected_tools
assert loaded_example.name == ex1.name
assert loaded_example.trace_id == ex1.trace_id


def test_load_from_csv():
ex1 = Example(
input="test input",
actual_output="test output",
expected_output="expected output",
context=["context1", "context2"],
retrieval_context=["retrieval1"],
additional_metadata={"key": "value"},
tools_called=["tool1"],
expected_tools=["tool1", "tool2"],
name="test example",
trace_id="123"
)

gt1 = GroundTruthExample(
input="test input",
expected_output="expected output",
context=["context1"],
retrieval_context=["retrieval1"],
additional_metadata={"key": "value"},
tools_called=["tool1"],
expected_tools=["tool1"],
comments="test comment",
source_file="test.py",
trace_id="094121"
)

dataset = EvalDataset()

dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv")
assert dataset.ground_truths == [gt1]
assert dataset.examples == [ex1]