
Add trace ID to datasets, update UTs accordingly #31

Merged: 11 commits merged on Jan 2, 2025
Changes from 7 commits
4 changes: 0 additions & 4 deletions judgeval/data/datasets/20241111_175859.csv

This file was deleted.

44 changes: 0 additions & 44 deletions judgeval/data/datasets/20241111_175859.json

This file was deleted.

75 changes: 70 additions & 5 deletions judgeval/data/datasets/dataset.py
@@ -158,7 +158,64 @@ def add_from_json(self, file_path: str) -> None:
Adds examples and ground truths from a JSON file.

The format of the JSON file is expected to be a dictionary with two keys: "examples" and "ground_truths".
The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.

The JSON file is expected to have the following format:
{
"ground_truths": [
Collaborator: What does "ground truths" mean in relation to the examples? I thought the examples have the ground truth (with the actual_output field).

Collaborator: Thanks

{
"input": "test input",
"actual_output": null,
"expected_output": "expected output",
"context": [
"context1"
],
"retrieval_context": [
"retrieval1"
],
"additional_metadata": {
"key": "value"
},
"comments": "test comment",
"tools_called": [
"tool1"
],
"expected_tools": [
"tool1"
],
"source_file": "test.py",
"trace_id": "094121"
}
],
"examples": [
{
"input": "test input",
"actual_output": "test output",
"expected_output": "expected output",
"context": [
"context1",
"context2"
],
"retrieval_context": [
"retrieval1"
],
"additional_metadata": {
"key": "value"
},
"tools_called": [
"tool1"
],
"expected_tools": [
"tool1",
"tool2"
],
"name": "test example",
"example_id": null,
"timestamp": "20241230_160117",
"trace_id": "123"
}
]
}
"""
try:
with open(file_path, "r") as file:
@@ -195,17 +252,21 @@ def add_from_csv(
"Please install pandas to use this method. 'pip install pandas'"
)

df = pd.read_csv(file_path)
df = pd.read_csv(file_path, dtype={'trace_id': str})
Collaborator: What else can dtype be? A trace_id or trace_file?

Collaborator: Pandas naturally reads numbers in data files as ints, not strings, which can lead to unexpected behavior.
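A minimal sketch of the behavior being avoided here; the column name trace_id matches this diff, while the sample data is made up:

```python
import io

import pandas as pd

csv_text = "input,trace_id\ntest input,094121\n"

# Without an explicit dtype, pandas infers an integer column,
# so the leading zero is silently dropped: 094121 -> 94121.
inferred = pd.read_csv(io.StringIO(csv_text))
print(inferred["trace_id"].iloc[0], inferred["trace_id"].dtype)   # 94121 int64

# Pinning the column to str keeps the identifier exactly as written.
as_str = pd.read_csv(io.StringIO(csv_text), dtype={"trace_id": str})
print(as_str["trace_id"].iloc[0], as_str["trace_id"].dtype)       # 094121 object
```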

"""
Expect the CSV to have headers

"input", "actual_output", "expected_output", "context", \
"retrieval_context", "additional_metadata", "tools_called", \
"expected_tools", "name", "comments", "source_file", "example"
"expected_tools", "name", "comments", "source_file", "example", \
"trace_id"

We want to collect the examples and ground truths separately which can
be determined by the "example" column. If the value is True, then it is an
example, otherwise it is a ground truth.

We also assume that if there are multiple retrieval contexts or contexts, they are separated by semicolons.
This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters.
"""
examples, ground_truths = [], []

@@ -219,8 +280,8 @@ def add_from_csv(
"additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
"tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
"expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
"trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
}

if row["example"]:
data["name"] = row["name"] if pd.notna(row["name"]) else None
# every Example has `input` and `actual_output` fields
@@ -230,6 +291,7 @@ def add_from_csv(
else:
raise ValueError("Every example must have an 'input' and 'actual_output' field.")
else:
# GroundTruthExample has `comments` and `source_file` fields
data["comments"] = row["comments"] if pd.notna(row["comments"]) else None
data["source_file"] = row["source_file"] if pd.notna(row["source_file"]) else None
# every GroundTruthExample has `input` field
@@ -281,7 +343,8 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s
writer.writerow([
"input", "actual_output", "expected_output", "context", \
"retrieval_context", "additional_metadata", "tools_called", \
"expected_tools", "name", "comments", "source_file", "example"
"expected_tools", "name", "comments", "source_file", "example", \
"trace_id"
])
for e in self.examples:
writer.writerow(
@@ -298,6 +361,7 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s
None, # Example does not have comments
None, # Example does not have source file
True, # Adding an Example
e.trace_id
]
)

@@ -316,6 +380,7 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s
g.comments,
g.source_file,
False, # Adding a GroundTruthExample, not an Example
g.trace_id
]
)
else:
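For context, a hedged usage sketch of the loaders touched in this file; the class and method names come from this diff, but the import path and the save_as keyword defaults are assumptions:

```python
from judgeval.data.datasets.dataset import EvalDataset  # import path assumed from this diff

dataset = EvalDataset()

# Rows with example=True become Examples, the rest become GroundTruthExamples;
# multiple context / retrieval_context values are semicolon-separated, and
# trace_id is now read back as a string.
dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv")

# The JSON loader expects the {"ground_truths": [...], "examples": [...]} layout
# shown in the docstring above.
dataset.add_from_json("tests/data/datasets/sample_data/dataset.json")

# Round-trip: the new "trace_id" column/field is written back out on save.
dataset.save_as("csv", dir_path="/tmp", save_name="dataset_with_trace_ids")
```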
7 changes: 5 additions & 2 deletions judgeval/data/datasets/ground_truth.py
@@ -19,6 +19,7 @@ class GroundTruthExample(BaseModel):
tools_called: Optional[List[str]] = None
expected_tools: Optional[List[str]] = None
source_file: Optional[str] = None
trace_id: Optional[str] = None

def to_dict(self):
return {
@@ -32,6 +33,7 @@ def to_dict(self):
"tools_called": self.tools_called,
"expected_tools": self.expected_tools,
"source_file": self.source_file,
"trace_id": self.trace_id,
}

def __str__(self):
@@ -46,6 +48,7 @@ def __str__(self):
f"comments={self.comments}, "
f"tools_called={self.tools_called}, "
f"expected_tools={self.expected_tools}, "
f"source_file={self.source_file}"
f"source_file={self.source_file}, "
f"trace_id={self.trace_id}"
f")"
)
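A small sanity-check sketch for the new field; the import path is assumed, and the other Optional fields default to None per this diff:

```python
from judgeval.data.datasets.ground_truth import GroundTruthExample  # path assumed

gt = GroundTruthExample(
    input="test input",
    expected_output="expected output",
    source_file="test.py",
    trace_id="094121",  # Optional[str], so leading zeros are preserved
)

assert gt.to_dict()["trace_id"] == "094121"
assert "trace_id=094121" in str(gt)  # exercised by the __str__ change above
```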
4 changes: 0 additions & 4 deletions judgeval/data/datasets/sample.csv

This file was deleted.

1 change: 1 addition & 0 deletions judgeval/data/example.py
@@ -56,6 +56,7 @@ def to_dict(self):
"name": self.name,
"example_id": self.example_id,
"timestamp": self.timestamp,
"trace_id": self.trace_id
}

def __str__(self):
3 changes: 3 additions & 0 deletions tests/data/datasets/sample_data/dataset.csv
@@ -0,0 +1,3 @@
input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example,trace_id
test input,test output,expected output,context1;context2,retrieval1,{'key': 'value'},tool1,tool1;tool2,test example,,,True,123
test input,,expected output,context1,retrieval1,{'key': 'value'},tool1,tool1,,test comment,test.py,False,094121
55 changes: 55 additions & 0 deletions tests/data/datasets/sample_data/dataset.json
@@ -0,0 +1,55 @@
{
"ground_truths": [
{
"input": "test input",
"actual_output": null,
Collaborator: Is the difference between ground_truths and examples that ground_truths don't have an actual_output (this is for the LLM system to produce)?

Collaborator: Yep.

"expected_output": "expected output",
"context": [
"context1"
],
"retrieval_context": [
"retrieval1"
],
"additional_metadata": {
"key": "value"
},
"comments": "test comment",
"tools_called": [
"tool1"
],
"expected_tools": [
"tool1"
],
"source_file": "test.py",
"trace_id": "094121"
}
],
"examples": [
{
"input": "test input",
"actual_output": "test output",
"expected_output": "expected output",
"context": [
"context1",
"context2"
],
"retrieval_context": [
"retrieval1"
],
"additional_metadata": {
"key": "value"
},
"tools_called": [
"tool1"
],
"expected_tools": [
"tool1",
"tool2"
],
"name": "test example",
"example_id": null,
"timestamp": "20241230_160117",
"trace_id": "123"
}
]
}
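To make the distinction from the review threads concrete, a hedged illustration; import paths and default-optional fields are assumed from this diff and the tests below:

```python
from judgeval.data.example import Example                            # paths assumed
from judgeval.data.datasets.ground_truth import GroundTruthExample

# A ground truth has no actual_output yet; the system under evaluation
# is expected to produce it. An Example records the observed output.
gt = GroundTruthExample(input="test input", expected_output="expected output",
                        trace_id="094121")
ex = Example(input="test input", actual_output="test output",
             expected_output="expected output", trace_id="123")

assert gt.to_dict()["actual_output"] is None
assert ex.to_dict()["actual_output"] == "test output"
```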
85 changes: 84 additions & 1 deletion tests/data/datasets/test_dataset.py
@@ -128,7 +128,8 @@ def test_add_from_csv(mock_read_csv, dataset):
'name': ['name1', None],
'comments': [None, 'comment2'],
'source_file': [None, 'file2'],
'example': [True, False]
'example': [True, False],
'trace_id': [None, '123']
})
mock_read_csv.return_value = mock_df

@@ -175,3 +176,85 @@ def test_str_representation(dataset, sample_example, sample_ground_truth):
assert "EvalDataset" in str_rep
assert "ground_truths" in str_rep
assert "examples" in str_rep

# new UTs for dataset UX testing

def test_load_from_json():
ex1 = Example(
input="test input",
actual_output="test output",
expected_output="expected output",
context=["context1", "context2"],
retrieval_context=["retrieval1"],
additional_metadata={"key": "value"},
tools_called=["tool1"],
expected_tools=["tool1", "tool2"],
name="test example",
trace_id="123"
)

gt1 = GroundTruthExample(
input="test input",
expected_output="expected output",
context=["context1"],
retrieval_context=["retrieval1"],
additional_metadata={"key": "value"},
tools_called=["tool1"],
expected_tools=["tool1"],
comments="test comment",
source_file="test.py",
trace_id="094121"
)

dataset = EvalDataset()

dataset.add_from_json("tests/data/datasets/sample_data/dataset.json")
assert dataset.ground_truths == [gt1]

# We can't do the same comparison as above because the timestamps are different
assert len(dataset.examples) == 1
loaded_example = dataset.examples[0]
assert loaded_example.input == ex1.input
assert loaded_example.actual_output == ex1.actual_output
assert loaded_example.expected_output == ex1.expected_output
assert loaded_example.context == ex1.context
assert loaded_example.retrieval_context == ex1.retrieval_context
assert loaded_example.additional_metadata == ex1.additional_metadata
assert loaded_example.tools_called == ex1.tools_called
assert loaded_example.expected_tools == ex1.expected_tools
assert loaded_example.name == ex1.name
assert loaded_example.trace_id == ex1.trace_id


def test_load_from_csv():
ex1 = Example(
input="test input",
actual_output="test output",
expected_output="expected output",
context=["context1", "context2"],
retrieval_context=["retrieval1"],
additional_metadata={"key": "value"},
tools_called=["tool1"],
expected_tools=["tool1", "tool2"],
name="test example",
trace_id="123"
)

gt1 = GroundTruthExample(
input="test input",
expected_output="expected output",
context=["context1"],
retrieval_context=["retrieval1"],
additional_metadata={"key": "value"},
tools_called=["tool1"],
expected_tools=["tool1"],
comments="test comment",
source_file="test.py",
trace_id="094121"
)

dataset = EvalDataset()

dataset.add_from_csv("tests/data/datasets/sample_data/dataset.csv")
assert dataset.ground_truths == [gt1]
assert dataset.examples == [ex1]