Skip to content

Commit ca23e6d

Browse files
authored
Merge pull request #31 from JudgmentLabs/add_trace_to_datasets
Add trace ID to datasets, update UTs accordingly
2 parents 91007b1 + caeaa0b commit ca23e6d

File tree

12 files changed

+266
-89
lines changed

12 files changed

+266
-89
lines changed

judgeval/data/datasets/20241111_175859.csv

Lines changed: 0 additions & 4 deletions
This file was deleted.

judgeval/data/datasets/20241111_175859.json

Lines changed: 0 additions & 44 deletions
This file was deleted.

judgeval/data/datasets/dataset.py

Lines changed: 71 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,64 @@ def add_from_json(self, file_path: str) -> None:
158158
Adds examples and ground truths from a JSON file.
159159
160160
The format of the JSON file is expected to be a dictionary with two keys: "examples" and "ground_truths".
161-
The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
161+
The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
162+
163+
The JSON file is expected to have the following format:
164+
{
165+
"ground_truths": [
166+
{
167+
"input": "test input",
168+
"actual_output": null,
169+
"expected_output": "expected output",
170+
"context": [
171+
"context1"
172+
],
173+
"retrieval_context": [
174+
"retrieval1"
175+
],
176+
"additional_metadata": {
177+
"key": "value"
178+
},
179+
"comments": "test comment",
180+
"tools_called": [
181+
"tool1"
182+
],
183+
"expected_tools": [
184+
"tool1"
185+
],
186+
"source_file": "test.py",
187+
"trace_id": "094121"
188+
}
189+
],
190+
"examples": [
191+
{
192+
"input": "test input",
193+
"actual_output": "test output",
194+
"expected_output": "expected output",
195+
"context": [
196+
"context1",
197+
"context2"
198+
],
199+
"retrieval_context": [
200+
"retrieval1"
201+
],
202+
"additional_metadata": {
203+
"key": "value"
204+
},
205+
"tools_called": [
206+
"tool1"
207+
],
208+
"expected_tools": [
209+
"tool1",
210+
"tool2"
211+
],
212+
"name": "test example",
213+
"example_id": null,
214+
"timestamp": "20241230_160117",
215+
"trace_id": "123"
216+
}
217+
]
218+
}
162219
"""
163220
try:
164221
with open(file_path, "r") as file:
@@ -195,17 +252,22 @@ def add_from_csv(
195252
"Please install pandas to use this method. 'pip install pandas'"
196253
)
197254

198-
df = pd.read_csv(file_path)
255+
# pandas infers numeric-looking columns as numbers by default, which would drop leading zeros from trace IDs (e.g. "094121" would become 94121), so force `trace_id` to be read as a string
256+
df = pd.read_csv(file_path, dtype={'trace_id': str})
199257
"""
200258
Expect the CSV to have headers
201259
202260
"input", "actual_output", "expected_output", "context", \
203261
"retrieval_context", "additional_metadata", "tools_called", \
204-
"expected_tools", "name", "comments", "source_file", "example"
262+
"expected_tools", "name", "comments", "source_file", "example", \
263+
"trace_id"
205264
206265
We want to collect the examples and ground truths separately which can
207266
be determined by the "example" column. If the value is True, then it is an
208267
example, otherwise it is a ground truth.
268+
269+
We also assume that when a row holds multiple contexts or retrieval contexts, they are separated by semicolons.
270+
This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters.
209271
"""
210272
examples, ground_truths = [], []
211273

@@ -219,8 +281,8 @@ def add_from_csv(
219281
"additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
220282
"tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
221283
"expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
284+
"trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
222285
}
223-
224286
if row["example"]:
225287
data["name"] = row["name"] if pd.notna(row["name"]) else None
226288
# every Example has `input` and `actual_output` fields
@@ -230,6 +292,7 @@ def add_from_csv(
230292
else:
231293
raise ValueError("Every example must have an 'input' and 'actual_output' field.")
232294
else:
295+
# GroundTruthExample has `comments` and `source_file` fields
233296
data["comments"] = row["comments"] if pd.notna(row["comments"]) else None
234297
data["source_file"] = row["source_file"] if pd.notna(row["source_file"]) else None
235298
# every GroundTruthExample has `input` field
@@ -281,7 +344,8 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s
281344
writer.writerow([
282345
"input", "actual_output", "expected_output", "context", \
283346
"retrieval_context", "additional_metadata", "tools_called", \
284-
"expected_tools", "name", "comments", "source_file", "example"
347+
"expected_tools", "name", "comments", "source_file", "example", \
348+
"trace_id"
285349
])
286350
for e in self.examples:
287351
writer.writerow(
@@ -298,6 +362,7 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s
298362
None, # Example does not have comments
299363
None, # Example does not have source file
300364
True, # Adding an Example
365+
e.trace_id
301366
]
302367
)
303368

@@ -316,6 +381,7 @@ def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: s
316381
g.comments,
317382
g.source_file,
318383
False, # Adding a GroundTruthExample, not an Example
384+
g.trace_id
319385
]
320386
)
321387
else:

judgeval/data/datasets/ground_truth.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ class GroundTruthExample(BaseModel):
1919
tools_called: Optional[List[str]] = None
2020
expected_tools: Optional[List[str]] = None
2121
source_file: Optional[str] = None
22+
trace_id: Optional[str] = None
2223

2324
def to_dict(self):
2425
return {
@@ -32,6 +33,7 @@ def to_dict(self):
3233
"tools_called": self.tools_called,
3334
"expected_tools": self.expected_tools,
3435
"source_file": self.source_file,
36+
"trace_id": self.trace_id,
3537
}
3638

3739
def __str__(self):
@@ -46,6 +48,7 @@ def __str__(self):
4648
f"comments={self.comments}, "
4749
f"tools_called={self.tools_called}, "
4850
f"expected_tools={self.expected_tools}, "
49-
f"source_file={self.source_file}"
51+
f"source_file={self.source_file}, "
52+
f"trace_id={self.trace_id}"
5053
f")"
51-
)
54+
)

judgeval/data/datasets/sample.csv

Lines changed: 0 additions & 4 deletions
This file was deleted.

judgeval/data/example.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def to_dict(self):
5656
"name": self.name,
5757
"example_id": self.example_id,
5858
"timestamp": self.timestamp,
59+
"trace_id": self.trace_id
5960
}
6061

6162
def __str__(self):

judgeval/scorers/prompt_scorer.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ class PromptScorer(CustomScorer, BaseModel):
4747
score_type: str
4848
threshold: float = Field(default=0.5)
4949
using_native_model: bool = Field(default=True)
50+
51+
# DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
52+
response: Optional[dict] = None
53+
result: Optional[float] = None
5054

5155
def __init__(
5256
self,
@@ -295,6 +299,7 @@ def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapp
295299
BaseModel.__init__(
296300
self,
297301
name=name,
302+
slug=slug,
298303
score_type=name,
299304
conversation=conversation,
300305
options=options,
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
input,actual_output,expected_output,context,retrieval_context,additional_metadata,tools_called,expected_tools,name,comments,source_file,example,trace_id
2+
test input,test output,expected output,context1;context2,retrieval1,{'key': 'value'},tool1,tool1;tool2,test example,,,True,123
3+
test input,,expected output,context1,retrieval1,{'key': 'value'},tool1,tool1,,test comment,test.py,False,094121
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"ground_truths": [
3+
{
4+
"input": "test input",
5+
"actual_output": null,
6+
"expected_output": "expected output",
7+
"context": [
8+
"context1"
9+
],
10+
"retrieval_context": [
11+
"retrieval1"
12+
],
13+
"additional_metadata": {
14+
"key": "value"
15+
},
16+
"comments": "test comment",
17+
"tools_called": [
18+
"tool1"
19+
],
20+
"expected_tools": [
21+
"tool1"
22+
],
23+
"source_file": "test.py",
24+
"trace_id": "094121"
25+
}
26+
],
27+
"examples": [
28+
{
29+
"input": "test input",
30+
"actual_output": "test output",
31+
"expected_output": "expected output",
32+
"context": [
33+
"context1",
34+
"context2"
35+
],
36+
"retrieval_context": [
37+
"retrieval1"
38+
],
39+
"additional_metadata": {
40+
"key": "value"
41+
},
42+
"tools_called": [
43+
"tool1"
44+
],
45+
"expected_tools": [
46+
"tool1",
47+
"tool2"
48+
],
49+
"name": "test example",
50+
"example_id": null,
51+
"timestamp": "20241230_160117",
52+
"trace_id": "123"
53+
}
54+
]
55+
}

0 commit comments

Comments
 (0)