Skip to content

Trace Datasets #291

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/demo/simple_trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ async def gather_information(city: str):
weather = await get_weather(city)
attractions = await get_attractions(city)

# judgment.async_evaluate(
# scorers=[AnswerRelevancyScorer(threshold=0.5)],
# input="What is the weather in Paris?",
# actual_output=weather,
# model="gpt-4",
# )
judgment.async_evaluate(
scorers=[AnswerRelevancyScorer(threshold=0.5)],
input="What is the weather in Paris?",
actual_output=weather,
model="gpt-4",
)

return {
"weather": weather,
Expand Down
12 changes: 6 additions & 6 deletions src/e2etests/test_judgee_traces_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ async def test_trace_save_increment(client, cleanup_traces):
"project_name": "test_project",
"trace_id": trace_id,
"created_at": datetime.fromtimestamp(timestamp).isoformat(),
"entries": [
"trace_spans": [
{
"timestamp": datetime.fromtimestamp(timestamp).isoformat(),
"type": "span",
Expand Down Expand Up @@ -272,7 +272,7 @@ async def save_trace(index):
"project_name": "test_project",
"trace_id": trace_id,
"created_at": datetime.fromtimestamp(timestamp).isoformat(),
"entries": [
"trace_spans": [
{
"timestamp": datetime.fromtimestamp(timestamp).isoformat(),
"type": "span",
Expand Down Expand Up @@ -354,7 +354,7 @@ async def test_failed_trace_counting(client):
"project_name": "test_project",
"trace_id": str(uuid4()),
"created_at": str(timestamp), # Convert to string
# Missing entries, which should cause a validation error
# Missing trace_spans, which should cause a validation error
"duration": 0.1,
"token_counts": {"total": 10},
"empty_save": False,
Expand Down Expand Up @@ -463,7 +463,7 @@ async def test_burst_request_handling(client):
"project_name": "test_project",
"trace_id": trace_id,
"created_at": datetime.fromtimestamp(timestamp).isoformat(),
"entries": [
"trace_spans": [
{
"timestamp": datetime.fromtimestamp(timestamp).isoformat(),
"type": "span",
Expand All @@ -488,8 +488,8 @@ async def save_trace():
# Create a unique trace ID for each request
local_trace_data = trace_data.copy()
local_trace_data["trace_id"] = str(uuid4())
local_trace_data["entries"][0]["span_id"] = str(uuid4())
local_trace_data["entries"][0]["trace_id"] = local_trace_data["trace_id"]
local_trace_data["trace_spans"][0]["span_id"] = str(uuid4())
local_trace_data["trace_spans"][0]["trace_id"] = local_trace_data["trace_id"]

response = await client.post(
f"{SERVER_URL}/traces/save/",
Expand Down
11 changes: 0 additions & 11 deletions src/e2etests/test_tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,17 +590,6 @@ async def run_async_stream(prompt):
return result

# --- END NEW TESTS ---

# Helper function to print trace hierarchy
def print_trace_hierarchy(entries):
    """Print a hierarchical representation of the trace for debugging.

    Entries are grouped by their ``parent_span_id`` and written to stdout as
    an indented tree: one line per entry, each child nested two spaces under
    its parent. Entries keep their input order within each level.

    Args:
        entries: Iterable of dict entries. Every entry must provide a
            ``"parent_span_id"`` key. ``"span_id"`` (used to attach children
            to their parent) and ``"type"`` (used as the line label) are read
            when present.

    Returns:
        None. The hierarchy is printed, not returned.

    Raises:
        KeyError: If an entry has no ``"parent_span_id"`` key.
    """
    from collections import defaultdict

    # Materialize once: the input is walked more than once below.
    entries = list(entries)

    # First, organize entries by parent_span_id
    entries_by_parent = defaultdict(list)
    for entry in entries:
        entries_by_parent[entry["parent_span_id"]].append(entry)

    known_ids = {entry.get("span_id") for entry in entries}
    printed = set()  # object ids already emitted; guards against cycles

    def _print_subtree(entry, depth):
        """Print one entry and, recursively, the entries parented to it."""
        if id(entry) in printed:
            return
        printed.add(id(entry))
        span_id = entry.get("span_id")
        print(f"{'  ' * depth}{entry.get('type', 'entry')} {span_id}")
        if span_id is None:
            # The None bucket holds root entries, not children of this one.
            return
        for child in entries_by_parent.get(span_id, ()):
            _print_subtree(child, depth + 1)

    # Roots: no parent, or a parent that is not part of this trace.
    for entry in entries:
        parent_id = entry["parent_span_id"]
        if parent_id is None or parent_id not in known_ids:
            _print_subtree(entry, 0)

    # Anything still unprinted sits on a parent cycle; show it rather than
    # silently dropping it from the debug output.
    for entry in entries:
        _print_subtree(entry, 0)

# --- NEW COMPREHENSIVE TOKEN COUNTING TEST ---

Expand Down
2 changes: 1 addition & 1 deletion src/judgeval/common/tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
"project_name": self.project_name,
"created_at": datetime.utcfromtimestamp(self.start_time).isoformat(),
"duration": total_duration,
"entries": [span.model_dump() for span in self.trace_spans],
"trace_spans": [span.model_dump() for span in self.trace_spans],
"evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
"overwrite": overwrite,
"offline_mode": self.tracer.offline_mode,
Expand Down
18 changes: 12 additions & 6 deletions src/judgeval/data/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,29 @@
import os
import yaml
from dataclasses import dataclass, field
from typing import List, Union, Literal
from typing import List, Union, Literal, Optional

from judgeval.data import Example
from judgeval.data import Example, Trace
from judgeval.common.logger import debug, error, warning, info

@dataclass
class EvalDataset:
examples: List[Example]
traces: List[Trace]
_alias: Union[str, None] = field(default=None)
_id: Union[str, None] = field(default=None)
judgment_api_key: str = field(default="")
organization_id: str = field(default="")
def __init__(self,
judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
examples: List[Example] = [],
examples: Optional[List[Example]] = None,
traces: Optional[List[Trace]] = None
):
debug(f"Initializing EvalDataset with {len(examples)} examples")
if not judgment_api_key:
warning("No judgment_api_key provided")
self.examples = examples
self.examples = examples or []
self.traces = traces or []
self._alias = None
self._id = None
self.judgment_api_key = judgment_api_key
Expand Down Expand Up @@ -218,8 +220,11 @@ def add_from_yaml(self, file_path: str) -> None:
self.add_example(e)

def add_example(self, e: Example) -> None:
self.examples = self.examples + [e]
self.examples.append(e)
# TODO if we need to add rank, then we need to do it here

def add_trace(self, t: Trace) -> None:
self.traces.append(t)

def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
"""
Expand Down Expand Up @@ -307,6 +312,7 @@ def __str__(self):
return (
f"{self.__class__.__name__}("
f"examples={self.examples}, "
f"traces={self.traces}, "
f"_alias={self._alias}, "
f"_id={self._id}"
f")"
Expand Down
4 changes: 3 additions & 1 deletion src/judgeval/data/datasets/eval_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
JUDGMENT_DATASETS_INSERT_API_URL,
JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
)
from judgeval.data import Example
from judgeval.data import Example, Trace
from judgeval.data.datasets import EvalDataset


Expand Down Expand Up @@ -58,6 +58,7 @@ def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: O
"dataset_alias": alias,
"project_name": project_name,
"examples": [e.to_dict() for e in dataset.examples],
"traces": [t.model_dump() for t in dataset.traces],
"overwrite": overwrite,
}
try:
Expand Down Expand Up @@ -202,6 +203,7 @@ def pull(self, alias: str, project_name: str) -> EvalDataset:
info(f"Successfully pulled dataset with alias '{alias}'")
payload = response.json()
dataset.examples = [Example(**e) for e in payload.get("examples", [])]
dataset.traces = [Trace(**t) for t in payload.get("traces", [])]
dataset._alias = payload.get("alias")
dataset._id = payload.get("id")
progress.update(
Expand Down
2 changes: 1 addition & 1 deletion src/judgeval/data/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ class Trace(BaseModel):
name: str
created_at: str
duration: float
entries: List[TraceSpan]
trace_spans: List[TraceSpan]
overwrite: bool = False
offline_mode: bool = False
rules: Optional[Dict[str, Any]] = None
Expand Down
2 changes: 1 addition & 1 deletion src/judgeval/run_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
for i, trace in enumerate(tracer.traces):
# We set the root-level trace span with the expected tools of the Trace
trace = Trace(**trace)
trace.entries[0].expected_tools = examples[i].expected_tools
trace.trace_spans[0].expected_tools = examples[i].expected_tools
new_traces.append(trace)
trace_run.traces = new_traces
tracer.traces = []
Expand Down
2 changes: 1 addition & 1 deletion src/tests/common/test_tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def test_trace_client_span(trace_client):
assert len(trace_client.trace_spans) == initial_spans_count + 1

def test_trace_client_nested_spans(trace_client):
"""Test nested spans maintain proper depth recorded in entries"""
"""Test nested spans maintain proper depth recorded in trace_spans"""
root_span_id = current_span_var.get() # From the fixture

with trace_client.span("outer") as outer_span:
Expand Down
Loading