Skip to content

Commit 74b2c2e

Browse files
committed
Sequence to Trace Conversion
1 parent 471089a commit 74b2c2e

17 files changed

+76
-404
lines changed

src/demo/dataset.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

src/demo/demo.py

Lines changed: 0 additions & 46 deletions
This file was deleted.

src/demo/sequence_test.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,7 @@
1818
@tracer.observe(span_type="tool")
1919
def search_tavily(query):
2020
"""Fetch travel data using Tavily API."""
21-
API_KEY = os.getenv("TAVILY_API_KEY")
22-
client = TavilyClient(api_key=API_KEY)
23-
results = client.search(query, num_results=3)
24-
return results
21+
return "results"
2522

2623
# @judgment.observe(span_type="tool")
2724
def get_attractions(destination):
@@ -84,23 +81,22 @@ def create_travel_plan(destination, start_date, end_date, research_data):
8481
- Weather: {research_data['weather']}
8582
"""
8683

87-
response = client.chat.completions.create(
88-
model="gpt-4.1",
89-
messages=[
90-
{"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
91-
{"role": "user", "content": prompt}
92-
]
93-
).choices[0].message.content
84+
# response = client.chat.completions.create(
85+
# model="gpt-4o",
86+
# messages=[
87+
# {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
88+
# {"role": "user", "content": prompt}
89+
# ]
90+
# ).choices[0].message.content
9491

95-
return response
92+
return "Here is travel plan"
9693

9794
@tracer.observe(span_type="function")
9895
def generate_itinerary(destination, start_date, end_date):
9996
"""Main function to generate a travel itinerary."""
10097
research_data = research_destination(destination, start_date, end_date)
10198
res = create_travel_plan(destination, start_date, end_date, research_data)
10299

103-
from judgeval.data import Sequence
104100
from judgeval.scorers import ToolOrderScorer
105101
from judgeval import JudgmentClient
106102

@@ -146,6 +142,7 @@ def generate_itinerary(destination, start_date, end_date):
146142
)
147143

148144
judgment.assert_test(
145+
project_name="travel_agent_demo",
149146
examples=[example],
150147
scorers=[ToolOrderScorer(threshold=0.5)],
151148
model="gpt-4.1-mini",

src/e2etests/test_all_scorers.py

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
ClassifierScorer,
2424
)
2525

26-
from judgeval.data import Example, Sequence
26+
from judgeval.data import Example
2727

2828

2929
def test_ac_scorer(client: JudgmentClient):
@@ -533,31 +533,6 @@ def test_execution_order_scorer(client: JudgmentClient):
533533
override=True
534534
)
535535

536-
def test_derailment_scorer(client: JudgmentClient):
537-
PROJECT_NAME = "test-project"
538-
EVAL_RUN_NAME = "test-run-derailment"
539-
540-
airlines_example = Example(
541-
input="Which airlines fly to Paris?",
542-
actual_output="Air France, Delta, and American Airlines offer direct flights."
543-
)
544-
weather_example = Example(
545-
input="What is the weather like in Texas?",
546-
actual_output="It's sunny with a high of 75°F in Texas."
547-
)
548-
airline_sequence = Sequence(
549-
name="Flight Details",
550-
items=[airlines_example, weather_example],
551-
)
552-
results = client.run_sequence_evaluation(
553-
eval_run_name=EVAL_RUN_NAME,
554-
project_name=PROJECT_NAME,
555-
sequences=[airline_sequence],
556-
scorers=[DerailmentScorer(threshold=0.5)],
557-
model="gpt-4.1",
558-
log_results=True,
559-
override=True,
560-
)
561536
def test_json_scorer(client: JudgmentClient):
562537
"""Test JSON scorer functionality."""
563538
example1 = Example(

src/e2etests/test_dataset_operations.py

Lines changed: 1 addition & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import string
99

1010
from judgeval.judgment_client import JudgmentClient
11-
from judgeval.data import Example, Sequence
11+
from judgeval.data import Example
1212

1313
@pytest.fixture(scope="module", autouse=True)
1414
def setup_and_teardown_module(client: JudgmentClient):
@@ -38,26 +38,6 @@ def test_dataset(self, client: JudgmentClient, project_name: str):
3838

3939
client.delete_dataset(alias="test_dataset_5", project_name=project_name)
4040

41-
def test_dataset_with_sequence(self, client: JudgmentClient, project_name: str):
42-
"""Test dataset creation and manipulation with a sequence."""
43-
dataset = client.create_dataset()
44-
examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
45-
sequence = Sequence(
46-
name="test_sequence",
47-
items=examples
48-
)
49-
dataset.add_sequence(sequence)
50-
client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
51-
52-
dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
53-
assert dataset.sequences, "Failed to pull dataset"
54-
assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
55-
sequence = dataset.sequences[0]
56-
assert sequence.name == "test_sequence", "Sequence should have the correct name"
57-
assert len(sequence.items) == 3, "Sequence should have 3 items"
58-
59-
client.delete_dataset(alias="test_dataset_with_sequence", project_name=project_name)
60-
6141
def test_pull_all_project_dataset_stats(self, client: JudgmentClient, project_name: str):
6242
"""Test pulling statistics for all project datasets."""
6343
dataset = client.create_dataset()
@@ -132,51 +112,6 @@ def test_append_example_dataset(self, client: JudgmentClient, project_name: str)
132112
dataset = client.pull_dataset(alias="test_dataset_8", project_name=project_name)
133113
assert dataset, "Failed to pull dataset"
134114
assert len(dataset.examples) == 3, "Dataset should have 3 examples"
135-
136-
def test_append_sequence_dataset(self, client: JudgmentClient, project_name: str):
137-
"""Test dataset appending."""
138-
dataset = client.create_dataset()
139-
examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
140-
sequence = Sequence(
141-
name="test_sequence",
142-
items=examples
143-
)
144-
dataset.add_sequence(sequence)
145-
client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
146-
147-
dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
148-
assert dataset.sequences, "Failed to pull dataset"
149-
assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
150-
sequence = dataset.sequences[0]
151-
assert sequence.name == "test_sequence", "Sequence should have the correct name"
152-
assert len(sequence.items) == 3, "Sequence should have 3 items"
153-
examples2 = [Example(input="input 4", actual_output="output 4"), Example(input="input 5", actual_output="output 5")]
154-
sequence2 = Sequence(
155-
name="test_sequence2",
156-
items=examples2
157-
)
158-
159-
client.append_sequence_dataset(alias="test_dataset_with_sequence", sequences=[sequence2], project_name=project_name)
160-
161-
dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
162-
assert dataset.sequences, "Failed to pull dataset"
163-
assert len(dataset.sequences) == 2, "Dataset should have 2 sequences"
164-
165-
test_sequence = None
166-
test_sequence2 = None
167-
for seq in dataset.sequences:
168-
if seq.name == "test_sequence":
169-
test_sequence = seq
170-
elif seq.name == "test_sequence2":
171-
test_sequence2 = seq
172-
173-
# Verify first sequence
174-
assert test_sequence is not None, "Could not find 'test_sequence'"
175-
assert len(test_sequence.items) == 3, "Sequence 'test_sequence' should have 3 items"
176-
177-
# Verify second sequence
178-
assert test_sequence2 is not None, "Could not find 'test_sequence2'"
179-
assert len(test_sequence2.items) == 2, "Sequence 'test_sequence2' should have 2 items"
180115

181116
def test_export_jsonl(self, client: JudgmentClient, random_name: str, project_name: str):
182117
"""Test JSONL dataset export functionality."""

src/judgeval/common/tracer.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -660,11 +660,13 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
660660
"entries": [span.model_dump() for span in self.trace_spans],
661661
"evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
662662
"overwrite": overwrite,
663+
"offline_mode": self.tracer.offline_mode,
663664
"parent_trace_id": self.parent_trace_id,
664665
"parent_name": self.parent_name
665666
}
666667
# --- Log trace data before saving ---
667-
self.trace_manager_client.save_trace(trace_data)
668+
if not self.tracer.offline_mode:
669+
self.trace_manager_client.save_trace(trace_data)
668670

669671
# upload annotations
670672
# TODO: batch to the log endpoint
@@ -930,6 +932,7 @@ def __init__(
930932
s3_aws_access_key_id: Optional[str] = None,
931933
s3_aws_secret_access_key: Optional[str] = None,
932934
s3_region_name: Optional[str] = None,
935+
offline_mode: bool = False,
933936
deep_tracing: bool = True # Deep tracing is enabled by default
934937
):
935938
if not hasattr(self, 'initialized'):
@@ -970,6 +973,7 @@ def __init__(
970973
aws_secret_access_key=s3_aws_secret_access_key,
971974
region_name=s3_region_name
972975
)
976+
self.offline_mode: bool = offline_mode
973977
self.deep_tracing: bool = deep_tracing # NEW: Store deep tracing setting
974978

975979
elif hasattr(self, 'project_name') and self.project_name != project_name:

src/judgeval/constants.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,15 @@ def _missing_(cls, value):
4040
ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
4141
# API URLs
4242
JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
43-
JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
43+
JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
4444
JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
4545
JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
46-
JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
4746
JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
4847
JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
4948
JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
5049
JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
5150
JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
5251
JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
53-
JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
5452
JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
5553
JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
5654
JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"

src/judgeval/data/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from judgeval.data.custom_example import CustomExample
33
from judgeval.data.scorer_data import ScorerData, create_scorer_data
44
from judgeval.data.result import ScoringResult, generate_scoring_result
5-
from judgeval.data.sequence import Sequence
65
from judgeval.data.trace import Trace, TraceSpan
76

87

@@ -14,7 +13,6 @@
1413
"create_scorer_data",
1514
"ScoringResult",
1615
"generate_scoring_result",
17-
"Sequence",
1816
"Trace",
1917
"TraceSpan",
2018
]

src/judgeval/data/datasets/dataset.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,12 @@
77
from dataclasses import dataclass, field
88
from typing import List, Union, Literal
99

10-
from judgeval.data import Example, Sequence
10+
from judgeval.data import Example
1111
from judgeval.common.logger import debug, error, warning, info
1212

1313
@dataclass
1414
class EvalDataset:
1515
examples: List[Example]
16-
sequences: List[Sequence]
1716
_alias: Union[str, None] = field(default=None)
1817
_id: Union[str, None] = field(default=None)
1918
judgment_api_key: str = field(default="")
@@ -22,13 +21,11 @@ def __init__(self,
2221
judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
2322
organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
2423
examples: List[Example] = [],
25-
sequences: List[Sequence] = []
2624
):
2725
debug(f"Initializing EvalDataset with {len(examples)} examples")
2826
if not judgment_api_key:
2927
warning("No judgment_api_key provided")
3028
self.examples = examples
31-
self.sequences = sequences
3229
self._alias = None
3330
self._id = None
3431
self.judgment_api_key = judgment_api_key
@@ -223,10 +220,7 @@ def add_from_yaml(self, file_path: str) -> None:
223220
def add_example(self, e: Example) -> None:
224221
self.examples = self.examples + [e]
225222
# TODO if we need to add rank, then we need to do it here
226-
227-
def add_sequence(self, s: Sequence) -> None:
228-
self.sequences = self.sequences + [s]
229-
223+
230224
def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
231225
"""
232226
Saves the dataset as a file. Save only the examples.
@@ -313,7 +307,6 @@ def __str__(self):
313307
return (
314308
f"{self.__class__.__name__}("
315309
f"examples={self.examples}, "
316-
f"sequences={self.sequences}, "
317310
f"_alias={self._alias}, "
318311
f"_id={self._id}"
319312
f")"

0 commit comments

Comments
 (0)