Skip to content

Commit 186a33b

Browse files
alanzhang25 and JCamyre authored
Sequence to Trace Conversion (#254)
* Sequence to Trace Conversion
* trace save
* comment out;
* pydantic
* updates

---------

Co-authored-by: Joseph S Camyre <68767176+JCamyre@users.noreply.github.com>
1 parent c94df26 commit 186a33b

19 files changed

Lines changed: 113 additions & 424 deletions

src/demo/dataset.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

src/demo/demo.py

Lines changed: 0 additions & 46 deletions
This file was deleted.

src/demo/sequence_test.py

Lines changed: 39 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -15,40 +15,50 @@
1515
tracer = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo")
1616

1717

18-
@tracer.observe(span_type="tool")
18+
# @tracer.observe(span_type="tool")
1919
def search_tavily(query):
2020
"""Fetch travel data using Tavily API."""
21-
API_KEY = os.getenv("TAVILY_API_KEY")
22-
client = TavilyClient(api_key=API_KEY)
23-
results = client.search(query, num_results=3)
24-
return results
21+
# API_KEY = os.getenv("TAVILY_API_KEY")
22+
# client = TavilyClient(api_key=API_KEY)
23+
# results = client.search(query, num_results=3)
24+
# return results
25+
return "The weather in Tokyo is sunny with a high of 75°F."
2526

26-
# @judgment.observe(span_type="tool")
27+
@tracer.observe(span_type="tool")
2728
def get_attractions(destination):
2829
"""Search for top attractions in the destination."""
2930
prompt = f"Best tourist attractions in {destination}"
3031
attractions_search = search_tavily(prompt)
3132
return attractions_search
3233

33-
# @judgment.observe(span_type="tool")
34+
@tracer.observe(span_type="tool")
3435
def get_hotels(destination):
3536
"""Search for hotels in the destination."""
3637
prompt = f"Best hotels in {destination}"
3738
hotels_search = search_tavily(prompt)
3839
return hotels_search
3940

40-
# @judgment.observe(span_type="tool")
41+
@tracer.observe(span_type="tool")
4142
def get_flights(destination):
4243
"""Search for flights to the destination."""
4344
prompt = f"Flights to {destination} from major cities"
4445
flights_search = search_tavily(prompt)
4546
return flights_search
4647

47-
# @judgment.observe(span_type="tool")
48+
@tracer.observe(span_type="tool")
4849
def get_weather(destination, start_date, end_date):
4950
"""Search for weather information."""
5051
prompt = f"Weather forecast for {destination} from {start_date} to {end_date}"
5152
weather_search = search_tavily(prompt)
53+
example = Example(
54+
input="What is the weather in Tokyo?",
55+
actual_output=weather_search
56+
)
57+
tracer.async_evaluate(
58+
scorers=[AnswerRelevancyScorer(threshold=0.5)],
59+
example=example,
60+
model="gpt-4o-mini",
61+
)
5262
return weather_search
5363

5464
def research_destination(destination, start_date, end_date):
@@ -84,23 +94,22 @@ def create_travel_plan(destination, start_date, end_date, research_data):
8494
- Weather: {research_data['weather']}
8595
"""
8696

87-
response = client.chat.completions.create(
88-
model="gpt-4.1",
89-
messages=[
90-
{"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
91-
{"role": "user", "content": prompt}
92-
]
93-
).choices[0].message.content
97+
# response = client.chat.completions.create(
98+
# model="gpt-4o",
99+
# messages=[
100+
# {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
101+
# {"role": "user", "content": prompt}
102+
# ]
103+
# ).choices[0].message.content
94104

95-
return response
105+
return "Here is travel plan"
96106

97107
@tracer.observe(span_type="function")
98108
def generate_itinerary(destination, start_date, end_date):
99109
"""Main function to generate a travel itinerary."""
100110
research_data = research_destination(destination, start_date, end_date)
101111
res = create_travel_plan(destination, start_date, end_date, research_data)
102112

103-
from judgeval.data import Sequence
104113
from judgeval.scorers import ToolOrderScorer
105114
from judgeval import JudgmentClient
106115

@@ -110,27 +119,29 @@ def generate_itinerary(destination, start_date, end_date):
110119
input={"destination": "Paris", "start_date": "2025-06-01", "end_date": "2025-06-02"},
111120
expected_tools=[
112121
{
113-
"tool_name": "search_tavily",
122+
"tool_name": "get_attractions",
114123
"parameters": {
115-
"query": "Best tourist attractions in Paris"
124+
"destination": "Paris"
116125
}
117126
},
118127
{
119-
"tool_name": "search_tavily",
128+
"tool_name": "get_hotels",
120129
"parameters": {
121-
"query": "Best hotels in Paris"
130+
"destination": "Paris"
122131
}
123132
},
124133
{
125-
"tool_name": "search_tavily",
134+
"tool_name": "get_flights",
126135
"parameters": {
127-
"query": "Flights to Paris from major cities"
136+
"destination": "Paris"
128137
}
129138
},
130139
{
131-
"tool_name": "search_tavily",
140+
"tool_name": "get_weather",
132141
"parameters": {
133-
"query": "Weather forecast for Paris from 2025-06-01 to 2025-06-02"
142+
"destination": "Paris",
143+
"start_date": "2025-06-01",
144+
"end_date": "2025-06-02"
134145
}
135146
}
136147
]
@@ -141,11 +152,12 @@ def generate_itinerary(destination, start_date, end_date):
141152
{"tool_name": "search_tavily", "parameters": {"query": "Best tourist attractions in Tokyo"}},
142153
{"tool_name": "search_tavily", "parameters": {"query": "Best hotels in Tokyo"}},
143154
{"tool_name": "search_tavily", "parameters": {"query": "Flights to Tokyo from major cities"}},
144-
{"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-02"}}
155+
{"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-03"}}
145156
]
146157
)
147158

148159
judgment.assert_test(
160+
project_name="travel_agent_demo",
149161
examples=[example],
150162
scorers=[ToolOrderScorer(threshold=0.5)],
151163
model="gpt-4.1-mini",

src/e2etests/test_all_scorers.py

Lines changed: 1 addition & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
ClassifierScorer,
2424
)
2525

26-
from judgeval.data import Example, Sequence
26+
from judgeval.data import Example
2727

2828

2929
def test_ac_scorer(client: JudgmentClient):
@@ -533,31 +533,6 @@ def test_execution_order_scorer(client: JudgmentClient):
533533
override=True
534534
)
535535

536-
def test_derailment_scorer(client: JudgmentClient):
537-
PROJECT_NAME = "test-project"
538-
EVAL_RUN_NAME = "test-run-derailment"
539-
540-
airlines_example = Example(
541-
input="Which airlines fly to Paris?",
542-
actual_output="Air France, Delta, and American Airlines offer direct flights."
543-
)
544-
weather_example = Example(
545-
input="What is the weather like in Texas?",
546-
actual_output="It's sunny with a high of 75°F in Texas."
547-
)
548-
airline_sequence = Sequence(
549-
name="Flight Details",
550-
items=[airlines_example, weather_example],
551-
)
552-
results = client.run_sequence_evaluation(
553-
eval_run_name=EVAL_RUN_NAME,
554-
project_name=PROJECT_NAME,
555-
sequences=[airline_sequence],
556-
scorers=[DerailmentScorer(threshold=0.5)],
557-
model="gpt-4.1",
558-
log_results=True,
559-
override=True,
560-
)
561536
def test_json_scorer(client: JudgmentClient):
562537
"""Test JSON scorer functionality."""
563538
example1 = Example(

src/e2etests/test_dataset_operations.py

Lines changed: 1 addition & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
import string
99

1010
from judgeval.judgment_client import JudgmentClient
11-
from judgeval.data import Example, Sequence
11+
from judgeval.data import Example
1212

1313
@pytest.fixture(scope="module", autouse=True)
1414
def setup_and_teardown_module(client: JudgmentClient):
@@ -38,26 +38,6 @@ def test_dataset(self, client: JudgmentClient, project_name: str):
3838

3939
client.delete_dataset(alias="test_dataset_5", project_name=project_name)
4040

41-
def test_dataset_with_sequence(self, client: JudgmentClient, project_name: str):
42-
"""Test dataset creation and manipulation with a sequence."""
43-
dataset = client.create_dataset()
44-
examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
45-
sequence = Sequence(
46-
name="test_sequence",
47-
items=examples
48-
)
49-
dataset.add_sequence(sequence)
50-
client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
51-
52-
dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
53-
assert dataset.sequences, "Failed to pull dataset"
54-
assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
55-
sequence = dataset.sequences[0]
56-
assert sequence.name == "test_sequence", "Sequence should have the correct name"
57-
assert len(sequence.items) == 3, "Sequence should have 3 items"
58-
59-
client.delete_dataset(alias="test_dataset_with_sequence", project_name=project_name)
60-
6141
def test_pull_all_project_dataset_stats(self, client: JudgmentClient, project_name: str):
6242
"""Test pulling statistics for all project datasets."""
6343
dataset = client.create_dataset()
@@ -132,51 +112,6 @@ def test_append_example_dataset(self, client: JudgmentClient, project_name: str)
132112
dataset = client.pull_dataset(alias="test_dataset_8", project_name=project_name)
133113
assert dataset, "Failed to pull dataset"
134114
assert len(dataset.examples) == 3, "Dataset should have 3 examples"
135-
136-
def test_append_sequence_dataset(self, client: JudgmentClient, project_name: str):
137-
"""Test dataset appending."""
138-
dataset = client.create_dataset()
139-
examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
140-
sequence = Sequence(
141-
name="test_sequence",
142-
items=examples
143-
)
144-
dataset.add_sequence(sequence)
145-
client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
146-
147-
dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
148-
assert dataset.sequences, "Failed to pull dataset"
149-
assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
150-
sequence = dataset.sequences[0]
151-
assert sequence.name == "test_sequence", "Sequence should have the correct name"
152-
assert len(sequence.items) == 3, "Sequence should have 3 items"
153-
examples2 = [Example(input="input 4", actual_output="output 4"), Example(input="input 5", actual_output="output 5")]
154-
sequence2 = Sequence(
155-
name="test_sequence2",
156-
items=examples2
157-
)
158-
159-
client.append_sequence_dataset(alias="test_dataset_with_sequence", sequences=[sequence2], project_name=project_name)
160-
161-
dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
162-
assert dataset.sequences, "Failed to pull dataset"
163-
assert len(dataset.sequences) == 2, "Dataset should have 2 sequences"
164-
165-
test_sequence = None
166-
test_sequence2 = None
167-
for seq in dataset.sequences:
168-
if seq.name == "test_sequence":
169-
test_sequence = seq
170-
elif seq.name == "test_sequence2":
171-
test_sequence2 = seq
172-
173-
# Verify first sequence
174-
assert test_sequence is not None, "Could not find 'test_sequence'"
175-
assert len(test_sequence.items) == 3, "Sequence 'test_sequence' should have 3 items"
176-
177-
# Verify second sequence
178-
assert test_sequence2 is not None, "Could not find 'test_sequence2'"
179-
assert len(test_sequence2.items) == 2, "Sequence 'test_sequence2' should have 2 items"
180115

181116
def test_export_jsonl(self, client: JudgmentClient, random_name: str, project_name: str):
182117
"""Test JSONL dataset export functionality."""

src/judgeval/common/tracer.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -146,7 +146,7 @@ def fetch_trace(self, trace_id: str):
146146

147147
return response.json()
148148

149-
def save_trace(self, trace_data: dict):
149+
def save_trace(self, trace_data: dict, offline_mode: bool = False):
150150
"""
151151
Saves a trace to the Judgment Supabase and optionally to S3 if configured.
152152
@@ -183,7 +183,7 @@ def save_trace(self, trace_data: dict):
183183
except Exception as e:
184184
warnings.warn(f"Failed to save trace to S3: {str(e)}")
185185

186-
if "ui_results_url" in response.json():
186+
if not offline_mode and "ui_results_url" in response.json():
187187
pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
188188
rprint(pretty_str)
189189

@@ -660,11 +660,12 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
660660
"entries": [span.model_dump() for span in self.trace_spans],
661661
"evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
662662
"overwrite": overwrite,
663+
"offline_mode": self.tracer.offline_mode,
663664
"parent_trace_id": self.parent_trace_id,
664665
"parent_name": self.parent_name
665666
}
666667
# --- Log trace data before saving ---
667-
self.trace_manager_client.save_trace(trace_data)
668+
self.trace_manager_client.save_trace(trace_data, offline_mode=self.tracer.offline_mode)
668669

669670
# upload annotations
670671
# TODO: batch to the log endpoint
@@ -930,6 +931,7 @@ def __init__(
930931
s3_aws_access_key_id: Optional[str] = None,
931932
s3_aws_secret_access_key: Optional[str] = None,
932933
s3_region_name: Optional[str] = None,
934+
offline_mode: bool = False,
933935
deep_tracing: bool = True # Deep tracing is enabled by default
934936
):
935937
if not hasattr(self, 'initialized'):
@@ -970,6 +972,7 @@ def __init__(
970972
aws_secret_access_key=s3_aws_secret_access_key,
971973
region_name=s3_region_name
972974
)
975+
self.offline_mode: bool = offline_mode
973976
self.deep_tracing: bool = deep_tracing # NEW: Store deep tracing setting
974977

975978
elif hasattr(self, 'project_name') and self.project_name != project_name:

0 commit comments

Comments (0)