
Sequence to Trace Conversion #254


Merged: 6 commits merged on May 19, 2025
16 changes: 0 additions & 16 deletions src/demo/dataset.py

This file was deleted.

46 changes: 0 additions & 46 deletions src/demo/demo.py

This file was deleted.

66 changes: 39 additions & 27 deletions src/demo/sequence_test.py
@@ -15,40 +15,50 @@
 tracer = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo")


-@tracer.observe(span_type="tool")
+# @tracer.observe(span_type="tool")
 def search_tavily(query):
     """Fetch travel data using Tavily API."""
-    API_KEY = os.getenv("TAVILY_API_KEY")
-    client = TavilyClient(api_key=API_KEY)
-    results = client.search(query, num_results=3)
-    return results
+    # API_KEY = os.getenv("TAVILY_API_KEY")
+    # client = TavilyClient(api_key=API_KEY)
+    # results = client.search(query, num_results=3)
+    # return results
+    return "The weather in Tokyo is sunny with a high of 75°F."

-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_attractions(destination):
     """Search for top attractions in the destination."""
     prompt = f"Best tourist attractions in {destination}"
     attractions_search = search_tavily(prompt)
     return attractions_search

-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_hotels(destination):
     """Search for hotels in the destination."""
     prompt = f"Best hotels in {destination}"
     hotels_search = search_tavily(prompt)
     return hotels_search

-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_flights(destination):
     """Search for flights to the destination."""
     prompt = f"Flights to {destination} from major cities"
     flights_search = search_tavily(prompt)
     return flights_search

-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_weather(destination, start_date, end_date):
     """Search for weather information."""
     prompt = f"Weather forecast for {destination} from {start_date} to {end_date}"
     weather_search = search_tavily(prompt)
+    example = Example(
+        input="What is the weather in Tokyo?",
+        actual_output=weather_search
+    )
+    tracer.async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
+        example=example,
+        model="gpt-4o-mini",
+    )
     return weather_search

 def research_destination(destination, start_date, end_date):
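The reworked get_weather above is the PR's online-evaluation pattern: the traced tool builds an Example from its own output and hands it to tracer.async_evaluate, so the score attaches to the trace span rather than to a standalone Sequence. A condensed sketch of that pattern (import paths follow the repo layout in this diff; the AnswerRelevancyScorer import location is assumed from the other judgeval.scorers imports):

```python
import os

from judgeval.common.tracer import Tracer
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer  # assumed export location

tracer = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo")

@tracer.observe(span_type="tool")
def get_weather(destination, start_date, end_date):
    """Stubbed weather lookup; a canned answer keeps the demo offline."""
    weather = f"Sunny in {destination} from {start_date} to {end_date}."
    # Score the tool's actual output against the question, asynchronously,
    # inside the span opened by the @tracer.observe decorator.
    tracer.async_evaluate(
        scorers=[AnswerRelevancyScorer(threshold=0.5)],
        example=Example(input="What is the weather in Tokyo?", actual_output=weather),
        model="gpt-4o-mini",
    )
    return weather
```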
@@ -84,23 +94,22 @@ def create_travel_plan(destination, start_date, end_date, research_data):
     - Weather: {research_data['weather']}
     """

-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[
-            {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
-            {"role": "user", "content": prompt}
-        ]
-    ).choices[0].message.content
+    # response = client.chat.completions.create(
+    #     model="gpt-4o",
+    #     messages=[
+    #         {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
+    #         {"role": "user", "content": prompt}
+    #     ]
+    # ).choices[0].message.content

-    return response
+    return "Here is travel plan"

 @tracer.observe(span_type="function")
 def generate_itinerary(destination, start_date, end_date):
     """Main function to generate a travel itinerary."""
     research_data = research_destination(destination, start_date, end_date)
     res = create_travel_plan(destination, start_date, end_date, research_data)

-from judgeval.data import Sequence
 from judgeval.scorers import ToolOrderScorer
 from judgeval import JudgmentClient

@@ -110,27 +119,29 @@ def generate_itinerary(destination, start_date, end_date):
     input={"destination": "Paris", "start_date": "2025-06-01", "end_date": "2025-06-02"},
     expected_tools=[
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_attractions",
             "parameters": {
-                "query": "Best tourist attractions in Paris"
+                "destination": "Paris"
             }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_hotels",
             "parameters": {
-                "query": "Best hotels in Paris"
+                "destination": "Paris"
             }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_flights",
             "parameters": {
-                "query": "Flights to Paris from major cities"
+                "destination": "Paris"
             }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_weather",
             "parameters": {
-                "query": "Weather forecast for Paris from 2025-06-01 to 2025-06-02"
+                "destination": "Paris",
+                "start_date": "2025-06-01",
+                "end_date": "2025-06-02"
             }
         }
     ]
@@ -141,11 +152,12 @@ def generate_itinerary(destination, start_date, end_date):
         {"tool_name": "search_tavily", "parameters": {"query": "Best tourist attractions in Tokyo"}},
         {"tool_name": "search_tavily", "parameters": {"query": "Best hotels in Tokyo"}},
         {"tool_name": "search_tavily", "parameters": {"query": "Flights to Tokyo from major cities"}},
-        {"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-02"}}
+        {"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-03"}}
     ]
 )

 judgment.assert_test(
+    project_name="travel_agent_demo",
     examples=[example],
     scorers=[ToolOrderScorer(threshold=0.5)],
     model="gpt-4.1-mini",
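Read end to end, this file's changes make the decorated tool functions, rather than raw search_tavily queries, the unit the ToolOrderScorer checks. A minimal sketch of the migrated test, assembled from the pieces visible in this diff (the agent invocation itself is truncated by the diff, so only the assertion shape is shown):

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer

judgment = JudgmentClient()  # client construction assumed; the diff only shows usage

# Expected tools now name the @tracer.observe-decorated functions and their
# keyword arguments, not the underlying search_tavily queries.
example = Example(
    input={"destination": "Paris", "start_date": "2025-06-01", "end_date": "2025-06-02"},
    expected_tools=[
        {"tool_name": "get_attractions", "parameters": {"destination": "Paris"}},
        {"tool_name": "get_hotels", "parameters": {"destination": "Paris"}},
        {"tool_name": "get_flights", "parameters": {"destination": "Paris"}},
        {"tool_name": "get_weather", "parameters": {
            "destination": "Paris",
            "start_date": "2025-06-01",
            "end_date": "2025-06-02",
        }},
    ],
)

judgment.assert_test(
    project_name="travel_agent_demo",  # newly passed at the call site in this PR
    examples=[example],
    scorers=[ToolOrderScorer(threshold=0.5)],
    model="gpt-4.1-mini",
)
```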
27 changes: 1 addition & 26 deletions src/e2etests/test_all_scorers.py
@@ -23,7 +23,7 @@
     ClassifierScorer,
 )

-from judgeval.data import Example, Sequence
+from judgeval.data import Example


def test_ac_scorer(client: JudgmentClient):
@@ -533,31 +533,6 @@ def test_execution_order_scorer(client: JudgmentClient):
         override=True
     )

-def test_derailment_scorer(client: JudgmentClient):
-    PROJECT_NAME = "test-project"
-    EVAL_RUN_NAME = "test-run-derailment"
-
-    airlines_example = Example(
-        input="Which airlines fly to Paris?",
-        actual_output="Air France, Delta, and American Airlines offer direct flights."
-    )
-    weather_example = Example(
-        input="What is the weather like in Texas?",
-        actual_output="It's sunny with a high of 75°F in Texas."
-    )
-    airline_sequence = Sequence(
-        name="Flight Details",
-        items=[airlines_example, weather_example],
-    )
-    results = client.run_sequence_evaluation(
-        eval_run_name=EVAL_RUN_NAME,
-        project_name=PROJECT_NAME,
-        sequences=[airline_sequence],
-        scorers=[DerailmentScorer(threshold=0.5)],
-        model="gpt-4.1",
-        log_results=True,
-        override=True,
-    )
def test_json_scorer(client: JudgmentClient):
"""Test JSON scorer functionality."""
example1 = Example(
67 changes: 1 addition & 66 deletions src/e2etests/test_dataset_operations.py
@@ -8,7 +8,7 @@
 import string

 from judgeval.judgment_client import JudgmentClient
-from judgeval.data import Example, Sequence
+from judgeval.data import Example

@pytest.fixture(scope="module", autouse=True)
def setup_and_teardown_module(client: JudgmentClient):
@@ -38,26 +38,6 @@ def test_dataset(self, client: JudgmentClient, project_name: str):

         client.delete_dataset(alias="test_dataset_5", project_name=project_name)

-    def test_dataset_with_sequence(self, client: JudgmentClient, project_name: str):
-        """Test dataset creation and manipulation with a sequence."""
-        dataset = client.create_dataset()
-        examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
-        sequence = Sequence(
-            name="test_sequence",
-            items=examples
-        )
-        dataset.add_sequence(sequence)
-        client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
-        sequence = dataset.sequences[0]
-        assert sequence.name == "test_sequence", "Sequence should have the correct name"
-        assert len(sequence.items) == 3, "Sequence should have 3 items"
-
-        client.delete_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-
def test_pull_all_project_dataset_stats(self, client: JudgmentClient, project_name: str):
"""Test pulling statistics for all project datasets."""
dataset = client.create_dataset()
@@ -132,51 +112,6 @@ def test_append_example_dataset(self, client: JudgmentClient, project_name: str):
         dataset = client.pull_dataset(alias="test_dataset_8", project_name=project_name)
         assert dataset, "Failed to pull dataset"
         assert len(dataset.examples) == 3, "Dataset should have 3 examples"
-
-    def test_append_sequence_dataset(self, client: JudgmentClient, project_name: str):
-        """Test dataset appending."""
-        dataset = client.create_dataset()
-        examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
-        sequence = Sequence(
-            name="test_sequence",
-            items=examples
-        )
-        dataset.add_sequence(sequence)
-        client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
-        sequence = dataset.sequences[0]
-        assert sequence.name == "test_sequence", "Sequence should have the correct name"
-        assert len(sequence.items) == 3, "Sequence should have 3 items"
-        examples2 = [Example(input="input 4", actual_output="output 4"), Example(input="input 5", actual_output="output 5")]
-        sequence2 = Sequence(
-            name="test_sequence2",
-            items=examples2
-        )
-
-        client.append_sequence_dataset(alias="test_dataset_with_sequence", sequences=[sequence2], project_name=project_name)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 2, "Dataset should have 2 sequences"
-
-        test_sequence = None
-        test_sequence2 = None
-        for seq in dataset.sequences:
-            if seq.name == "test_sequence":
-                test_sequence = seq
-            elif seq.name == "test_sequence2":
-                test_sequence2 = seq
-
-        # Verify first sequence
-        assert test_sequence is not None, "Could not find 'test_sequence'"
-        assert len(test_sequence.items) == 3, "Sequence 'test_sequence' should have 3 items"
-
-        # Verify second sequence
-        assert test_sequence2 is not None, "Could not find 'test_sequence2'"
-        assert len(test_sequence2.items) == 2, "Sequence 'test_sequence2' should have 2 items"

def test_export_jsonl(self, client: JudgmentClient, random_name: str, project_name: str):
"""Test JSONL dataset export functionality."""
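With the Sequence helpers gone, datasets round-trip through examples alone. A hypothetical sketch of the surviving workflow using only the client calls exercised above; add_example is an assumed helper, named by analogy with the removed add_sequence:

```python
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example

client = JudgmentClient()
project_name = "test-project"  # illustrative

dataset = client.create_dataset()
for i in (1, 2, 3):
    # Assumed helper, mirroring the deleted dataset.add_sequence(...) call.
    dataset.add_example(Example(input=f"input {i}", actual_output=f"output {i}"))

client.push_dataset(alias="examples_only", dataset=dataset,
                    project_name=project_name, overwrite=True)

pulled = client.pull_dataset(alias="examples_only", project_name=project_name)
assert len(pulled.examples) == 3, "Dataset should have 3 examples"

client.delete_dataset(alias="examples_only", project_name=project_name)
```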
9 changes: 6 additions & 3 deletions src/judgeval/common/tracer.py
@@ -146,7 +146,7 @@ def fetch_trace(self, trace_id: str):

         return response.json()

-    def save_trace(self, trace_data: dict):
+    def save_trace(self, trace_data: dict, offline_mode: bool = False):
         """
         Saves a trace to the Judgment Supabase and optionally to S3 if configured.

@@ -183,7 +183,7 @@ def save_trace(self, trace_data: dict):
         except Exception as e:
             warnings.warn(f"Failed to save trace to S3: {str(e)}")

-        if "ui_results_url" in response.json():
+        if not offline_mode and "ui_results_url" in response.json():
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
@@ -660,11 +660,12 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
             "entries": [span.model_dump() for span in self.trace_spans],
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "overwrite": overwrite,
+            "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
         }
         # --- Log trace data before saving ---
-        self.trace_manager_client.save_trace(trace_data)
+        self.trace_manager_client.save_trace(trace_data, offline_mode=self.tracer.offline_mode)

         # upload annotations
         # TODO: batch to the log endpoint
@@ -930,6 +931,7 @@ def __init__(
         s3_aws_access_key_id: Optional[str] = None,
         s3_aws_secret_access_key: Optional[str] = None,
         s3_region_name: Optional[str] = None,
+        offline_mode: bool = False,
         deep_tracing: bool = True  # Deep tracing is enabled by default
     ):
         if not hasattr(self, 'initialized'):
@@ -970,6 +972,7 @@
                 aws_secret_access_key=s3_aws_secret_access_key,
                 region_name=s3_region_name
             )
+            self.offline_mode: bool = offline_mode
             self.deep_tracing: bool = deep_tracing  # NEW: Store deep tracing setting

         elif hasattr(self, 'project_name') and self.project_name != project_name:
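The tracer changes thread a single offline_mode flag from the Tracer constructor into the saved payload and the save_trace call. A minimal usage sketch, assuming the constructor arguments shown in this diff:

```python
import os

from judgeval.common.tracer import Tracer

# offline_mode=True is stored on the tracer, serialized into the trace payload
# as "offline_mode", and passed to save_trace so the "View Trace" UI link is
# not printed.
tracer = Tracer(
    api_key=os.getenv("JUDGMENT_API_KEY"),
    project_name="travel_agent_demo",
    offline_mode=True,
)

@tracer.observe(span_type="tool")
def get_weather(destination):
    return f"The weather in {destination} is sunny."

get_weather("Tokyo")  # traced and saved; no UI link output in offline mode
```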