
Sequence to Trace Conversion #254


Merged: 6 commits merged on May 19, 2025
16 changes: 0 additions & 16 deletions src/demo/dataset.py

This file was deleted.

46 changes: 0 additions & 46 deletions src/demo/demo.py

This file was deleted.

66 changes: 39 additions & 27 deletions src/demo/sequence_test.py
@@ -15,40 +15,50 @@
 tracer = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo")


-@tracer.observe(span_type="tool")
+# @tracer.observe(span_type="tool")
 def search_tavily(query):
     """Fetch travel data using Tavily API."""
-    API_KEY = os.getenv("TAVILY_API_KEY")
-    client = TavilyClient(api_key=API_KEY)
-    results = client.search(query, num_results=3)
-    return results
+    # API_KEY = os.getenv("TAVILY_API_KEY")
+    # client = TavilyClient(api_key=API_KEY)
+    # results = client.search(query, num_results=3)
+    # return results
+    return "The weather in Tokyo is sunny with a high of 75°F."

-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_attractions(destination):
     """Search for top attractions in the destination."""
     prompt = f"Best tourist attractions in {destination}"
     attractions_search = search_tavily(prompt)
     return attractions_search

-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_hotels(destination):
     """Search for hotels in the destination."""
     prompt = f"Best hotels in {destination}"
     hotels_search = search_tavily(prompt)
     return hotels_search

-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_flights(destination):
     """Search for flights to the destination."""
     prompt = f"Flights to {destination} from major cities"
     flights_search = search_tavily(prompt)
     return flights_search

-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_weather(destination, start_date, end_date):
     """Search for weather information."""
     prompt = f"Weather forecast for {destination} from {start_date} to {end_date}"
     weather_search = search_tavily(prompt)
+    example = Example(
+        input="What is the weather in Tokyo?",
+        actual_output=weather_search
+    )
+    tracer.async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
+        example=example,
+        model="gpt-4o-mini",
+    )
     return weather_search

 def research_destination(destination, start_date, end_date):
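The reworked get_weather above is the PR's online-evaluation pattern: the traced tool builds an Example from its own output and hands it to tracer.async_evaluate, so the score attaches to the trace span rather than to a standalone Sequence. A condensed sketch of that pattern (import paths follow the repo layout in this diff; the AnswerRelevancyScorer import location is assumed from the other judgeval.scorers imports):

```python
import os

from judgeval.common.tracer import Tracer
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer  # assumed export location

tracer = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo")

@tracer.observe(span_type="tool")
def get_weather(destination, start_date, end_date):
    """Stubbed weather lookup; a canned answer keeps the demo offline."""
    weather = f"Sunny in {destination} from {start_date} to {end_date}."
    # Score the tool's actual output against the question, asynchronously,
    # inside the span opened by the @tracer.observe decorator.
    tracer.async_evaluate(
        scorers=[AnswerRelevancyScorer(threshold=0.5)],
        example=Example(input="What is the weather in Tokyo?", actual_output=weather),
        model="gpt-4o-mini",
    )
    return weather
```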
@@ -84,23 +94,22 @@ def create_travel_plan(destination, start_date, end_date, research_data):
     - Weather: {research_data['weather']}
     """

-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[
-            {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
-            {"role": "user", "content": prompt}
-        ]
-    ).choices[0].message.content
+    # response = client.chat.completions.create(
+    #     model="gpt-4o",
+    #     messages=[
+    #         {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
+    #         {"role": "user", "content": prompt}
+    #     ]
+    # ).choices[0].message.content

-    return response
+    return "Here is travel plan"

 @tracer.observe(span_type="function")
 def generate_itinerary(destination, start_date, end_date):
     """Main function to generate a travel itinerary."""
     research_data = research_destination(destination, start_date, end_date)
     res = create_travel_plan(destination, start_date, end_date, research_data)

-from judgeval.data import Sequence
 from judgeval.scorers import ToolOrderScorer
 from judgeval import JudgmentClient

@@ -110,27 +119,29 @@ def generate_itinerary(destination, start_date, end_date):
     input={"destination": "Paris", "start_date": "2025-06-01", "end_date": "2025-06-02"},
     expected_tools=[
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_attractions",
             "parameters": {
-                "query": "Best tourist attractions in Paris"
+                "destination": "Paris"
             }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_hotels",
             "parameters": {
-                "query": "Best hotels in Paris"
+                "destination": "Paris"
             }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_flights",
             "parameters": {
-                "query": "Flights to Paris from major cities"
+                "destination": "Paris"
             }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_weather",
             "parameters": {
-                "query": "Weather forecast for Paris from 2025-06-01 to 2025-06-02"
+                "destination": "Paris",
+                "start_date": "2025-06-01",
+                "end_date": "2025-06-02"
             }
         }
     ]
@@ -141,11 +152,12 @@ def generate_itinerary(destination, start_date, end_date):
         {"tool_name": "search_tavily", "parameters": {"query": "Best tourist attractions in Tokyo"}},
         {"tool_name": "search_tavily", "parameters": {"query": "Best hotels in Tokyo"}},
         {"tool_name": "search_tavily", "parameters": {"query": "Flights to Tokyo from major cities"}},
-        {"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-02"}}
+        {"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-03"}}
     ]
 )

 judgment.assert_test(
+    project_name="travel_agent_demo",
     examples=[example],
     scorers=[ToolOrderScorer(threshold=0.5)],
     model="gpt-4.1-mini",
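Read end to end, this file's changes make the decorated tool functions, rather than raw search_tavily queries, the unit the ToolOrderScorer checks. A minimal sketch of the migrated test, assembled from the pieces visible in this diff (the agent invocation itself is truncated by the diff, so only the assertion shape is shown):

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer

judgment = JudgmentClient()  # client construction assumed; the diff only shows usage

# Expected tools now name the @tracer.observe-decorated functions and their
# keyword arguments, not the underlying search_tavily queries.
example = Example(
    input={"destination": "Paris", "start_date": "2025-06-01", "end_date": "2025-06-02"},
    expected_tools=[
        {"tool_name": "get_attractions", "parameters": {"destination": "Paris"}},
        {"tool_name": "get_hotels", "parameters": {"destination": "Paris"}},
        {"tool_name": "get_flights", "parameters": {"destination": "Paris"}},
        {"tool_name": "get_weather", "parameters": {
            "destination": "Paris",
            "start_date": "2025-06-01",
            "end_date": "2025-06-02",
        }},
    ],
)

judgment.assert_test(
    project_name="travel_agent_demo",  # newly passed at the call site in this PR
    examples=[example],
    scorers=[ToolOrderScorer(threshold=0.5)],
    model="gpt-4.1-mini",
)
```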
27 changes: 1 addition & 26 deletions src/e2etests/test_all_scorers.py
@@ -23,7 +23,7 @@
     ClassifierScorer,
 )

-from judgeval.data import Example, Sequence
+from judgeval.data import Example


def test_ac_scorer(client: JudgmentClient):
@@ -533,31 +533,6 @@ def test_execution_order_scorer(client: JudgmentClient):
         override=True
     )

-def test_derailment_scorer(client: JudgmentClient):
-    PROJECT_NAME = "test-project"
-    EVAL_RUN_NAME = "test-run-derailment"
-
-    airlines_example = Example(
-        input="Which airlines fly to Paris?",
-        actual_output="Air France, Delta, and American Airlines offer direct flights."
-    )
-    weather_example = Example(
-        input="What is the weather like in Texas?",
-        actual_output="It's sunny with a high of 75°F in Texas."
-    )
-    airline_sequence = Sequence(
-        name="Flight Details",
-        items=[airlines_example, weather_example],
-    )
-    results = client.run_sequence_evaluation(
-        eval_run_name=EVAL_RUN_NAME,
-        project_name=PROJECT_NAME,
-        sequences=[airline_sequence],
-        scorers=[DerailmentScorer(threshold=0.5)],
-        model="gpt-4.1",
-        log_results=True,
-        override=True,
-    )
def test_json_scorer(client: JudgmentClient):
"""Test JSON scorer functionality."""
example1 = Example(
67 changes: 1 addition & 66 deletions src/e2etests/test_dataset_operations.py
@@ -8,7 +8,7 @@
 import string

 from judgeval.judgment_client import JudgmentClient
-from judgeval.data import Example, Sequence
+from judgeval.data import Example

@pytest.fixture(scope="module", autouse=True)
def setup_and_teardown_module(client: JudgmentClient):
@@ -38,26 +38,6 @@ def test_dataset(self, client: JudgmentClient, project_name: str):

         client.delete_dataset(alias="test_dataset_5", project_name=project_name)

-    def test_dataset_with_sequence(self, client: JudgmentClient, project_name: str):
-        """Test dataset creation and manipulation with a sequence."""
-        dataset = client.create_dataset()
-        examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
-        sequence = Sequence(
-            name="test_sequence",
-            items=examples
-        )
-        dataset.add_sequence(sequence)
-        client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
-        sequence = dataset.sequences[0]
-        assert sequence.name == "test_sequence", "Sequence should have the correct name"
-        assert len(sequence.items) == 3, "Sequence should have 3 items"
-
-        client.delete_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-
def test_pull_all_project_dataset_stats(self, client: JudgmentClient, project_name: str):
"""Test pulling statistics for all project datasets."""
dataset = client.create_dataset()
@@ -132,51 +112,6 @@ def test_append_example_dataset(self, client: JudgmentClient, project_name: str):
         dataset = client.pull_dataset(alias="test_dataset_8", project_name=project_name)
         assert dataset, "Failed to pull dataset"
         assert len(dataset.examples) == 3, "Dataset should have 3 examples"
-
-    def test_append_sequence_dataset(self, client: JudgmentClient, project_name: str):
-        """Test dataset appending."""
-        dataset = client.create_dataset()
-        examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
-        sequence = Sequence(
-            name="test_sequence",
-            items=examples
-        )
-        dataset.add_sequence(sequence)
-        client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
-        sequence = dataset.sequences[0]
-        assert sequence.name == "test_sequence", "Sequence should have the correct name"
-        assert len(sequence.items) == 3, "Sequence should have 3 items"
-        examples2 = [Example(input="input 4", actual_output="output 4"), Example(input="input 5", actual_output="output 5")]
-        sequence2 = Sequence(
-            name="test_sequence2",
-            items=examples2
-        )
-
-        client.append_sequence_dataset(alias="test_dataset_with_sequence", sequences=[sequence2], project_name=project_name)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 2, "Dataset should have 2 sequences"
-
-        test_sequence = None
-        test_sequence2 = None
-        for seq in dataset.sequences:
-            if seq.name == "test_sequence":
-                test_sequence = seq
-            elif seq.name == "test_sequence2":
-                test_sequence2 = seq
-
-        # Verify first sequence
-        assert test_sequence is not None, "Could not find 'test_sequence'"
-        assert len(test_sequence.items) == 3, "Sequence 'test_sequence' should have 3 items"
-
-        # Verify second sequence
-        assert test_sequence2 is not None, "Could not find 'test_sequence2'"
-        assert len(test_sequence2.items) == 2, "Sequence 'test_sequence2' should have 2 items"

def test_export_jsonl(self, client: JudgmentClient, random_name: str, project_name: str):
"""Test JSONL dataset export functionality."""
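With the Sequence helpers gone, datasets round-trip through examples alone. A hypothetical sketch of the surviving workflow using only the client calls exercised above; add_example is an assumed helper, named by analogy with the removed add_sequence:

```python
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example

client = JudgmentClient()
project_name = "test-project"  # illustrative

dataset = client.create_dataset()
for i in (1, 2, 3):
    # Assumed helper, mirroring the deleted dataset.add_sequence(...) call.
    dataset.add_example(Example(input=f"input {i}", actual_output=f"output {i}"))

client.push_dataset(alias="examples_only", dataset=dataset,
                    project_name=project_name, overwrite=True)

pulled = client.pull_dataset(alias="examples_only", project_name=project_name)
assert len(pulled.examples) == 3, "Dataset should have 3 examples"

client.delete_dataset(alias="examples_only", project_name=project_name)
```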
9 changes: 6 additions & 3 deletions src/judgeval/common/tracer.py
@@ -146,7 +146,7 @@ def fetch_trace(self, trace_id: str):

         return response.json()

-    def save_trace(self, trace_data: dict):
+    def save_trace(self, trace_data: dict, offline_mode: bool = False):
         """
         Saves a trace to the Judgment Supabase and optionally to S3 if configured.

@@ -183,7 +183,7 @@ def save_trace(self, trace_data: dict):
         except Exception as e:
             warnings.warn(f"Failed to save trace to S3: {str(e)}")

-        if "ui_results_url" in response.json():
+        if not offline_mode and "ui_results_url" in response.json():
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
@@ -660,11 +660,12 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
             "entries": [span.model_dump() for span in self.trace_spans],
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "overwrite": overwrite,
+            "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
         }
         # --- Log trace data before saving ---
-        self.trace_manager_client.save_trace(trace_data)
+        self.trace_manager_client.save_trace(trace_data, offline_mode=self.tracer.offline_mode)

         # upload annotations
         # TODO: batch to the log endpoint
@@ -930,6 +931,7 @@ def __init__(
         s3_aws_access_key_id: Optional[str] = None,
         s3_aws_secret_access_key: Optional[str] = None,
         s3_region_name: Optional[str] = None,
+        offline_mode: bool = False,
         deep_tracing: bool = True  # Deep tracing is enabled by default
     ):
         if not hasattr(self, 'initialized'):
@@ -970,6 +972,7 @@
                 aws_secret_access_key=s3_aws_secret_access_key,
                 region_name=s3_region_name
             )
+            self.offline_mode: bool = offline_mode
             self.deep_tracing: bool = deep_tracing  # NEW: Store deep tracing setting

         elif hasattr(self, 'project_name') and self.project_name != project_name:
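The tracer changes thread a single offline_mode flag from the Tracer constructor into the saved payload and the save_trace call. A minimal usage sketch, assuming the constructor arguments shown in this diff:

```python
import os

from judgeval.common.tracer import Tracer

# offline_mode=True is stored on the tracer, serialized into the trace payload
# as "offline_mode", and passed to save_trace so the "View Trace" UI link is
# not printed.
tracer = Tracer(
    api_key=os.getenv("JUDGMENT_API_KEY"),
    project_name="travel_agent_demo",
    offline_mode=True,
)

@tracer.observe(span_type="tool")
def get_weather(destination):
    return f"The weather in {destination} is sunny."

get_weather("Tokyo")  # traced and saved; no UI link output in offline mode
```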