JudgmentLabs
diff --git a/src/demo/dataset.py b/src/demo/dataset.py
Lines changed: 0 additions & 16 deletions
diff --git a/src/demo/demo.py b/src/demo/demo.py
Lines changed: 0 additions & 46 deletions
diff --git a/src/demo/sequence_test.py b/src/demo/sequence_test.py
Lines changed: 10 additions & 13 deletions
diff --git a/src/e2etests/test_all_scorers.py b/src/e2etests/test_all_scorers.py
Lines changed: 1 addition & 26 deletions
diff --git a/src/e2etests/test_dataset_operations.py b/src/e2etests/test_dataset_operations.py
Lines changed: 1 addition & 66 deletions
diff --git a/src/judgeval/common/tracer.py b/src/judgeval/common/tracer.py
Lines changed: 5 additions & 1 deletion
diff --git a/src/judgeval/constants.py b/src/judgeval/constants.py
Lines changed: 1 addition & 3 deletions
diff --git a/src/judgeval/data/__init__.py b/src/judgeval/data/__init__.py
Lines changed: 0 additions & 2 deletions
diff --git a/src/judgeval/data/datasets/dataset.py b/src/judgeval/data/datasets/dataset.py
Lines changed: 2 additions & 9 deletions
@@ -18,10 +18,7 @@
 @tracer.observe(span_type="tool")
 def search_tavily(query):
     """Fetch travel data using Tavily API."""
-    API_KEY = os.getenv("TAVILY_API_KEY")
-    client = TavilyClient(api_key=API_KEY)
-    results = client.search(query, num_results=3)
-    return results
+    return "results"
 
 # @judgment.observe(span_type="tool")
 def get_attractions(destination):
@@ -84,23 +81,22 @@ def create_travel_plan(destination, start_date, end_date, research_data):
     - Weather: {research_data['weather']}
     """
 
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[
-            {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
-            {"role": "user", "content": prompt}
-        ]
-    ).choices[0].message.content
+    # response = client.chat.completions.create(
+    #     model="gpt-4o",
+    #     messages=[
+    #         {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
+    #         {"role": "user", "content": prompt}
+    #     ]
+    # ).choices[0].message.content
 
-    return response
+    return "Here is travel plan"
 
 @tracer.observe(span_type="function")
 def generate_itinerary(destination, start_date, end_date):
     """Main function to generate a travel itinerary."""
     research_data = research_destination(destination, start_date, end_date)
     res = create_travel_plan(destination, start_date, end_date, research_data)
 
-from judgeval.data import Sequence
 from judgeval.scorers import ToolOrderScorer    
 from judgeval import JudgmentClient
 
@@ -146,6 +142,7 @@ def generate_itinerary(destination, start_date, end_date):
     )
 
     judgment.assert_test(
+        project_name="travel_agent_demo",
         examples=[example],
         scorers=[ToolOrderScorer(threshold=0.5)],
         model="gpt-4.1-mini",
 
@@ -23,7 +23,7 @@
     ClassifierScorer,
 )
 
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 
 
 def test_ac_scorer(client: JudgmentClient):
@@ -533,31 +533,6 @@ def test_execution_order_scorer(client: JudgmentClient):
         override=True
     )
 
-def test_derailment_scorer(client: JudgmentClient):
-    PROJECT_NAME = "test-project"
-    EVAL_RUN_NAME = "test-run-derailment"
-
-    airlines_example = Example(
-    input="Which airlines fly to Paris?",
-    actual_output="Air France, Delta, and American Airlines offer direct flights."
-    )
-    weather_example = Example(
-        input="What is the weather like in Texas?",
-        actual_output="It's sunny with a high of 75°F in Texas."
-    )
-    airline_sequence = Sequence(
-        name="Flight Details",
-        items=[airlines_example, weather_example],
-    )
-    results = client.run_sequence_evaluation(
-        eval_run_name=EVAL_RUN_NAME,
-        project_name=PROJECT_NAME,
-        sequences=[airline_sequence],
-        scorers=[DerailmentScorer(threshold=0.5)],
-        model="gpt-4.1",
-        log_results=True,
-        override=True,
-    )
 def test_json_scorer(client: JudgmentClient):
         """Test JSON scorer functionality."""
         example1 = Example(
 
@@ -8,7 +8,7 @@
 import string
 
 from judgeval.judgment_client import JudgmentClient
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 
 @pytest.fixture(scope="module", autouse=True)
 def setup_and_teardown_module(client: JudgmentClient):
@@ -38,26 +38,6 @@ def test_dataset(self, client: JudgmentClient, project_name: str):
 
         client.delete_dataset(alias="test_dataset_5", project_name=project_name)
 
-    def test_dataset_with_sequence(self, client: JudgmentClient, project_name: str):
-        """Test dataset creation and manipulation with a sequence."""
-        dataset = client.create_dataset()
-        examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
-        sequence = Sequence(
-            name="test_sequence",
-            items=examples
-        )
-        dataset.add_sequence(sequence)
-        client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
-        sequence = dataset.sequences[0]
-        assert sequence.name == "test_sequence", "Sequence should have the correct name"
-        assert len(sequence.items) == 3, "Sequence should have 3 items"
-        
-        client.delete_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-
     def test_pull_all_project_dataset_stats(self, client: JudgmentClient, project_name: str):
         """Test pulling statistics for all project datasets."""
         dataset = client.create_dataset()
@@ -132,51 +112,6 @@ def test_append_example_dataset(self, client: JudgmentClient, project_name: str)
         dataset = client.pull_dataset(alias="test_dataset_8", project_name=project_name)
         assert dataset, "Failed to pull dataset"
         assert len(dataset.examples) == 3, "Dataset should have 3 examples"
-    
-    def test_append_sequence_dataset(self, client: JudgmentClient, project_name: str):
-        """Test dataset appending."""
-        dataset = client.create_dataset()
-        examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
-        sequence = Sequence(
-            name="test_sequence",
-            items=examples
-        )
-        dataset.add_sequence(sequence)
-        client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
-        sequence = dataset.sequences[0]
-        assert sequence.name == "test_sequence", "Sequence should have the correct name"
-        assert len(sequence.items) == 3, "Sequence should have 3 items"
-        examples2 = [Example(input="input 4", actual_output="output 4"), Example(input="input 5", actual_output="output 5")]
-        sequence2 = Sequence(
-            name="test_sequence2",
-            items=examples2
-        )
-        
-        client.append_sequence_dataset(alias="test_dataset_with_sequence", sequences=[sequence2], project_name=project_name)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 2, "Dataset should have 2 sequences"
-
-        test_sequence = None
-        test_sequence2 = None
-        for seq in dataset.sequences:
-            if seq.name == "test_sequence":
-                test_sequence = seq
-            elif seq.name == "test_sequence2":
-                test_sequence2 = seq
-
-        # Verify first sequence
-        assert test_sequence is not None, "Could not find 'test_sequence'"
-        assert len(test_sequence.items) == 3, "Sequence 'test_sequence' should have 3 items"
-
-        # Verify second sequence
-        assert test_sequence2 is not None, "Could not find 'test_sequence2'"
-        assert len(test_sequence2.items) == 2, "Sequence 'test_sequence2' should have 2 items"
 
     def test_export_jsonl(self, client: JudgmentClient, random_name: str, project_name: str):
         """Test JSONL dataset export functionality."""
 
@@ -660,11 +660,13 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
             "entries": [span.model_dump() for span in self.trace_spans],
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "overwrite": overwrite,
+            "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
         }        
         # --- Log trace data before saving ---
-        self.trace_manager_client.save_trace(trace_data)
+        if not self.tracer.offline_mode:
+            self.trace_manager_client.save_trace(trace_data)
 
         # upload annotations
         # TODO: batch to the log endpoint
@@ -930,6 +932,7 @@ def __init__(
         s3_aws_access_key_id: Optional[str] = None,
         s3_aws_secret_access_key: Optional[str] = None,
         s3_region_name: Optional[str] = None,
+        offline_mode: bool = False,
         deep_tracing: bool = True  # Deep tracing is enabled by default
         ):
         if not hasattr(self, 'initialized'):
@@ -970,6 +973,7 @@ def __init__(
                     aws_secret_access_key=s3_aws_secret_access_key,
                     region_name=s3_region_name
                 )
+            self.offline_mode: bool = offline_mode
             self.deep_tracing: bool = deep_tracing  # NEW: Store deep tracing setting
 
         elif hasattr(self, 'project_name') and self.project_name != project_name:
 
@@ -40,17 +40,15 @@ def _missing_(cls, value):
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
+JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 
@@ -2,7 +2,6 @@
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.sequence import Sequence
 from judgeval.data.trace import Trace, TraceSpan
 
 
@@ -14,7 +13,6 @@
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "Sequence",
     "Trace",
     "TraceSpan",
 ]
@@ -7,13 +7,12 @@
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
 class EvalDataset:
     examples: List[Example]
-    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -22,13 +21,11 @@ def __init__(self,
                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),  
                  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
                  examples: List[Example] = [],
-                 sequences: List[Sequence] = []
                  ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
-        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -223,10 +220,7 @@ def add_from_yaml(self, file_path: str) -> None:
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
-    
-    def add_sequence(self, s: Sequence) -> None:
-        self.sequences = self.sequences + [s]
-    
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -313,7 +307,6 @@ def __str__(self):
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
-            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"