diff --git a/src/demo/dataset.py b/src/demo/dataset.py
deleted file mode 100644
index 5f1b5182..00000000
--- a/src/demo/dataset.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from judgeval import JudgmentClient
-from judgeval.data import Example, Sequence
-from judgeval.scorers import DerailmentScorer
-
-client = JudgmentClient()
-
-dataset = client.pull_dataset(alias="test", project_name="travel_agent_demo_test")
-
-client.run_sequence_evaluation(
-    sequences=dataset.sequences,
-    model="gpt-4.1",
-    project_name="travel_agent_demo_test",
-    scorers=[DerailmentScorer(threshold=0.5)],
-    log_results=True,
-    override=True,
-)
\ No newline at end of file
diff --git a/src/demo/demo.py b/src/demo/demo.py
deleted file mode 100644
index d8525f77..00000000
--- a/src/demo/demo.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from judgeval import JudgmentClient
-from judgeval.data import Example, Sequence
-from judgeval.scorers import DerailmentScorer
-
-client = JudgmentClient()
-
-airlines_example = Example(
-    input="Which airlines fly to Tokyo?",
-    actual_output="Japan Airlines, All Nippon Airways, and Chinese Airlines offer direct flights."
-)
-weather_example = Example(
-    input="What is the weather like in Japan?",
-    actual_output="It's cloudy with a high of 75°F and a low of 60°F in Japan."
-)
-airline_sequence = Sequence(
-    name="Flight Details",
-    items=[airlines_example, weather_example],
-)
-
-# Level 1: Top-level sequence
-top_example1 = Example(
-    input="I want to plan a trip to Tokyok.",
-    actual_output="That sounds great! When are you planning to go?"
-)
-top_example2 = Example(
-    input="Can you book a flight for me and anything else I need to know?",
-    actual_output="Sure, I'll help you with flights. hotels. and transportation."
-)
-top_level_sequence = Sequence(
-    name="Travel Planning",
-    items=[top_example1, top_example2, airline_sequence],
-)
-
-other_sequence = Sequence(
-    name="Other",
-    items=[Example(
-        input="What is the weather like in Tokyo?",
-        actual_output="It's cloudy with a high of 75°F and a low of 60°F in Tokyo."
-    )]
-)
-
-results = client.run_sequence_evaluation(
-    scorers=[DerailmentScorer(threshold=1)],
-    sequences=[other_sequence],
-    override=True,
-)
\ No newline at end of file
diff --git a/src/demo/sequence_test.py b/src/demo/sequence_test.py
index 62abdb13..343b375c 100644
--- a/src/demo/sequence_test.py
+++ b/src/demo/sequence_test.py
@@ -15,40 +15,50 @@ tracer = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo")
 
-@tracer.observe(span_type="tool")
+# @tracer.observe(span_type="tool")
 def search_tavily(query):
     """Fetch travel data using Tavily API."""
-    API_KEY = os.getenv("TAVILY_API_KEY")
-    client = TavilyClient(api_key=API_KEY)
-    results = client.search(query, num_results=3)
-    return results
+    # API_KEY = os.getenv("TAVILY_API_KEY")
+    # client = TavilyClient(api_key=API_KEY)
+    # results = client.search(query, num_results=3)
+    # return results
+    return "The weather in Tokyo is sunny with a high of 75°F."
 
-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_attractions(destination):
     """Search for top attractions in the destination."""
     prompt = f"Best tourist attractions in {destination}"
     attractions_search = search_tavily(prompt)
     return attractions_search
 
-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_hotels(destination):
     """Search for hotels in the destination."""
     prompt = f"Best hotels in {destination}"
     hotels_search = search_tavily(prompt)
     return hotels_search
 
-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_flights(destination):
     """Search for flights to the destination."""
     prompt = f"Flights to {destination} from major cities"
     flights_search = search_tavily(prompt)
     return flights_search
 
-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_weather(destination, start_date, end_date):
     """Search for weather information."""
     prompt = f"Weather forecast for {destination} from {start_date} to {end_date}"
     weather_search = search_tavily(prompt)
+    example = Example(
+        input="What is the weather in Tokyo?",
+        actual_output=weather_search
+    )
+    tracer.async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
+        example=example,
+        model="gpt-4o-mini",
+    )
     return weather_search
 
 def research_destination(destination, start_date, end_date):
@@ -84,15 +94,15 @@ def create_travel_plan(destination, start_date, end_date, research_data):
     - Weather: {research_data['weather']}
     """
 
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[
-            {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
-            {"role": "user", "content": prompt}
-        ]
-    ).choices[0].message.content
+    # response = client.chat.completions.create(
+    #     model="gpt-4o",
+    #     messages=[
+    #         {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
+    #         {"role": "user", "content": prompt}
+    #     ]
+    # ).choices[0].message.content
 
-    return response
+    return "Here is travel plan"
 
 @tracer.observe(span_type="function")
 def generate_itinerary(destination, start_date, end_date):
@@ -100,7 +110,6 @@ def generate_itinerary(destination, start_date, end_date):
     research_data = research_destination(destination, start_date, end_date)
     res = create_travel_plan(destination, start_date, end_date, research_data)
 
-from judgeval.data import Sequence
 from judgeval.scorers import ToolOrderScorer
 from judgeval import JudgmentClient
 
@@ -110,27 +119,29 @@ def generate_itinerary(destination, start_date, end_date):
     input={"destination": "Paris", "start_date": "2025-06-01", "end_date": "2025-06-02"},
     expected_tools=[
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_attractions",
             "parameters": {
-                "query": "Best tourist attractions in Paris"
+                "destination": "Paris"
             }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_hotels",
             "parameters": {
-                "query": "Best hotels in Paris"
+                "destination": "Paris"
             }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_flights",
            "parameters": {
-                "query": "Flights to Paris from major cities"
+                "destination": "Paris"
            }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_weather",
             "parameters": {
-                "query": "Weather forecast for Paris from 2025-06-01 to 2025-06-02"
+                "destination": "Paris",
+                "start_date": "2025-06-01",
+                "end_date": "2025-06-02"
             }
         }
     ]
@@ -141,11 +152,12 @@ def generate_itinerary(destination, start_date, end_date):
         {"tool_name": "search_tavily", "parameters": {"query": "Best tourist attractions in Tokyo"}},
         {"tool_name": "search_tavily", "parameters": {"query": "Best hotels in Tokyo"}},
         {"tool_name": "search_tavily", "parameters": {"query": "Flights to Tokyo from major cities"}},
-        {"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-02"}}
+        {"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-03"}}
     ]
 )
 
 judgment.assert_test(
+    project_name="travel_agent_demo",
     examples=[example],
     scorers=[ToolOrderScorer(threshold=0.5)],
     model="gpt-4.1-mini",
diff --git a/src/e2etests/test_all_scorers.py b/src/e2etests/test_all_scorers.py
index 9b8347e0..e2003d0e 100644
--- a/src/e2etests/test_all_scorers.py
+++ b/src/e2etests/test_all_scorers.py
@@ -23,7 +23,7 @@
     ClassifierScorer,
 )
 
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 
 
 def test_ac_scorer(client: JudgmentClient):
@@ -533,31 +533,6 @@ def test_execution_order_scorer(client: JudgmentClient):
         override=True
     )
 
-def test_derailment_scorer(client: JudgmentClient):
-    PROJECT_NAME = "test-project"
-    EVAL_RUN_NAME = "test-run-derailment"
-
-    airlines_example = Example(
-        input="Which airlines fly to Paris?",
-        actual_output="Air France, Delta, and American Airlines offer direct flights."
-    )
-    weather_example = Example(
-        input="What is the weather like in Texas?",
-        actual_output="It's sunny with a high of 75°F in Texas."
-    )
-    airline_sequence = Sequence(
-        name="Flight Details",
-        items=[airlines_example, weather_example],
-    )
-    results = client.run_sequence_evaluation(
-        eval_run_name=EVAL_RUN_NAME,
-        project_name=PROJECT_NAME,
-        sequences=[airline_sequence],
-        scorers=[DerailmentScorer(threshold=0.5)],
-        model="gpt-4.1",
-        log_results=True,
-        override=True,
-    )
 def test_json_scorer(client: JudgmentClient):
     """Test JSON scorer functionality."""
     example1 = Example(
diff --git a/src/e2etests/test_dataset_operations.py b/src/e2etests/test_dataset_operations.py
index 19f9735f..a9caae7f 100644
--- a/src/e2etests/test_dataset_operations.py
+++ b/src/e2etests/test_dataset_operations.py
@@ -8,7 +8,7 @@
 import string
 
 from judgeval.judgment_client import JudgmentClient
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 
 @pytest.fixture(scope="module", autouse=True)
 def setup_and_teardown_module(client: JudgmentClient):
@@ -38,26 +38,6 @@ def test_dataset(self, client: JudgmentClient, project_name: str):
 
         client.delete_dataset(alias="test_dataset_5", project_name=project_name)
 
-    def test_dataset_with_sequence(self, client: JudgmentClient, project_name: str):
-        """Test dataset creation and manipulation with a sequence."""
-        dataset = client.create_dataset()
-        examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
-        sequence = Sequence(
-            name="test_sequence",
-            items=examples
-        )
-        dataset.add_sequence(sequence)
-        client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
-        sequence = dataset.sequences[0]
-        assert sequence.name == "test_sequence", "Sequence should have the correct name"
-        assert len(sequence.items) == 3, "Sequence should have 3 items"
-
-        client.delete_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-
     def test_pull_all_project_dataset_stats(self, client: JudgmentClient, project_name: str):
         """Test pulling statistics for all project datasets."""
         dataset = client.create_dataset()
@@ -132,51 +112,6 @@ def test_append_example_dataset(self, client: JudgmentClient, project_name: str)
         dataset = client.pull_dataset(alias="test_dataset_8", project_name=project_name)
         assert dataset, "Failed to pull dataset"
         assert len(dataset.examples) == 3, "Dataset should have 3 examples"
-
-    def test_append_sequence_dataset(self, client: JudgmentClient, project_name: str):
-        """Test dataset appending."""
-        dataset = client.create_dataset()
-        examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
-        sequence = Sequence(
-            name="test_sequence",
-            items=examples
-        )
-        dataset.add_sequence(sequence)
-        client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
-        sequence = dataset.sequences[0]
-        assert sequence.name == "test_sequence", "Sequence should have the correct name"
-        assert len(sequence.items) == 3, "Sequence should have 3 items"
-        examples2 = [Example(input="input 4", actual_output="output 4"), Example(input="input 5", actual_output="output 5")]
-        sequence2 = Sequence(
-            name="test_sequence2",
-            items=examples2
-        )
-
-        client.append_sequence_dataset(alias="test_dataset_with_sequence", sequences=[sequence2], project_name=project_name)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 2, "Dataset should have 2 sequences"
-
-        test_sequence = None
-        test_sequence2 = None
-        for seq in dataset.sequences:
-            if seq.name == "test_sequence":
-                test_sequence = seq
-            elif seq.name == "test_sequence2":
-                test_sequence2 = seq
-
-        # Verify first sequence
-        assert test_sequence is not None, "Could not find 'test_sequence'"
-        assert len(test_sequence.items) == 3, "Sequence 'test_sequence' should have 3 items"
-
-        # Verify second sequence
-        assert test_sequence2 is not None, "Could not find 'test_sequence2'"
-        assert len(test_sequence2.items) == 2, "Sequence 'test_sequence2' should have 2 items"
 
     def test_export_jsonl(self, client: JudgmentClient, random_name: str, project_name: str):
         """Test JSONL dataset export functionality."""
diff --git a/src/judgeval/common/tracer.py b/src/judgeval/common/tracer.py
index b869e311..5984a2cb 100644
--- a/src/judgeval/common/tracer.py
+++ b/src/judgeval/common/tracer.py
@@ -146,7 +146,7 @@ def fetch_trace(self, trace_id: str):
 
         return response.json()
 
-    def save_trace(self, trace_data: dict):
+    def save_trace(self, trace_data: dict, offline_mode: bool = False):
         """
         Saves a trace to the Judgment Supabase and optionally to S3 if configured.
 
@@ -183,7 +183,7 @@ def save_trace(self, trace_data: dict):
             except Exception as e:
                 warnings.warn(f"Failed to save trace to S3: {str(e)}")
 
-        if "ui_results_url" in response.json():
+        if not offline_mode and "ui_results_url" in response.json():
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
 
@@ -660,11 +660,12 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
             "entries": [span.model_dump() for span in self.trace_spans],
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "overwrite": overwrite,
+            "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
         }
         # --- Log trace data before saving ---
-        self.trace_manager_client.save_trace(trace_data)
+        self.trace_manager_client.save_trace(trace_data, offline_mode=self.tracer.offline_mode)
 
         # upload annotations
         # TODO: batch to the log endpoint
@@ -930,6 +931,7 @@ def __init__(
         s3_aws_access_key_id: Optional[str] = None,
         s3_aws_secret_access_key: Optional[str] = None,
         s3_region_name: Optional[str] = None,
+        offline_mode: bool = False,
         deep_tracing: bool = True  # Deep tracing is enabled by default
         ):
         if not hasattr(self, 'initialized'):
@@ -970,6 +972,7 @@ def __init__(
                     aws_secret_access_key=s3_aws_secret_access_key,
                     region_name=s3_region_name
                 )
+            self.offline_mode: bool = offline_mode
             self.deep_tracing: bool = deep_tracing  # NEW: Store deep tracing setting
 
         elif hasattr(self, 'project_name') and self.project_name != project_name:
diff --git a/src/judgeval/constants.py b/src/judgeval/constants.py
index 2d8d27ca..fab54477 100644
--- a/src/judgeval/constants.py
+++ b/src/judgeval/constants.py
@@ -40,17 +40,15 @@ def _missing_(cls, value):
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
+JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
diff --git a/src/judgeval/data/__init__.py b/src/judgeval/data/__init__.py
index b1baa68a..3baffe3e 100644
--- a/src/judgeval/data/__init__.py
+++ b/src/judgeval/data/__init__.py
@@ -2,7 +2,6 @@
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.sequence import Sequence
 from judgeval.data.trace import Trace, TraceSpan
 
 
@@ -14,7 +13,6 @@
     "create_scorer_data",
     "ScoringResult",
    "generate_scoring_result",
-    "Sequence",
     "Trace",
     "TraceSpan",
 ]
diff --git a/src/judgeval/data/datasets/dataset.py b/src/judgeval/data/datasets/dataset.py
index fd7e49c5..9759ac17 100644
--- a/src/judgeval/data/datasets/dataset.py
+++ b/src/judgeval/data/datasets/dataset.py
@@ -7,13 +7,12 @@
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
 class EvalDataset:
     examples: List[Example]
-    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -22,13 +21,11 @@ def __init__(self,
                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
                  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
                  examples: List[Example] = [],
-                 sequences: List[Sequence] = []
                  ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
-        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -223,10 +220,7 @@ def add_from_yaml(self, file_path: str) -> None:
 
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
-
-    def add_sequence(self, s: Sequence) -> None:
-        self.sequences = self.sequences + [s]
-
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -313,7 +307,6 @@ def __str__(self):
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
-            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
diff --git a/src/judgeval/data/datasets/eval_dataset_client.py b/src/judgeval/data/datasets/eval_dataset_client.py
index 73d2e01b..a84eae9e 100644
--- a/src/judgeval/data/datasets/eval_dataset_client.py
+++ b/src/judgeval/data/datasets/eval_dataset_client.py
@@ -7,14 +7,13 @@
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset
 
 
@@ -59,8 +58,6 @@ def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: O
                 "dataset_alias": alias,
                 "project_name": project_name,
                 "examples": [e.to_dict() for e in dataset.examples],
-                "sequences": [s.model_dump() for s in dataset.sequences],
-                "is_sequence": len(dataset.sequences) > 0,
                 "overwrite": overwrite,
             }
             try:
@@ -151,63 +148,6 @@ def append_examples(self, alias: str, examples: List[Example], project_name: str
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
             return True
-
-    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
-        """
-        Appends the dataset to Judgment platform
-
-        Mock request:
-        dataset = {
-            "alias": alias,
-            "examples": [...],
-            "project_name": project_name
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..."  # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "sequences": [s.model_dump() for s in sequences],
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id
-                    },
-                    verify=True
-                )
-                if response.status_code != 200:
-                    error(f"Server error during append: {response.json()}")
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during append: {err.response.json()}")
-                else:
-                    error(f"HTTP error during append: {err}")
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
 
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
@@ -262,7 +202,6 @@ def pull(self, alias: str, project_name: str) -> EvalDataset:
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
diff --git a/src/judgeval/data/example.py b/src/judgeval/data/example.py
index 373e1946..d7dd6e7e 100644
--- a/src/judgeval/data/example.py
+++ b/src/judgeval/data/example.py
@@ -37,7 +37,6 @@ class Example(BaseModel):
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-    sequence_order: Optional[int] = 0
 
     def __init__(self, **data):
         if 'example_id' not in data:
diff --git a/src/judgeval/data/result.py b/src/judgeval/data/result.py
index 9a681b38..01b0504a 100644
--- a/src/judgeval/data/result.py
+++ b/src/judgeval/data/result.py
@@ -3,7 +3,7 @@
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
-from judgeval.data.sequence import Sequence
+from judgeval.data.trace import TraceSpan
 
 
 class ScoringResult(BaseModel):
@@ -24,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[Sequence, CustomExample, Example]] = None
+    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -49,7 +49,7 @@ def __str__(self) -> str:
 
 
 def generate_scoring_result(
-    data_object: Union[Example, Sequence],
+    data_object: Union[Example, TraceSpan],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
diff --git a/src/judgeval/data/sequence.py b/src/judgeval/data/sequence.py
deleted file mode 100644
index d9a691dd..00000000
--- a/src/judgeval/data/sequence.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from pydantic import BaseModel, Field, field_validator, model_validator
-from typing import List, Optional, Union, Any, Dict
-from judgeval.data.example import Example
-from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
-from uuid import uuid4
-from datetime import datetime, timezone
-
-class Sequence(BaseModel):
-    """
-    A sequence is a list of either Examples or nested Sequence objects.
-    """
-    sequence_id: str = Field(default_factory=lambda: str(uuid4()))
-    name: Optional[str] = "Sequence"
-    created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
-    items: List[Union["Sequence", Example]] = []
-    scorers: Optional[Any] = None
-    parent_sequence_id: Optional[str] = None
-    sequence_order: Optional[int] = 0
-    root_sequence_id: Optional[str] = None
-    inputs: Optional[Dict[str, Any]] = None
-    output: Optional[Any] = None
-    expected_tools: Optional[List[Dict[str, Any]]] = None
-
-    @field_validator("scorers")
-    def validate_scorer(cls, v):
-        for scorer in v or []:
-            if not isinstance(scorer, APIJudgmentScorer) and not isinstance(scorer, JudgevalScorer):
-                raise ValueError(f"Invalid scorer type: {type(scorer)}")
-        return v
-
-    @model_validator(mode="after")
-    def populate_sequence_metadata(self) -> "Sequence":
-        """Recursively set parent_sequence_id, root_sequence_id, and sequence_order."""
-        # If root_sequence_id isn't already set, assign it to self
-        if self.root_sequence_id is None:
-            self.root_sequence_id = self.sequence_id
-
-        for idx, item in enumerate(self.items):
-            item.sequence_order = idx
-            if isinstance(item, Sequence):
-                item.parent_sequence_id = self.sequence_id
-                item.root_sequence_id = self.root_sequence_id
-                item.populate_sequence_metadata()
-        return self
-
-    class Config:
-        arbitrary_types_allowed = True
-
-# Update forward references so that "Sequence" inside items is resolved.
-Sequence.model_rebuild()
diff --git a/src/judgeval/data/trace.py b/src/judgeval/data/trace.py
index 72d02111..6e44298d 100644
--- a/src/judgeval/data/trace.py
+++ b/src/judgeval/data/trace.py
@@ -9,7 +9,7 @@ class TraceSpan(BaseModel):
     trace_id: str
     function: Optional[str] = None
     depth: int
-    created_at: Optional[float] = None
+    created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
@@ -17,6 +17,8 @@ class TraceSpan(BaseModel):
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
+    expected_tools: Optional[List[Dict[str, Any]]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
 
     def model_dump(self, **kwargs):
         return {
@@ -124,6 +126,7 @@ class Trace(BaseModel):
     duration: float
     entries: List[TraceSpan]
     overwrite: bool = False
+    offline_mode: bool = False
     rules: Optional[Dict[str, Any]] = None
     has_notification: Optional[bool] = False
\ No newline at end of file
diff --git a/src/judgeval/data/sequence_run.py b/src/judgeval/data/trace_run.py
similarity index 92%
rename from src/judgeval/data/sequence_run.py
rename to src/judgeval/data/trace_run.py
index 01b11742..f0428361 100644
--- a/src/judgeval/data/sequence_run.py
+++ b/src/judgeval/data/trace_run.py
@@ -1,20 +1,20 @@
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
-from judgeval.data import Sequence
+from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule
 
-class SequenceRun(BaseModel):
+class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
 
     Args:
        project_name (str): The name of the project the evaluation results belong to
        eval_name (str): A name for this evaluation run
-       sequences (List[Sequence]): The sequences to evaluate
+       traces (List[Trace]): The traces to evaluate
        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
        model (str): The model used as a judge when using LLM as a Judge
        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    sequences: Optional[List[Sequence]] = None
+    traces: Optional[List[Trace]] = None
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
diff --git a/src/judgeval/evaluation_run.py b/src/judgeval/evaluation_run.py
index c2a852aa..1f126ab2 100644
--- a/src/judgeval/evaluation_run.py
+++ b/src/judgeval/evaluation_run.py
@@ -79,7 +79,7 @@ def validate_eval_name(cls, v, values):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples', mode='before')
+    @field_validator('examples')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
diff --git a/src/judgeval/judgment_client.py b/src/judgeval/judgment_client.py
index 0dd630b5..e5d0ce29 100644
--- a/src/judgeval/judgment_client.py
+++ b/src/judgeval/judgment_client.py
@@ -12,7 +12,7 @@
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
+    Trace,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -23,9 +23,9 @@
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    run_sequence_eval
+    run_trace_eval
 )
-from judgeval.data.sequence_run import SequenceRun
+from judgeval.data.trace_run import TraceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -105,16 +105,16 @@ def a_run_evaluation(
             rules=rules
         )
 
-    def run_sequence_evaluation(
+    def run_trace_evaluation(
        self,
        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-       sequences: Optional[List[Sequence]] = None,
+       traces: Optional[List[Trace]] = None,
        examples: Optional[List[Example]] = None,
        test_file: Optional[str] = None,
        aggregator: Optional[str] = None,
        project_name: str = "default_project",
-       eval_run_name: str = "default_eval_sequence",
+       eval_run_name: str = "default_eval_trace",
        log_results: bool = True,
        append: bool = False,
        override: bool = False,
@@ -134,16 +134,16 @@
            if examples and not function:
                raise ValueError("Cannot pass in examples without a function")

-           if sequences and function:
-               raise ValueError("Cannot pass in sequences and function")
+           if traces and function:
+               raise ValueError("Cannot pass in traces and function")

-           if examples and sequences:
-               raise ValueError("Cannot pass in both examples and sequences")
+           if examples and traces:
+               raise ValueError("Cannot pass in both examples and traces")

-           sequence_run = SequenceRun(
+           trace_run = TraceRun(
                project_name=project_name,
                eval_name=eval_run_name,
-               sequences=sequences,
+               traces=traces,
                scorers=scorers,
                model=model,
                aggregator=aggregator,
@@ -152,9 +152,9 @@
                judgment_api_key=self.judgment_api_key,
                organization_id=self.organization_id,
            )
-           return run_sequence_eval(sequence_run, override, ignore_errors, function, tracer, examples)
+           return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
        except ValueError as e:
-           raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
+           raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}")
        except Exception as e:
            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
@@ -245,12 +245,6 @@ def append_example_dataset(self, alias: str, examples: List[Example], project_na
         """
         return self.eval_dataset_client.append_examples(alias, examples, project_name)
-
-    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        """
-        Appends a `Sequence` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
-
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -523,7 +517,7 @@ def assert_test(
             raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
 
         if function:
-            results = self.run_sequence_evaluation(
+            results = self.run_trace_evaluation(
                 examples=examples,
                 scorers=scorers,
                 model=model,
diff --git a/src/judgeval/run_evaluation.py b/src/judgeval/run_evaluation.py
index 79927214..4732767c 100644
--- a/src/judgeval/run_evaluation.py
+++ b/src/judgeval/run_evaluation.py
@@ -13,7 +13,6 @@
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
     Trace
 )
 from judgeval.scorers import (
@@ -25,11 +24,10 @@
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
-    JUDGMENT_SEQUENCE_EVAL_API_URL,
+    JUDGMENT_TRACE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
-    JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -39,7 +37,7 @@
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.sequence_run import SequenceRun
+from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
 from langchain_core.callbacks import BaseCallbackHandler
 
@@ -98,20 +96,20 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
-def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
     """
     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
     """
 
     try:
         # submit API request to execute evals
-        payload = sequence_run.model_dump(warnings=False)
+        payload = trace_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_SEQUENCE_EVAL_API_URL,
+            JUDGMENT_TRACE_EVAL_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {sequence_run.judgment_api_key}",
-                "X-Organization-Id": sequence_run.organization_id
+                "Authorization": f"Bearer {trace_run.judgment_api_key}",
+                "X-Organization-Id": trace_run.organization_id
             },
             json=payload,
             verify=True
@@ -282,7 +280,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -327,51 +325,6 @@ def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[Eval
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
-def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
-    """
-    Retrieves a sequence from a trace ID.
-    """
-    """
-    Logs evaluation results to the Judgment API database.
-
-    Args:
-        merged_results (List[ScoringResult]): The results to log
-        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
-
-    Raises:
-        JudgmentAPIError: If there's an API error during logging
-        ValueError: If there's a validation error with the results
-    """
-    try:
-        res = requests.post(
-            JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id
-            },
-            json={
-                "trace_id": trace_id,
-                "trace_span_id": parent_span,
-            },
-            verify=True
-        )
-
-        if not res.ok:
-            response_data = res.json()
-            error_message = response_data.get('detail', 'An unknown error occurred.')
-            error(f"Error {res.status_code}: {error_message}")
-            raise JudgmentAPIError(error_message)
-
-        return Sequence(**res.json())
-    except requests.exceptions.RequestException as e:
-        error(f"Request failed while saving evaluation results to DB: {str(e)}")
-        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-    except Exception as e:
-        error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
     """Run a function with a spinner in the terminal."""
     spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -415,62 +368,59 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
             if missing_params:
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
+def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and sequence_run.log_results and not sequence_run.append:
+    if not override and trace_run.log_results and not trace_run.append:
         check_eval_run_name_exists(
-            sequence_run.eval_name,
-            sequence_run.project_name,
-            sequence_run.judgment_api_key,
-            sequence_run.organization_id
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id
         )
 
-    if sequence_run.append:
+    if trace_run.append:
         # Check that the current experiment, if one exists, has the same type (examples of sequences)
         check_experiment_type(
-            sequence_run.eval_name,
-            sequence_run.project_name,
-            sequence_run.judgment_api_key,
-            sequence_run.organization_id,
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id,
             True
         )
 
     if function and tracer:
-        new_sequences: List[Sequence] = []
+        new_traces: List[Trace] = []
+        tracer.offline_mode = True
         for example in examples:
             if example.input:
                 result = run_with_spinner("Running agent function: ", function, **example.input)
             else:
                 result = run_with_spinner("Running agent function: ", function)
        for i, trace in enumerate(tracer.traces):
-            trace_id = trace['trace_id']
-            parent_span = trace['entries'][0]['span_id']
-            new_sequence = retrieve_sequence_from_trace(trace_id, parent_span, sequence_run.judgment_api_key, sequence_run.organization_id)
-            new_sequence.expected_tools = examples[i].expected_tools
-            new_sequences.append(new_sequence)
-        sequence_run.sequences = new_sequences
-
-        for sequence in sequence_run.sequences:
-            sequence.scorers = sequence_run.scorers
+            # We set the root-level trace span with the expected tools of the Trace
+            trace = Trace(**trace)
+            trace.entries[0].expected_tools = examples[i].expected_tools
+            new_traces.append(trace)
+        trace_run.traces = new_traces
 
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try:  # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
+        response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
         scoring_results = [ScoringResult(**result) for result in response_data["results"]]
         info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
        error(f"An error occurred while executing the Judgment API request: {str(e)}")
        raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
     except ValueError as e:
-        raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+        raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: {str(e)}")
 
     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    # TODO: allow for custom scorer on sequences
-    if sequence_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
+    # TODO: allow for custom scorer on traces
+    if trace_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
         rprint(pretty_str)
 
     return scoring_results
diff --git a/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py b/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
index c4130842..df92966a 100644
--- a/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
+++ b/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
@@ -5,13 +5,15 @@
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from typing import Optional, Dict
 
 class ToolOrderScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float=1.0):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, exact_match: bool=False):
         super().__init__(
             threshold=threshold,
             score_type=APIScorer.TOOL_ORDER,
         )
+        self.kwargs = {"exact_match": exact_match}
 
     @property
     def __name__(self):