diff --git a/Pipfile b/Pipfile index 7a1a9f02..f5fd917a 100644 --- a/Pipfile +++ b/Pipfile @@ -16,6 +16,7 @@ pandas = "*" openai = "*" together = "*" anthropic = "*" +patronus = "*" [dev-packages] pytest = "*" diff --git a/demo/test_competitors.py b/demo/test_competitors.py new file mode 100644 index 00000000..423906ce --- /dev/null +++ b/demo/test_competitors.py @@ -0,0 +1,96 @@ +from dotenv import load_dotenv +from patronus import Client +import os +import asyncio +import time +from openai import OpenAI +from anthropic import Anthropic + +load_dotenv() + +PATRONUS_API_KEY = os.getenv("PATRONUS_API_KEY") + +client = Client(api_key=PATRONUS_API_KEY) + +# Initialize clients +openai_client = OpenAI() +anthropic_client = Anthropic() + +async def make_upper(input: str) -> str: + output = input.upper() + result = client.evaluate( + evaluator="answer-relevance", + criteria="patronus:answer-relevance", + evaluated_model_input=input, + evaluated_model_output=output, + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + return output + +def llm_call(input): + time.sleep(1.3) + return "We have a 30 day full refund policy on shoes." + +async def answer_user_question(input): + output = llm_call(input) + result = client.evaluate( + evaluator="answer-relevance", + criteria="patronus:answer-relevance", + evaluated_model_input=input, + evaluated_model_output=output, + evaluated_model_retrieved_context=["All customers are eligible for a 30 day full refund at no extra cost."], + expected_output="We offer a 30-day full refund at no extra cost.", + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + return output + +async def make_poem(input: str) -> str: + try: + # Using Anthropic API + anthropic_response = anthropic_client.messages.create( + model="claude-3-sonnet-20240229", + messages=[{"role": "user", "content": input}], + max_tokens=30 + ) + anthropic_result = anthropic_response.content[0].text + + result = client.evaluate( + evaluator="answer-relevance", + criteria="patronus:answer-relevance", + evaluated_model_input=input, + evaluated_model_output=anthropic_result, + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + + # Using OpenAI API + openai_response = openai_client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": "Make a short sentence with the input."}, + {"role": "user", "content": input} + ] + ) + openai_result = openai_response.choices[0].message.content + + return f"{anthropic_result} {openai_result}".lower() + + except Exception as e: + print(f"Error generating poem: {e}") + return "" + +async def test_evaluation_mixed(input): + upper = await make_upper(input) + result = await make_poem(upper) + await answer_user_question("What if these shoes don't fit?") + return result + +if __name__ == "__main__": + test_input = "Write a poem about Nissan R32 GTR" + asyncio.run(test_evaluation_mixed(test_input)) + diff --git a/docs/notebooks/prompt_scorer.ipynb b/docs/notebooks/prompt_scorer.ipynb index efe0323c..fb3f0223 100644 --- a/docs/notebooks/prompt_scorer.ipynb +++ b/docs/notebooks/prompt_scorer.ipynb @@ -157,7 +157,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/e2etests/judgment_client_test.py b/e2etests/judgment_client_test.py index 4eb7484b..d3394f94 100644 --- a/e2etests/judgment_client_test.py +++ b/e2etests/judgment_client_test.py @@ -16,6 +16,8 @@ import random import string +from 
judgeval.scorers.prompt_scorer import ClassifierScorer + load_dotenv() def get_client(): @@ -35,36 +37,32 @@ def test_dataset(client: JudgmentClient): print(dataset) def test_run_eval(client: JudgmentClient): + # Single step in our workflow, an outreach Sales Agent example1 = Example( - input="What if these shoes don't fit?", - actual_output="We offer a 30-day full refund at no extra cost.", - retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."], - trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6" + input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.", + actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex", + retrieval_context=["TechCorp launched AI analytics platform in 2024", "Sarah Chen is CEO, ex-Google executive", "Current client base: 50+ enterprise customers"], ) example2 = Example( - input="How do I reset my password?", - actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", - expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", - name="Password Reset", - context=["User Account"], - retrieval_context=["Password reset instructions"], - tools_called=["authentication"], - expected_tools=["authentication"], - additional_metadata={"difficulty": "medium"} + input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.", + actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! 
Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex", + expected_output="A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans", + context=["Business Development"], + retrieval_context=["GreenEnergy Solutions won 2023 sustainability award", "New solar technology 30% more efficient", "Planning European market expansion"], ) scorer = FaithfulnessScorer(threshold=0.5) scorer2 = HallucinationScorer(threshold=0.5) c_scorer = CustomFaithfulnessMetric(threshold=0.6) - PROJECT_NAME = "test_project_JOSEPH" - EVAL_RUN_NAME = "yomadude" + PROJECT_NAME = "OutreachWorkflow" + EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt" - _ = client.run_evaluation( + client.run_evaluation( examples=[example1, example2], - scorers=[scorer, c_scorer], + scorers=[scorer, scorer2], model="QWEN", metadata={"batch": "test"}, project_name=PROJECT_NAME, @@ -146,8 +144,6 @@ def test_override_eval(client: JudgmentClient): if "already exists" not in str(e): raise print(f"Successfully caught expected error: {e}") - - def test_evaluate_dataset(client: JudgmentClient): @@ -194,8 +190,10 @@ def test_classifier_scorer(client: JudgmentClient): examples=[example1], scorers=[classifier_scorer, faithfulness_scorer], model="QWEN", + log_results=True, + eval_run_name="ToneScorerTest", + project_name="ToneScorerTest", ) - print(res) if __name__ == "__main__": # Test client functionality @@ -204,29 +202,29 @@ def test_classifier_scorer(client: JudgmentClient): print("Client initialized successfully") print("*" * 40) - print("Testing dataset creation, pushing, and pulling") - test_dataset(ui_client) - print("Dataset creation, pushing, and pulling successful") - print("*" * 40) + # print("Testing dataset creation, pushing, and pulling") + # test_dataset(ui_client) + # print("Dataset creation, pushing, and pulling successful") + # print("*" * 40) print("Testing evaluation run") test_run_eval(ui_client) print("Evaluation run successful") print("*" * 40) - print("Testing evaluation run override") - test_override_eval(client) - print("Evaluation run override successful") - print("*" * 40) + # print("Testing evaluation run override") + # test_override_eval(client) + # print("Evaluation run override successful") + # print("*" * 40) - print("Testing dataset evaluation") - test_evaluate_dataset(ui_client) - print("Dataset evaluation successful") - print("*" * 40) + # print("Testing dataset evaluation") + # test_evaluate_dataset(ui_client) + # print("Dataset evaluation successful") + # print("*" * 40) - print("Testing classifier scorer") - test_classifier_scorer(ui_client) - print("Classifier scorer test successful") - print("*" * 40) + # print("Testing classifier scorer") + # test_classifier_scorer(ui_client) + # print("Classifier scorer test successful") + # print("*" * 40) print("All tests passed successfully") diff --git a/e2etests/test_prompt_scoring.py b/e2etests/test_prompt_scoring.py index 51f8c9c3..ac535d76 100644 --- a/e2etests/test_prompt_scoring.py +++ b/e2etests/test_prompt_scoring.py @@ -36,7 +36,7 @@ def __init__( ) self.score = 0.0 - def build_measure_prompt(self, example: Example): + def _build_measure_prompt(self, example: Example): SYSTEM_ROLE = ( 'You are a great judge of emotional intelligence. You understand the feelings ' 'and intentions of others. 
You will be tasked with judging whether the following ' @@ -51,16 +51,16 @@ def build_measure_prompt(self, example: Example): ] return conversation - def build_schema(self): + def _build_schema(self): return { "score": int, "reason": str } - def process_response(self, response): + def _process_response(self, response): return response["score"], response["reason"] - def success_check(self): + def _success_check(self): POSITIVITY_THRESHOLD = 3 # we want all model responses to be somewhat positive in tone return self.score <= POSITIVITY_THRESHOLD diff --git a/e2etests/test_tracer.py b/e2etests/test_tracer.py index 2f97280c..d2262377 100644 --- a/e2etests/test_tracer.py +++ b/e2etests/test_tracer.py @@ -14,11 +14,11 @@ from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer # Initialize the tracer and clients -judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY")) +judgment = Tracer(api_key=os.getenv("UI_JUDGMENT_API_KEY")) openai_client = wrap(OpenAI()) anthropic_client = wrap(Anthropic()) -@judgment.observe +@judgment.observe(span_type="tool") async def make_upper(input: str) -> str: """Convert input to uppercase and evaluate using judgment API. @@ -28,6 +28,7 @@ async def make_upper(input: str) -> str: The uppercase version of the input string """ output = input.upper() + await judgment.get_current_trace().async_evaluate( scorers=[FaithfulnessScorer(threshold=0.5)], input="What if these shoes don't fit?", @@ -38,9 +39,10 @@ async def make_upper(input: str) -> str: model="gpt-4o-mini", log_results=True ) + return output -@judgment.observe +@judgment.observe(span_type="tool") async def make_lower(input): output = input.lower() @@ -59,11 +61,12 @@ async def make_lower(input): ) return output -@judgment.observe +@judgment.observe(span_type="llm") def llm_call(input): + time.sleep(1.3) return "We have a 30 day full refund policy on shoes." -@judgment.observe +@judgment.observe(span_type="tool") async def answer_user_question(input): output = llm_call(input) await judgment.get_current_trace().async_evaluate( @@ -77,7 +80,7 @@ async def answer_user_question(input): ) return output -@judgment.observe +@judgment.observe(span_type="tool") async def make_poem(input: str) -> str: """Generate a poem using both Anthropic and OpenAI APIs. 
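Taken together, the e2etests/test_tracer.py changes reduce to the pattern sketched below: decorate a function with a typed span, attach an online evaluation to the current trace from inside it, and run everything in a named, project-scoped trace. This sketch is not part of the patch; the import path for Tracer, the exact keyword set of async_evaluate, and all string values are assumptions drawn from the calls visible in this file.

import asyncio
import os

from judgeval.common.tracer import Tracer  # import path assumed from this repo's layout
from judgeval.scorers import FaithfulnessScorer

judgment = Tracer(api_key=os.getenv("UI_JUDGMENT_API_KEY"))

@judgment.observe(span_type="tool")
async def answer_question(question: str) -> str:
    output = "We have a 30 day full refund policy on shoes."  # stand-in for a real LLM call
    # Attach an online evaluation to the span opened by @observe
    await judgment.get_current_trace().async_evaluate(
        scorers=[FaithfulnessScorer(threshold=0.5)],
        input=question,
        actual_output=output,
        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
        model="gpt-4o-mini",
        log_results=True,
    )
    return output

async def main():
    # Traces are now grouped under a project and can overwrite a same-named trace
    with judgment.trace("refund-demo", project_name="NewPoemBot", overwrite=True):
        await answer_question("What if these shoes don't fit?")

if __name__ == "__main__":
    asyncio.run(main())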
@@ -95,6 +98,15 @@ async def make_poem(input: str) -> str: ) anthropic_result = anthropic_response.content[0].text + await judgment.get_current_trace().async_evaluate( + input=input, + actual_output=anthropic_result, + score_type=APIScorer.ANSWER_RELEVANCY, + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + # Using OpenAI API openai_response = openai_client.chat.completions.create( model="gpt-4o-mini", @@ -112,7 +124,8 @@ async def make_poem(input: str) -> str: return "" async def test_evaluation_mixed(input): - with judgment.trace("test_evaluation") as trace: + PROJECT_NAME = "NewPoemBot" + with judgment.trace("Use-claude", project_name=PROJECT_NAME, overwrite=True) as trace: upper = await make_upper(input) result = await make_poem(upper) await answer_user_question("What if these shoes don't fit?") diff --git a/judgeval/common/tracer.py b/judgeval/common/tracer.py index bc6bf071..34b135cd 100644 --- a/judgeval/common/tracer.py +++ b/judgeval/common/tracer.py @@ -28,6 +28,7 @@ import json import warnings from pydantic import BaseModel +from http import HTTPStatus from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL from judgeval.judgment_client import JudgmentClient @@ -38,7 +39,7 @@ # Define type aliases for better code readability and maintainability ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types - +SpanType = Literal['span', 'tool', 'llm', 'evaluation'] @dataclass class TraceEntry: """Represents a single trace entry with its visual representation. @@ -58,7 +59,8 @@ class TraceEntry: duration: Optional[float] = None # Time taken (for exit/evaluation entries) output: Any = None # Function output value # Use field() for mutable defaults to avoid shared state issues - inputs: dict = field(default_factory=dict) + inputs: dict = field(default_factory=dict) + span_type: SpanType = "span" evaluation_result: Optional[List[ScoringResult]] = field(default=None) def print_entry(self): @@ -93,7 +95,8 @@ def to_dict(self) -> dict: "duration": self.duration, "output": output, "inputs": self.inputs or None, # Convert empty dict to None - "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None + "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None, + "span_type": self.span_type } def _serialize_output(self) -> Any: @@ -112,17 +115,19 @@ def _serialize_output(self) -> Any: class TraceClient: """Client for managing a single trace context""" - def __init__(self, tracer, trace_id: str, name: str): + def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project"): self.tracer = tracer self.trace_id = trace_id self.name = name + self.project_name = project_name self.client: JudgmentClient = tracer.client self.entries: List[TraceEntry] = [] self.start_time = time.time() - self._current_span = None + self.span_type = None + self._current_span: Optional[TraceEntry] = None @contextmanager - def span(self, name: str): + def span(self, name: str, span_type: SpanType = "span"): """Context manager for creating a trace span""" start_time = time.time() @@ -132,7 +137,8 @@ def span(self, name: str): function=name, depth=self.tracer.depth, message=name, - timestamp=start_time + timestamp=start_time, + span_type=span_type )) self.tracer.depth += 1 @@ -152,7 +158,8 @@ def span(self, name: str): depth=self.tracer.depth, 
message=f"← {name}", timestamp=time.time(), - duration=duration + duration=duration, + span_type=span_type )) self._current_span = prev_span @@ -199,6 +206,7 @@ def record_evaluation(self, results: List[ScoringResult], start_time: float): """Record evaluation results for the current span""" if self._current_span: duration = time.time() - start_time # Calculate duration from start_time + self.add_entry(TraceEntry( type="evaluation", function=self._current_span, @@ -206,7 +214,8 @@ def record_evaluation(self, results: List[ScoringResult], start_time: float): message=f"Evaluation results for {self._current_span}", timestamp=time.time(), evaluation_result=results, - duration=duration + duration=duration, + span_type="evaluation" )) def record_input(self, inputs: dict): @@ -218,7 +227,8 @@ def record_input(self, inputs: dict): depth=self.tracer.depth, message=f"Inputs to {self._current_span}", timestamp=time.time(), - inputs=inputs + inputs=inputs, + span_type=self.span_type )) async def _update_coroutine_output(self, entry: TraceEntry, coroutine: Any): @@ -240,7 +250,8 @@ def record_output(self, output: Any): depth=self.tracer.depth, message=f"Output from {self._current_span}", timestamp=time.time(), - output="" if inspect.iscoroutine(output) else output + output="" if inspect.iscoroutine(output) else output, + span_type=self.span_type ) self.add_entry(entry) @@ -266,45 +277,39 @@ def get_duration(self) -> float: def condense_trace(self, entries: List[dict]) -> List[dict]: """ - Condenses trace entries into a single entry for each function. - - Groups entries by function call and combines them into a single entry with: - - depth: deepest depth for this function call - - duration: time from first to last timestamp - - function: function name - - inputs: non-None inputs - - output: non-None outputs - - evaluation_result: evaluation results - - timestamp: first timestamp of the function call + Condenses trace entries into a single entry for each function call. 
""" condensed = [] - current_func = None - current_entry = None + active_functions = [] # Stack to track nested function calls + function_entries = {} # Store entries for each function for entry in entries: + function = entry["function"] + if entry["type"] == "enter": - # Start of new function call - current_func = entry["function"] - current_entry = { + # Initialize new function entry + function_entries[function] = { "depth": entry["depth"], - "function": entry["function"], + "function": function, "timestamp": entry["timestamp"], "inputs": None, "output": None, - "evaluation_result": None + "evaluation_result": None, + "span_type": entry.get("span_type", "span") } - - elif entry["type"] == "exit" and entry["function"] == current_func: - # End of current function + active_functions.append(function) + + elif entry["type"] == "exit" and function in active_functions: + # Complete function entry + current_entry = function_entries[function] current_entry["duration"] = entry["timestamp"] - current_entry["timestamp"] condensed.append(current_entry) - current_func = None - current_entry = None - - elif current_func and entry["function"] == current_func: - # Additional entries for current function - if entry["depth"] > current_entry["depth"]: - current_entry["depth"] = entry["depth"] + active_functions.remove(function) + del function_entries[function] + + elif function in active_functions: + # Update existing function entry with additional data + current_entry = function_entries[function] if entry["type"] == "input" and entry["inputs"]: current_entry["inputs"] = entry["inputs"] @@ -315,9 +320,11 @@ def condense_trace(self, entries: List[dict]) -> List[dict]: if entry["type"] == "evaluation" and entry["evaluation_result"]: current_entry["evaluation_result"] = entry["evaluation_result"] + # Sort by timestamp + condensed.sort(key=lambda x: x["timestamp"]) return condensed - def save(self) -> Tuple[str, dict]: + def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]: """ Save the current trace to the database. Returns a tuple of (trace_id, trace_data) where trace_data is the trace data that was saved. 
@@ -333,6 +340,7 @@ def save(self) -> Tuple[str, dict]: "trace_id": self.trace_id, "api_key": self.tracer.api_key, "name": self.name, + "project_name": self.project_name, "created_at": datetime.fromtimestamp(self.start_time).isoformat(), "duration": total_duration, "token_counts": { @@ -340,7 +348,9 @@ def save(self) -> Tuple[str, dict]: "completion_tokens": 0, # Dummy value "total_tokens": 0, # Dummy value }, # TODO: Add token counts - "entries": condensed_entries + "entries": condensed_entries, + "empty_save": empty_save, + "overwrite": overwrite } # Save trace data by making POST request to API @@ -351,7 +361,11 @@ def save(self) -> Tuple[str, dict]: "Content-Type": "application/json", } ) - response.raise_for_status() + + if response.status_code == HTTPStatus.BAD_REQUEST: + raise ValueError(f"Failed to save trace data: check your trace name for conflicts, or set overwrite=True to overwrite the existing trace: {response.text}") + elif response.status_code != HTTPStatus.OK: + raise ValueError(f"Failed to save trace data: {response.text}") return self.trace_id, trace_data @@ -369,17 +383,17 @@ def __init__(self, api_key: str): if not api_key: raise ValueError("Tracer must be configured with a Judgment API key") - self.api_key = api_key - self.client = JudgmentClient(judgment_api_key=api_key) - self.depth = 0 - self._current_trace: Optional[TraceClient] = None - self.initialized = True + self.api_key: str = api_key + self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key) + self.depth: int = 0 + self._current_trace: Optional[TraceClient] = None + self.initialized: bool = True @contextmanager - def trace(self, name: str = None) -> Generator[TraceClient, None, None]: + def trace(self, name: str, project_name: str = "default_project", overwrite: bool = False) -> Generator[TraceClient, None, None]: """Start a new trace context using a context manager""" trace_id = str(uuid.uuid4()) - trace = TraceClient(self, trace_id, name or "unnamed_trace") + trace = TraceClient(self, trace_id, name, project_name=project_name) prev_trace = self._current_trace self._current_trace = trace @@ -387,7 +401,7 @@ def trace(self, name: str = None) -> Generator[TraceClient, None, None]: with trace.span(name or "unnamed_trace") as span: try: # Save the trace to the database to handle Evaluations' trace_id referential integrity - trace.save() + trace.save(empty_save=True, overwrite=overwrite) yield trace finally: self._current_trace = prev_trace @@ -398,16 +412,17 @@ def get_current_trace(self) -> Optional[TraceClient]: """ return self._current_trace - def observe(self, func=None, *, name=None): + def observe(self, func=None, *, name=None, span_type: SpanType = "span"): """ Decorator to trace function execution with detailed entry/exit information.
Args: func: The function to trace name: Optional custom name for the function + span_type: The type of span to use for this observation (default: "span") """ if func is None: - return lambda f: self.observe(f, name=name) + return lambda f: self.observe(f, name=name, span_type=span_type) if asyncio.iscoroutinefunction(func): @functools.wraps(func) @@ -415,7 +430,10 @@ async def async_wrapper(*args, **kwargs): if self._current_trace: span_name = name or func.__name__ - with self._current_trace.span(span_name) as span: + with self._current_trace.span(span_name, span_type=span_type) as span: + # Set the span type + span.span_type = span_type + # Record inputs span.record_input({ 'args': list(args), @@ -438,7 +456,10 @@ def wrapper(*args, **kwargs): if self._current_trace: span_name = name or func.__name__ - with self._current_trace.span(span_name) as span: + with self._current_trace.span(span_name, span_type=span_type) as span: + # Set the span type + span.span_type = span_type + # Record inputs span.record_input({ 'args': list(args), @@ -471,7 +492,7 @@ def traced_create(*args, **kwargs): if not (tracer and tracer._current_trace): return original_create(*args, **kwargs) - with tracer._current_trace.span(span_name) as span: + with tracer._current_trace.span(span_name, span_type="llm") as span: # Format and record the input parameters input_data = _format_input_data(client, **kwargs) span.record_input(input_data) diff --git a/judgeval/data/result.py b/judgeval/data/result.py index 9b9f4c1d..dc24c670 100644 --- a/judgeval/data/result.py +++ b/judgeval/data/result.py @@ -7,6 +7,7 @@ class ScoringResult: """ A ScoringResult contains the output of one or more scorers applied to a single example. + Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...) Args: success (bool): Whether the evaluation was successful. 
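The next hunk adds two fields to ScoringResult. As a consumer-side sketch (not part of the patch, with `res` standing in for a hypothetical ScoringResult instance returned by an eval run), the practical effect is:

payload = res.to_dict()
payload["example_id"]   # newly serialized: links the result back to the Example it scored
res.eval_run_name       # kept on the object, but not included in to_dict() as of this patch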
@@ -32,6 +33,9 @@ class ScoringResult: retrieval_context: Optional[List[str]] = None trace_id: Optional[str] = None + example_id: Optional[str] = None + eval_run_name: Optional[str] = None + def to_dict(self) -> dict: """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data.""" return { @@ -42,7 +46,8 @@ class ScoringResult: "expected_output": self.expected_output, "context": self.context, "retrieval_context": self.retrieval_context, - "trace_id": self.trace_id + "trace_id": self.trace_id, + "example_id": self.example_id } def __str__(self) -> str: diff --git a/judgeval/data/scorer_data.py b/judgeval/data/scorer_data.py index 787bc6c4..85272f7f 100644 --- a/judgeval/data/scorer_data.py +++ b/judgeval/data/scorer_data.py @@ -76,7 +76,7 @@ def create_scorer_data(scorer: CustomScorer) -> ScorerData: score=scorer.score, threshold=scorer.threshold, reason=scorer.reason, - success=scorer.success_check(), + success=scorer._success_check(), strict_mode=scorer.strict_mode, evaluation_model=scorer.evaluation_model, error=None, diff --git a/judgeval/judgment_client.py b/judgeval/judgment_client.py index fe4636a1..d8190ceb 100644 --- a/judgeval/judgment_client.py +++ b/judgeval/judgment_client.py @@ -129,20 +129,34 @@ def pull_dataset(self, alias: str) -> EvalDataset: return dataset # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend - def pull_eval(self, project_name: str, eval_run_name: str) -> List[ScoringResult]: + def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]: + """Pull evaluation results from the server. + + Args: + project_name (str): Name of the project + eval_run_name (str): Name of the evaluation run + + Returns: + List[Dict[str, Union[str, List[ScoringResult]]]]: A list of dictionaries, each containing: + - id (str): The evaluation run ID + - results (List[ScoringResult]): List of scoring results + """ eval_run_request_body = EvalRunRequestBody(project_name=project_name, eval_name=eval_run_name, judgment_api_key=self.judgment_api_key) - eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL, + eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL, json=eval_run_request_body.model_dump()) if eval_run.status_code != requests.codes.ok: raise ValueError(f"Error fetching eval results: {eval_run.json()}") - eval_results = [] + + eval_run_result = [] for result in eval_run.json(): - result = result.get("result", dict()) - filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__} - eval_results.append(ScoringResult(**filtered_result)) - return eval_results + result_id = result.get("id", "") + result_data = result.get("result", dict()) + filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__} + eval_run_result.append({"id": result_id, + "results": [ScoringResult(**filtered_result)]}) + return eval_run_result def _validate_api_key(self): """ @@ -191,3 +205,37 @@ def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer: return ClassifierScorer(**scorer_config) except Exception as e: raise JudgmentAPIError(f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}") + + def push_classifier_scorer(self, scorer: ClassifierScorer, slug: str = None) -> str: + """ + Pushes a classifier scorer configuration to the Judgment API. + + Args: + slug (str): Slug identifier for the scorer. If it exists, the scorer will be updated.
+ scorer (ClassifierScorer): The classifier scorer to save + + Returns: + str: The slug identifier of the saved scorer + + Raises: + JudgmentAPIError: If there's an error saving the scorer + """ + request_body = { + "name": scorer.name, + "conversation": [m.model_dump() for m in scorer.conversation], + "options": scorer.options, + "judgment_api_key": self.judgment_api_key, + "slug": slug + } + + response = requests.post( + f"{ROOT_API}/save_scorer/", + json=request_body + ) + + if response.status_code == 500: + raise JudgmentAPIError(f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}") + elif response.status_code != 200: + raise JudgmentAPIError(f"Failed to save classifier scorer: {response.json().get('detail', '')}") + + return response.json()["slug"] \ No newline at end of file diff --git a/judgeval/playground.py b/judgeval/playground.py index 19db5809..907ad24e 100644 --- a/judgeval/playground.py +++ b/judgeval/playground.py @@ -566,7 +566,7 @@ def _calculate_score(self) -> float: score = faithfulness_count / number_of_verdicts return 0 if self.strict_mode and score < self.threshold else score - def success_check(self) -> bool: + def _success_check(self) -> bool: if self.error is not None: self.success = False else: diff --git a/judgeval/scorers/custom_scorer.py b/judgeval/scorers/custom_scorer.py index 75816e7d..d21e47ee 100644 --- a/judgeval/scorers/custom_scorer.py +++ b/judgeval/scorers/custom_scorer.py @@ -101,7 +101,7 @@ async def a_score_example(self, example, *args, **kwargs) -> float: raise NotImplementedError("You must implement the `a_score` method in your custom scorer") @abstractmethod - def success_check(self) -> bool: + def _success_check(self) -> bool: """ For unit testing, determines whether the test case passes or fails """ diff --git a/judgeval/scorers/prompt_scorer.py b/judgeval/scorers/prompt_scorer.py index b1829afe..fb996a96 100644 --- a/judgeval/scorers/prompt_scorer.py +++ b/judgeval/scorers/prompt_scorer.py @@ -49,8 +49,8 @@ class PromptScorer(CustomScorer, BaseModel): using_native_model: bool = Field(default=True) # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD - response: Optional[dict] = None - result: Optional[float] = None + _response: Optional[dict] = None + _result: Optional[float] = None def __init__( self, @@ -100,11 +100,11 @@ def score_example( else: result, reason = self.evaluate(example) self.reason = reason - self.result = result + self._result = result self.verbose_logs = create_verbose_logs( self, steps=[ - f"Results: {self.result}\nReason: {self.reason}", + f"Results: {self._result}\nReason: {self.reason}", ], ) return result @@ -120,11 +120,11 @@ async def a_score_example( with scorer_progress_meter(self, display_meter=_show_indicator): result, reason = await self.a_evaluate(example) self.reason = reason - self.result = result + self._result = result self.verbose_logs = create_verbose_logs( self, steps=[ - f"Results: {self.result}\nReason: {self.reason}", + f"Results: {self._result}\nReason: {self.reason}", ], ) return result @@ -138,11 +138,11 @@ def evaluate(self, example: Example) -> Tuple[Any, str]: NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field. 
""" - prompt = self.build_measure_prompt(example) + prompt = self._build_measure_prompt(example) if self.using_native_model: res = self.model.generate(prompt) response = parse_response_json(res, self) - result, reason = self.process_response(response) + result, reason = self._process_response(response) return result, reason else: raise NotImplementedError("Non-native judge models are not supported in synchronous mode yet.") @@ -156,25 +156,25 @@ async def a_evaluate(self, example: Example) -> Tuple[Any, str]: NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field. """ - judge_prompt = self.build_measure_prompt(example) - schema = self.build_schema() - prompt = self.enforce_prompt_format(judge_prompt=judge_prompt, schema=schema) + judge_prompt = self._build_measure_prompt(example) + schema = self._build_schema() + prompt = self._enforce_prompt_format(judge_prompt=judge_prompt, schema=schema) if self.using_native_model: res = await self.model.a_generate(prompt) response = parse_response_json(res, self) - self.response = response + self._response = response - result, reason = self.process_response(response) + result, reason = self._process_response(response) self.score = result self.reason = reason - self.response = response + self._response = response return result, reason else: raise NotImplementedError("Non-native judge models are not supported in async mode yet.") # TODO: can we make this take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args @abstractmethod - def build_measure_prompt(self, example: Example) -> List[dict]: + def _build_measure_prompt(self, example: Example) -> List[dict]: # builds the prompt that is sent to the model inside of the `score_example()` method # returns either a string prompt or a conversation prompt of the form [{"role": "system", "content": "..."}, ...] @@ -197,7 +197,7 @@ def build_measure_prompt(self, example: Example) -> List[dict]: # TODO: does this need to take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args @abstractmethod - def build_schema(self) -> dict: + def _build_schema(self) -> dict: """ This function returns a dictionary that represents the schema of the JSON response that the judge model should return. @@ -208,7 +208,7 @@ def build_schema(self) -> dict: """ pass - def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict): + def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict): """ Formats the final prompt to the judge model. @@ -248,7 +248,7 @@ def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict): raise TypeError(f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead.") @abstractmethod - def process_response(self, response: dict): + def _process_response(self, response: dict): """ Customizable method for processing the response from the judge model. @@ -264,7 +264,7 @@ def process_response(self, response: dict): pass @abstractmethod - def success_check(self, **kwargs) -> bool: + def _success_check(self, **kwargs) -> bool: """ Determines whether or not the PromptScorer should consider the evaluation of a single example successful. 
""" @@ -320,7 +320,16 @@ def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapp verbose_mode=verbose_mode, ) - def build_measure_prompt(self, example: Example) -> List[dict]: + def _build_measure_prompt(self, example: Example) -> List[dict]: + """ + Builds the measure prompt for the classifier scorer. + + Args: + example (Example): The example to build the prompt for + + Returns: + List[dict]: The measure prompt for the classifier scorer + """ replacement_words = { "{{actual_output}}": example.actual_output, "{{expected_output}}": example.expected_output, @@ -341,10 +350,10 @@ def build_measure_prompt(self, example: Example) -> List[dict]: message["content"] = content.replace(key, str(value)) return conversation_copy - def build_schema(self) -> dict: + def _build_schema(self) -> dict: return self.options - def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]: + def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]: """ Enforces the judge model to choose an option from the schema. @@ -369,15 +378,45 @@ def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[ judge_prompt[0]["content"] = system_role return judge_prompt - def process_response(self, response: dict) -> Tuple[float, str]: + def _process_response(self, response: dict) -> Tuple[float, str]: choice = response.get("choice") if choice not in self.options: raise ValueError(f"Invalid choice: {choice}. Expected one of: {self.options.keys()}") reason = response.get("reason", "No reason could be found in model response.") return self.options[choice], reason - def success_check(self, **kwargs) -> bool: + def _success_check(self, **kwargs) -> bool: return self.score >= self.threshold + + def update_name(self, name: str): + """ + Updates the name of the scorer. + """ + self.name = name + + def update_threshold(self, threshold: float): + """ + Updates the threshold of the scorer. + """ + self.threshold = threshold + + def update_conversation(self, conversation: List[dict]): + """ + Updates the conversation with the new conversation. + + Sample conversation: + [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}] + """ + self.conversation = conversation + + def update_options(self, options: Mapping[str, float]): + """ + Updates the options with the new options. 
+ + Sample options: + {"yes": 1, "no": 0} + """ + self.options = options def __str__(self): return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})" diff --git a/tests/common/test_tracer.py b/tests/common/test_tracer.py index cb6abd61..edc58197 100644 --- a/tests/common/test_tracer.py +++ b/tests/common/test_tracer.py @@ -149,17 +149,21 @@ def test_record_input_output(trace_client): def test_condense_trace(trace_client): """Test trace condensing functionality""" + # Store the base depth from the enter event + base_depth = 0 entries = [ - {"type": "enter", "function": "test_func", "depth": 0, "timestamp": 1.0}, - {"type": "input", "function": "test_func", "depth": 1, "timestamp": 1.1, "inputs": {"x": 1}}, - {"type": "output", "function": "test_func", "depth": 1, "timestamp": 1.2, "output": "result"}, - {"type": "exit", "function": "test_func", "depth": 0, "timestamp": 2.0}, + {"type": "enter", "function": "test_func", "depth": base_depth, "timestamp": 1.0}, + {"type": "input", "function": "test_func", "depth": base_depth + 1, "timestamp": 1.1, "inputs": {"x": 1}}, + {"type": "output", "function": "test_func", "depth": base_depth + 1, "timestamp": 1.2, "output": "result"}, + {"type": "exit", "function": "test_func", "depth": base_depth, "timestamp": 2.0}, ] condensed = trace_client.condense_trace(entries) + print(f"{condensed=}") + # Test that the condensed entry's depth matches the enter event's depth assert len(condensed) == 1 assert condensed[0]["function"] == "test_func" - assert condensed[0]["depth"] == 1 + assert condensed[0]["depth"] == entries[0]["depth"] # Should match the enter event's depth assert condensed[0]["inputs"] == {"x": 1} assert condensed[0]["output"] == "result" assert condensed[0]["duration"] == 1.0 @@ -167,50 +171,35 @@ def test_condense_trace(trace_client): @patch('requests.post') def test_save_trace(mock_post, trace_client): """Test saving trace data""" - mock_post.return_value.raise_for_status = Mock() + # Configure mock response properly + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = '{"message": "success"}' + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response with trace_client.span("test_span"): trace_client.record_input({"arg": 1}) trace_client.record_output("result") trace_id, data = trace_client.save() - assert mock_post.called assert data["trace_id"] == trace_client.trace_id - assert data["name"] == "test_trace" - assert len(data["entries"]) > 0 - assert isinstance(data["created_at"], str) - assert isinstance(data["duration"], float) - -def test_observe_decorator(tracer): - """Test the @tracer.observe decorator""" - @tracer.observe - def test_function(x, y): - return x + y - - with tracer.trace("test_trace"): - result = test_function(1, 2) - - assert result == 3 - -def test_observe_decorator_with_error(tracer): - """Test decorator error handling""" - @tracer.observe - def failing_function(): - raise ValueError("Test error") - - with tracer.trace("test_trace"): - with pytest.raises(ValueError): - failing_function() @patch('requests.post') def test_wrap_openai(mock_post, tracer): """Test wrapping OpenAI client""" + # Configure mock response properly + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = '{"message": "success"}' + mock_post.return_value = mock_response + client = OpenAI() - mock_response = MagicMock() - mock_response.choices = 
[MagicMock(message=MagicMock(content="test response"))] - mock_response.usage = MagicMock(prompt_tokens=10, completion_tokens=20, total_tokens=30) - client.chat.completions.create = MagicMock(return_value=mock_response) + mock_completion = MagicMock() + mock_completion.choices = [MagicMock(message=MagicMock(content="test response"))] + mock_completion.usage = MagicMock(prompt_tokens=10, completion_tokens=20, total_tokens=30) + client.chat.completions.create = MagicMock(return_value=mock_completion) wrapped_client = wrap(client) @@ -220,16 +209,22 @@ def test_wrap_openai(mock_post, tracer): messages=[{"role": "user", "content": "test"}] ) - assert response == mock_response + assert response == mock_completion @patch('requests.post') def test_wrap_anthropic(mock_post, tracer): """Test wrapping Anthropic client""" + # Configure mock response properly + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = '{"message": "success"}' + mock_post.return_value = mock_response + client = Anthropic() - mock_response = MagicMock() - mock_response.content = [MagicMock(text="test response")] - mock_response.usage = MagicMock(input_tokens=10, output_tokens=20) - client.messages.create = MagicMock(return_value=mock_response) + mock_completion = MagicMock() + mock_completion.content = [MagicMock(text="test response")] + mock_completion.usage = MagicMock(input_tokens=10, output_tokens=20) + client.messages.create = MagicMock(return_value=mock_completion) wrapped_client = wrap(client) @@ -239,7 +234,7 @@ def test_wrap_anthropic(mock_post, tracer): messages=[{"role": "user", "content": "test"}] ) - assert response == mock_response + assert response == mock_completion def test_wrap_unsupported_client(tracer): """Test wrapping unsupported client type""" @@ -266,3 +261,24 @@ def test_tracer_invalid_api_key(mocker): with pytest.raises(JudgmentAPIError, match="Issue with passed in Judgment API key: API key is invalid"): Tracer(api_key="invalid_key") + +def test_observe_decorator(tracer): + """Test the @tracer.observe decorator""" + @tracer.observe + def test_function(x, y): + return x + y + + with tracer.trace("test_trace"): + result = test_function(1, 2) + + assert result == 3 + +def test_observe_decorator_with_error(tracer): + """Test decorator error handling""" + @tracer.observe + def failing_function(): + raise ValueError("Test error") + + with tracer.trace("test_trace"): + with pytest.raises(ValueError): + failing_function() diff --git a/tests/data/test_scorer_data.py b/tests/data/test_scorer_data.py index 1f1e7829..a9ea1dc9 100644 --- a/tests/data/test_scorer_data.py +++ b/tests/data/test_scorer_data.py @@ -44,7 +44,7 @@ def score_example(self, example, *args, **kwargs): async def a_score_example(self, example, *args, **kwargs): pass - def success_check(self) -> bool: + def _success_check(self) -> bool: return self.score >= self.threshold if self.score is not None else False diff --git a/tests/scorers/test_custom_scorer.py b/tests/scorers/test_custom_scorer.py index c01b12a9..6cf4e7ef 100644 --- a/tests/scorers/test_custom_scorer.py +++ b/tests/scorers/test_custom_scorer.py @@ -29,7 +29,7 @@ def score_example(self, example, *args, **kwargs) -> float: async def a_score_example(self, example, *args, **kwargs) -> float: return 0.9 - def success_check(self) -> bool: + def _success_check(self) -> bool: return self.score >= self.threshold if self.score is not None else False @pytest.fixture @@ -118,15 +118,15 @@ def test_success_check_implementation(self, basic_scorer): """Test 
success_check with various scores""" # Test with score above threshold basic_scorer.score = 0.8 - assert basic_scorer.success_check() is True + assert basic_scorer._success_check() is True # Test with score below threshold basic_scorer.score = 0.6 - assert basic_scorer.success_check() is False + assert basic_scorer._success_check() is False # Test with no score basic_scorer.score = None - assert basic_scorer.success_check() is False + assert basic_scorer._success_check() is False def test_str_representation(self, basic_scorer): """Test string representation of scorer""" @@ -149,4 +149,4 @@ class IncompleteScorer(CustomScorer): asyncio.run(scorer.a_score_example({})) with pytest.raises(NotImplementedError): - scorer.success_check() + scorer._success_check() diff --git a/tests/scorers/test_prompt_scorer.py b/tests/scorers/test_prompt_scorer.py index e5e7e9ed..7c50e195 100644 --- a/tests/scorers/test_prompt_scorer.py +++ b/tests/scorers/test_prompt_scorer.py @@ -35,20 +35,20 @@ def __init__(self, mock_model, *args, **kwargs): super().__init__(*args, **kwargs) self.model = mock_model - def build_measure_prompt(self, example: Example) -> List[dict]: + def _build_measure_prompt(self, example: Example) -> List[dict]: return [ {"role": "system", "content": "Test system prompt"}, {"role": "user", "content": f"Response: {example.actual_output}"} ] - def build_schema(self) -> dict: + def _build_schema(self) -> dict: return {"score": float, "reason": str} - def process_response(self, response: dict): + def _process_response(self, response: dict): return response["score"], response["reason"] - def success_check(self, **kwargs) -> bool: - return self.result >= self.threshold + def _success_check(self, **kwargs) -> bool: + return self._result >= self.threshold # Tests for PromptScorer class TestPromptScorer: @@ -68,7 +68,7 @@ def test_enforce_prompt_format(self, mock_model): prompt = [{"role": "system", "content": "Base prompt"}] schema = {"score": float, "reason": str} - formatted = scorer.enforce_prompt_format(prompt, schema) + formatted = scorer._enforce_prompt_format(prompt, schema) assert "JSON format" in formatted[0]["content"] assert '"score": (float)' in formatted[0]["content"] assert '"reason": (str)' in formatted[0]["content"] @@ -76,7 +76,7 @@ def test_enforce_prompt_format(self, mock_model): def test_enforce_prompt_format_invalid_input(self, mock_model): scorer = SampleScorer(name="test_scorer", mock_model=mock_model) with pytest.raises(TypeError): - scorer.enforce_prompt_format("invalid", {}) + scorer._enforce_prompt_format("invalid", {}) @pytest.mark.asyncio async def test_a_score_example(self, example, mock_model): @@ -124,7 +124,7 @@ def test_build_measure_prompt(self, example, classifier_conversation, classifier options=classifier_options ) - prompt = scorer.build_measure_prompt(example) + prompt = scorer._build_measure_prompt(example) assert "This is a test response" in prompt[0]["content"] def test_process_response(self, classifier_conversation, classifier_options): @@ -136,7 +136,7 @@ def test_process_response(self, classifier_conversation, classifier_options): ) response = {"choice": "positive", "reason": "Test reason"} - score, reason = scorer.process_response(response) + score, reason = scorer._process_response(response) assert score == 1.0 assert reason == "Test reason" @@ -150,7 +150,7 @@ def test_process_response_invalid_choice(self, classifier_conversation, classifi response = {"choice": "invalid", "reason": "Test reason"} with pytest.raises(ValueError): - 
scorer.process_response(response) + scorer._process_response(response) def test_success_check(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( @@ -161,7 +161,7 @@ def test_success_check(self, classifier_conversation, classifier_options): ) scorer.score = 1.0 - assert scorer.success_check() is True + assert scorer._success_check() is True scorer.score = 0.0 - assert scorer.success_check() is False + assert scorer._success_check() is False diff --git a/tests/scorers/test_score.py b/tests/scorers/test_score.py index 08354fd9..500412e2 100644 --- a/tests/scorers/test_score.py +++ b/tests/scorers/test_score.py @@ -20,7 +20,7 @@ def score_example(self, example, *args, **kwargs): async def a_score_example(self, example, *args, **kwargs): pass - def success_check(self): + def _success_check(self): return True @@ -798,7 +798,7 @@ def mock_scorer(): scorer.evaluation_model = "test-model" scorer.score = 0.9 scorer.reason = "Test reason" - scorer.success_check.return_value = True + scorer._success_check.return_value = True scorer.evaluation_cost = 0.1 scorer.verbose_logs = "Test logs" scorer.additional_metadata = {"key": "value"} diff --git a/tests/scorers/test_scorer_utils.py b/tests/scorers/test_scorer_utils.py index c10ac0a6..d355cff0 100644 --- a/tests/scorers/test_scorer_utils.py +++ b/tests/scorers/test_scorer_utils.py @@ -33,7 +33,7 @@ def score_example(self, example: Example, *args, **kwargs) -> float: async def a_score_example(self, example: Example, *args, **kwargs) -> float: return 1.0 - def success_check(self) -> bool: + def _success_check(self) -> bool: return True
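Viewed as a whole, the renames in this patch turn the scorer hooks into a private template-method API: subclasses implement _build_measure_prompt, _build_schema, _process_response, and _success_check, while PromptScorer drives the evaluation. Below is a minimal subclass sketch in the spirit of the test fixtures above; it is not part of the patch, and the import paths and constructor keywords are assumptions based on how those fixtures are used.

from typing import List, Tuple

from judgeval.data import Example                        # import path assumed
from judgeval.scorers.prompt_scorer import PromptScorer

class PositivityScorer(PromptScorer):
    """Toy scorer exercising the renamed private hooks."""

    def _build_measure_prompt(self, example: Example) -> List[dict]:
        # Conversation-style judge prompt, as in e2etests/test_prompt_scoring.py
        return [
            {"role": "system", "content": "Rate how positive the response is on a 1-5 scale."},
            {"role": "user", "content": f"Response: {example.actual_output}"},
        ]

    def _build_schema(self) -> dict:
        # Shape of the JSON the judge model is asked to return
        return {"score": int, "reason": str}

    def _process_response(self, response: dict) -> Tuple[float, str]:
        return response["score"], response["reason"]

    def _success_check(self, **kwargs) -> bool:
        return self.score is not None and self.score >= self.threshold

# Usage, assuming PromptScorer accepts these keywords as the fixtures suggest,
# and `client` is a JudgmentClient:
#   scorer = PositivityScorer(name="positivity", threshold=3)
#   client.run_evaluation(examples=[...], scorers=[scorer], model="QWEN",
#                         project_name="ToneScorerTest", eval_run_name="ToneScorerTest",
#                         log_results=True)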