
Refactor default judges #36


Merged
merged 22 commits into from Jan 13, 2025

Commits (changes shown are from 20 of the 22 commits)
efa9ad5
Experiment with new default scorer interface by adding JSON correctness
SecroLoL Jan 10, 2025
2012865
Wrap FaithfulnessScorer into individual class
SecroLoL Jan 10, 2025
bae3e87
Wrap AnswerRelevancyScorer into individual class
SecroLoL Jan 10, 2025
efe53af
Add ContextualPrecision wrapper for its own class
SecroLoL Jan 10, 2025
059bf71
Add ContextualRecall wrapper for its own class
SecroLoL Jan 10, 2025
3633a3e
Add ToolCorrectnessScorer wrapper for its own class
SecroLoL Jan 10, 2025
3ae5fb1
Add ContextualRelevancy wrapper for its own class
SecroLoL Jan 10, 2025
c2cb7b2
Add Summarization wrapper for its own class
SecroLoL Jan 10, 2025
900b503
Add HallucinationScorer wrapper for its own class
SecroLoL Jan 10, 2025
49c7d38
Remove test segment of code file (we can just use client test file in…
SecroLoL Jan 10, 2025
b007f43
Update __init__ files of the scorers/ and judgeval_scorers/ dirs for …
SecroLoL Jan 10, 2025
2451329
Restrict threshold to between 0 <= x <= 1 on init
SecroLoL Jan 10, 2025
b35822e
Add UT for AnswerRelevancyScorer
SecroLoL Jan 10, 2025
6eff5cd
Add UT for all new wrapped default scorers
SecroLoL Jan 11, 2025
6d9a907
Edit JSONCorrectnessScorer init because it has an extra field that ne…
SecroLoL Jan 11, 2025
5e636ef
Update e2e tests with new wrapped default scorer syntax
SecroLoL Jan 11, 2025
ecf7530
Remove unused imports
SecroLoL Jan 11, 2025
dcc79aa
Generalize span level async evaluation to run with any scorer, custom…
SecroLoL Jan 11, 2025
b1e0dc1
Update Pipfile
SecroLoL Jan 11, 2025
08fb199
Update tracer test script with new default scorer
SecroLoL Jan 11, 2025
ad1300d
Remove dev packages from standard packages in Pipfile
SecroLoL Jan 13, 2025
03b2287
Uncomment testing calls so all tests are run
SecroLoL Jan 13, 2025
6 changes: 4 additions & 2 deletions Pipfile
@@ -6,16 +6,18 @@ name = "pypi"
[packages]
langfuse = "==2.50.3"
litellm = "*"
openai = "==1.47.1"
python-dotenv = "==1.0.1"
together = "*"
fastapi = "*"
uvicorn = "*"
deepeval = "*"
supabase = "*"
requests = "*"
pandas = "*"
openai = "*"
together = "*"
anthropic = "*"
pytest = "*"
pytest-asyncio = "*"

[dev-packages]
pytest = "*"
58 changes: 30 additions & 28 deletions e2etests/judgment_client_test.py
@@ -5,8 +5,10 @@
import os
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import JudgmentScorer
from judgeval.constants import APIScorer
from judgeval.scorers import (
FaithfulnessScorer,
HallucinationScorer,
)
from judgeval.judges import TogetherJudge
from judgeval.playground import CustomFaithfulnessMetric
from judgeval.data.datasets.dataset import EvalDataset
@@ -53,16 +55,16 @@ def test_run_eval(client: JudgmentClient):
additional_metadata={"difficulty": "medium"}
)

scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
scorer2 = JudgmentScorer(threshold=0.5, score_type=APIScorer.HALLUCINATION)
scorer = FaithfulnessScorer(threshold=0.5)
scorer2 = HallucinationScorer(threshold=0.5)
c_scorer = CustomFaithfulnessMetric(threshold=0.6)

PROJECT_NAME = "test_project_JOSEPH"
EVAL_RUN_NAME = "yomadude"

_ = client.run_evaluation(
examples=[example1, example2],
scorers=[scorer, c_scorer],
scorers=[scorer2],
model="QWEN",
metadata={"batch": "test"},
project_name=PROJECT_NAME,
@@ -72,7 +74,7 @@ def test_run_eval(client: JudgmentClient):
)

results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
# print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)
print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)

def test_override_eval(client: JudgmentClient):
example1 = Example(
@@ -82,7 +84,7 @@ def test_override_eval(client: JudgmentClient):
trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
)

scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
scorer = FaithfulnessScorer(threshold=0.5)

PROJECT_NAME = "test_eval_run_naming_collisions"
EVAL_RUN_NAME = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
@@ -171,7 +173,7 @@ def test_evaluate_dataset(client: JudgmentClient):
dataset = EvalDataset(examples=[example1, example2])
res = client.evaluate_dataset(
dataset=dataset,
scorers=[JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)],
scorers=[FaithfulnessScorer(threshold=0.5)],
model="QWEN",
metadata={"batch": "test"},
)
@@ -180,7 +182,7 @@ def test_classifier_scorer(client: JudgmentClient):

def test_classifier_scorer(client: JudgmentClient):
classifier_scorer = client.fetch_classifier_scorer("tonescorer-72gl")
faithfulness_scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
faithfulness_scorer = FaithfulnessScorer(threshold=0.5)

example1 = Example(
input="What if these shoes don't fit?",
@@ -199,32 +201,32 @@ def test_classifier_scorer(client: JudgmentClient):
# Test client functionality
client = get_client()
ui_client = get_ui_client()
print("Client initialized successfully")
print("*" * 40)
# print("Client initialized successfully")
# print("*" * 40)

print("Testing dataset creation, pushing, and pulling")
test_dataset(ui_client)
print("Dataset creation, pushing, and pulling successful")
print("*" * 40)
# print("Testing dataset creation, pushing, and pulling")
# test_dataset(ui_client)
# print("Dataset creation, pushing, and pulling successful")
# print("*" * 40)

print("Testing evaluation run")
test_run_eval(ui_client)
print("Evaluation run successful")
print("*" * 40)

print("Testing evaluation run override")
test_override_eval(client)
print("Evaluation run override successful")
print("*" * 40)
# print("Testing evaluation run override")
# test_override_eval(client)
# print("Evaluation run override successful")
# print("*" * 40)

print("Testing dataset evaluation")
test_evaluate_dataset(ui_client)
print("Dataset evaluation successful")
print("*" * 40)
# print("Testing dataset evaluation")
# test_evaluate_dataset(ui_client)
# print("Dataset evaluation successful")
# print("*" * 40)

print("Testing classifier scorer")
test_classifier_scorer(ui_client)
print("Classifier scorer test successful")
print("*" * 40)
# print("Testing classifier scorer")
# test_classifier_scorer(ui_client)
# print("Classifier scorer test successful")
# print("*" * 40)

print("All tests passed successfully")
# print("All tests passed successfully")
10 changes: 4 additions & 6 deletions e2etests/test_tracer.py
@@ -11,6 +11,7 @@
# Local imports
from judgeval.common.tracer import Tracer, wrap
from judgeval.constants import APIScorer
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

# Initialize the tracer and clients
judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
@@ -28,13 +29,12 @@ async def make_upper(input: str) -> str:
"""
output = input.upper()
await judgment.get_current_trace().async_evaluate(
scorers=[FaithfulnessScorer(threshold=0.5)],
input="What if these shoes don't fit?",
actual_output="We offer a 30-day full refund at no extra cost.",
retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
expected_output="We offer a 30-day full refund at no extra cost.",
expected_tools=["refund"],
score_type=APIScorer.FAITHFULNESS,
threshold=0.5,
model="gpt-4o-mini",
log_results=True
)
@@ -45,6 +45,7 @@ async def make_lower(input):
output = input.lower()

await judgment.get_current_trace().async_evaluate(
scorers=[AnswerRelevancyScorer(threshold=0.5)],
input="How do I reset my password?",
actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
@@ -53,8 +54,6 @@ async def make_lower(input):
tools_called=["authentication"],
expected_tools=["authentication"],
additional_metadata={"difficulty": "medium"},
score_type=APIScorer.ANSWER_RELEVANCY,
threshold=0.5,
model="gpt-4o-mini",
log_results=True
)
@@ -68,12 +67,11 @@ def llm_call(input):
async def answer_user_question(input):
output = llm_call(input)
await judgment.get_current_trace().async_evaluate(
scorers=[AnswerRelevancyScorer(threshold=0.5)],
input=input,
actual_output=output,
retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
expected_output="We offer a 30-day full refund at no extra cost.",
score_type=APIScorer.ANSWER_RELEVANCY,
threshold=0.5,
model="gpt-4o-mini",
log_results=True
)
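With the generalized span-level evaluation, async_evaluate now takes a list of scorers instead of a single score_type/threshold pair, so default and custom scorers can be mixed. A hedged sketch of a call inside a traced span (scorer choices and thresholds are illustrative, and it assumes CustomFaithfulnessMetric from judgeval.playground is a CustomScorer subclass, as the client test above suggests):

from judgeval.scorers import FaithfulnessScorer
from judgeval.playground import CustomFaithfulnessMetric

await judgment.get_current_trace().async_evaluate(
    scorers=[FaithfulnessScorer(threshold=0.5), CustomFaithfulnessMetric(threshold=0.6)],
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
    model="gpt-4o-mini",
    log_results=True,
)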
29 changes: 17 additions & 12 deletions judgeval/common/tracer.py
@@ -7,7 +7,16 @@
import requests
import uuid
from contextlib import contextmanager
from typing import Optional, Any, List, Literal, Tuple, Generator, TypeAlias, Union
from typing import (
Optional,
Any,
List,
Literal,
Tuple,
Generator,
TypeAlias,
Union
)
from dataclasses import dataclass, field
from datetime import datetime
from openai import OpenAI
@@ -23,7 +32,7 @@
from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import JudgmentScorer
from judgeval.scorers import JudgmentScorer, CustomScorer
from judgeval.data.result import ScoringResult

# Define type aliases for better code readability and maintainability
@@ -149,6 +158,7 @@ def span(self, name: str):

async def async_evaluate(
self,
scorers: List[Union[JudgmentScorer, CustomScorer]],
input: Optional[str] = None,
actual_output: Optional[str] = None,
expected_output: Optional[str] = None,
@@ -157,8 +167,6 @@ async def async_evaluate(
tools_called: Optional[List[str]] = None,
expected_tools: Optional[List[str]] = None,
additional_metadata: Optional[Dict[str, Any]] = None,
score_type: Optional[str] = None,
threshold: Optional[float] = None,
model: Optional[str] = None,
log_results: Optional[bool] = False,
):
@@ -174,18 +182,15 @@
additional_metadata=additional_metadata,
trace_id=self.trace_id
)
scorer = JudgmentScorer(
score_type=score_type,
threshold=threshold
)
_, scoring_results = self.client.run_evaluation(
scoring_results = self.client.run_evaluation(
examples=[example],
scorers=[scorer],
scorers=scorers,
model=model,
metadata={},
log_results=log_results,
project_name="TestSpanLevel",
eval_run_name="TestSpanLevel",
project_name="TestSpanLevel1", # TODO this should be dynamic
[Review comment — Collaborator] I will handle this, I'll share my thoughts in Slack. In my multi-step eval PR, I added a project and trace name, which will tie in nicely to generate automatic eval run names (to improve UX while remaining clear).

[Review comment — Contributor Author] Nice B)

eval_run_name="TestSpanLevel1",
override=True,
)

self.record_evaluation(scoring_results, start_time) # Pass start_time to record_evaluation
1 change: 1 addition & 0 deletions judgeval/constants.py
@@ -20,6 +20,7 @@ class APIScorer(str, Enum):
CONTEXTUAL_RELEVANCY = "contextual_relevancy"
CONTEXTUAL_PRECISION = "contextual_precision"
TOOL_CORRECTNESS = "tool_correctness"
JSON_CORRECTNESS = "json_correctness"

@classmethod
def _missing_(cls, value):
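The new JSON_CORRECTNESS value backs the JSONCorrectnessScorer wrapper added in this PR. Per the commit notes, that scorer takes one extra field beyond the threshold; the sketch below assumes that field is a Pydantic schema, and the json_schema parameter name is hypothetical:

from pydantic import BaseModel
from judgeval.scorers import JSONCorrectnessScorer

class RefundPolicy(BaseModel):  # hypothetical schema used only for illustration
    days: int
    full_refund: bool

scorer = JSONCorrectnessScorer(threshold=1.0, json_schema=RefundPolicy)  # parameter name is an assumption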
38 changes: 0 additions & 38 deletions judgeval/run_evaluation.py
@@ -21,11 +21,8 @@
ROOT_API,
JUDGMENT_EVAL_API_URL,
JUDGMENT_EVAL_LOG_API_URL,
APIScorer,
)
from judgeval.common.exceptions import JudgmentAPIError
from judgeval.playground import CustomFaithfulnessMetric
from judgeval.judges import TogetherJudge, MixtureOfJudges
from judgeval.evaluation_run import EvaluationRun
from judgeval.common.logger import (
enable_logging,
@@ -356,38 +353,3 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
if not result.scorers_data: # none of the scorers could be executed on this example
info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
return merged_results


if __name__ == "__main__":
from judgeval.common.logger import enable_logging, debug, info
from judgeval.common.tracer import Tracer

# TODO comeback and delete this, move this to a demo example
# Eval using a proprietary Judgment Scorer
from judgeval.judgment_client import JudgmentClient

example1 = Example(
input="What if these shoes don't fit?",
actual_output="We offer a 30-day full refund at no extra cost.", # replace this with your code's actual output
retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
)

example2 = Example(
input="How do I reset my password?",
actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
name="Password Reset",
context=["User Account"],
retrieval_context=["Password reset instructions"],
tools_called=["authentication"],
expected_tools=["authentication"],
additional_metadata={"difficulty": "medium"}
)


scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
scorer2 = JudgmentScorer(threshold=0.5, score_type=APIScorer.HALLUCINATION)
c_scorer = CustomFaithfulnessMetric(threshold=0.6)


client = JudgmentClient()
27 changes: 26 additions & 1 deletion judgeval/scorers/__init__.py
@@ -1,5 +1,30 @@
from judgeval.scorers.base_scorer import JudgmentScorer
from judgeval.scorers.custom_scorer import CustomScorer
from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
from judgeval.scorers.judgeval_scorers import (
ToolCorrectnessScorer,
JSONCorrectnessScorer,
SummarizationScorer,
HallucinationScorer,
FaithfulnessScorer,
ContextualRelevancyScorer,
ContextualPrecisionScorer,
ContextualRecallScorer,
AnswerRelevancyScorer,
)

__all__ = ["JudgmentScorer", "CustomScorer", "PromptScorer", "ClassifierScorer"]
__all__ = [
"JudgmentScorer",
"CustomScorer",
"PromptScorer",
"ClassifierScorer",
"ToolCorrectnessScorer",
"JSONCorrectnessScorer",
"SummarizationScorer",
"HallucinationScorer",
"FaithfulnessScorer",
"ContextualRelevancyScorer",
"ContextualPrecisionScorer",
"ContextualRecallScorer",
"AnswerRelevancyScorer",
]
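With the expanded __all__, every wrapped default scorer is importable directly from judgeval.scorers. A quick sketch of the public surface (assuming each wrapper takes the same threshold argument the tests above show for the other scorers):

from judgeval.scorers import (
    AnswerRelevancyScorer,
    ContextualPrecisionScorer,
    ContextualRecallScorer,
    ContextualRelevancyScorer,
    FaithfulnessScorer,
    HallucinationScorer,
    SummarizationScorer,
    ToolCorrectnessScorer,
)

scorers = [AnswerRelevancyScorer(threshold=0.5), ToolCorrectnessScorer(threshold=0.5)]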
14 changes: 12 additions & 2 deletions judgeval/scorers/base_scorer.py
@@ -16,10 +16,21 @@ class JudgmentScorer(BaseModel):

Args:
score_type (APIScorer): The Judgment metric to use for scoring `Example`s
threshold (float): A value between 0 and 1 that determines the scoring threshold
"""
threshold: float
score_type: APIScorer

@field_validator('threshold')
def validate_threshold(cls, v):
"""
Validates that the threshold is between 0 and 1 inclusive.
"""
if not 0 <= v <= 1:
error(f"Threshold must be between 0 and 1, got: {v}")
raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
return v

@field_validator('score_type')
def convert_to_enum_value(cls, v):
"""
@@ -37,5 +48,4 @@ def convert_to_enum_value(cls, v):
raise ValueError(f"Invalid value for score_type: {v}")

def __str__(self):
return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"

return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"