
Commit b6944b3

Merge pull request #36 from JudgmentLabs/refactor_default_judges
Refactor default judges
2 parents 8d83f32 + 03b2287 commit b6944b3

27 files changed (+521 / -69 lines)

Pipfile

Lines changed: 2 additions & 2 deletions
@@ -6,15 +6,15 @@ name = "pypi"
 [packages]
 langfuse = "==2.50.3"
 litellm = "*"
-openai = "==1.47.1"
 python-dotenv = "==1.0.1"
-together = "*"
 fastapi = "*"
 uvicorn = "*"
 deepeval = "*"
 supabase = "*"
 requests = "*"
 pandas = "*"
+openai = "*"
+together = "*"
 anthropic = "*"

 [dev-packages]

e2etests/judgment_client_test.py

Lines changed: 10 additions & 8 deletions
@@ -5,8 +5,10 @@
 import os
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import JudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers import (
+    FaithfulnessScorer,
+    HallucinationScorer,
+)
 from judgeval.judges import TogetherJudge
 from judgeval.playground import CustomFaithfulnessMetric
 from judgeval.data.datasets.dataset import EvalDataset
@@ -53,8 +55,8 @@ def test_run_eval(client: JudgmentClient):
         additional_metadata={"difficulty": "medium"}
     )

-    scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
-    scorer2 = JudgmentScorer(threshold=0.5, score_type=APIScorer.HALLUCINATION)
+    scorer = FaithfulnessScorer(threshold=0.5)
+    scorer2 = HallucinationScorer(threshold=0.5)
     c_scorer = CustomFaithfulnessMetric(threshold=0.6)

     PROJECT_NAME = "test_project_JOSEPH"
@@ -72,7 +74,7 @@ def test_run_eval(client: JudgmentClient):
     )

     results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
-    # print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)
+    print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)

 def test_override_eval(client: JudgmentClient):
     example1 = Example(
@@ -82,7 +84,7 @@ def test_override_eval(client: JudgmentClient):
         trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
     )

-    scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
+    scorer = FaithfulnessScorer(threshold=0.5)

     PROJECT_NAME = "test_eval_run_naming_collisions"
     EVAL_RUN_NAME = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
@@ -171,7 +173,7 @@ def test_evaluate_dataset(client: JudgmentClient):
     dataset = EvalDataset(examples=[example1, example2])
     res = client.evaluate_dataset(
         dataset=dataset,
-        scorers=[JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)],
+        scorers=[FaithfulnessScorer(threshold=0.5)],
         model="QWEN",
         metadata={"batch": "test"},
     )
@@ -180,7 +182,7 @@

 def test_classifier_scorer(client: JudgmentClient):
     classifier_scorer = client.fetch_classifier_scorer("tonescorer-72gl")
-    faithfulness_scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
+    faithfulness_scorer = FaithfulnessScorer(threshold=0.5)

     example1 = Example(
         input="What if these shoes don't fit?",

e2etests/test_tracer.py

Lines changed: 4 additions & 6 deletions
@@ -11,6 +11,7 @@
 # Local imports
 from judgeval.common.tracer import Tracer, wrap
 from judgeval.constants import APIScorer
+from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

 # Initialize the tracer and clients
 judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
@@ -28,13 +29,12 @@ async def make_upper(input: str) -> str:
     """
     output = input.upper()
     await judgment.get_current_trace().async_evaluate(
+        scorers=[FaithfulnessScorer(threshold=0.5)],
         input="What if these shoes don't fit?",
         actual_output="We offer a 30-day full refund at no extra cost.",
         retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
         expected_output="We offer a 30-day full refund at no extra cost.",
         expected_tools=["refund"],
-        score_type=APIScorer.FAITHFULNESS,
-        threshold=0.5,
         model="gpt-4o-mini",
         log_results=True
     )
@@ -45,6 +45,7 @@ async def make_lower(input):
     output = input.lower()

     await judgment.get_current_trace().async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
         input="How do I reset my password?",
         actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
         expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
@@ -53,8 +54,6 @@ async def make_lower(input):
         tools_called=["authentication"],
         expected_tools=["authentication"],
         additional_metadata={"difficulty": "medium"},
-        score_type=APIScorer.ANSWER_RELEVANCY,
-        threshold=0.5,
         model="gpt-4o-mini",
         log_results=True
     )
@@ -68,12 +67,11 @@ def llm_call(input):
 async def answer_user_question(input):
     output = llm_call(input)
     await judgment.get_current_trace().async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
         input=input,
         actual_output=output,
         retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
         expected_output="We offer a 30-day full refund at no extra cost.",
-        score_type=APIScorer.ANSWER_RELEVANCY,
-        threshold=0.5,
         model="gpt-4o-mini",
         log_results=True
     )

judgeval/common/tracer.py

Lines changed: 17 additions & 12 deletions
@@ -7,7 +7,16 @@
 import requests
 import uuid
 from contextlib import contextmanager
-from typing import Optional, Any, List, Literal, Tuple, Generator, TypeAlias, Union
+from typing import (
+    Optional,
+    Any,
+    List,
+    Literal,
+    Tuple,
+    Generator,
+    TypeAlias,
+    Union
+)
 from dataclasses import dataclass, field
 from datetime import datetime
 from openai import OpenAI
@@ -23,7 +32,7 @@
 from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import JudgmentScorer
+from judgeval.scorers import JudgmentScorer, CustomScorer
 from judgeval.data.result import ScoringResult

 # Define type aliases for better code readability and maintainability
@@ -149,6 +158,7 @@ def span(self, name: str):

     async def async_evaluate(
         self,
+        scorers: List[Union[JudgmentScorer, CustomScorer]],
         input: Optional[str] = None,
         actual_output: Optional[str] = None,
         expected_output: Optional[str] = None,
@@ -157,8 +167,6 @@ async def async_evaluate(
         tools_called: Optional[List[str]] = None,
         expected_tools: Optional[List[str]] = None,
         additional_metadata: Optional[Dict[str, Any]] = None,
-        score_type: Optional[str] = None,
-        threshold: Optional[float] = None,
         model: Optional[str] = None,
         log_results: Optional[bool] = False,
     ):
@@ -174,18 +182,15 @@ async def async_evaluate(
             additional_metadata=additional_metadata,
             trace_id=self.trace_id
         )
-        scorer = JudgmentScorer(
-            score_type=score_type,
-            threshold=threshold
-        )
-        _, scoring_results = self.client.run_evaluation(
+        scoring_results = self.client.run_evaluation(
             examples=[example],
-            scorers=[scorer],
+            scorers=scorers,
             model=model,
             metadata={},
             log_results=log_results,
-            project_name="TestSpanLevel",
-            eval_run_name="TestSpanLevel",
+            project_name="TestSpanLevel1",  # TODO this should be dynamic
+            eval_run_name="TestSpanLevel1",
+            override=True,
         )

         self.record_evaluation(scoring_results, start_time)  # Pass start_time to record_evaluation
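
Because async_evaluate now takes fully constructed scorer objects instead of a single score_type/threshold pair, one span-level call can attach several scorers at once. A minimal caller sketch (the scorer choices and field values are illustrative, not taken from this diff; it must run inside an async, traced function as in e2etests/test_tracer.py):

from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

await judgment.get_current_trace().async_evaluate(
    scorers=[FaithfulnessScorer(threshold=0.5), AnswerRelevancyScorer(threshold=0.5)],
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
    model="gpt-4o-mini",
    log_results=True,
)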

judgeval/constants.py

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ class APIScorer(str, Enum):
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
     TOOL_CORRECTNESS = "tool_correctness"
+    JSON_CORRECTNESS = "json_correctness"

     @classmethod
     def _missing_(cls, value):
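
APIScorer is a str-backed Enum, so the new member is addressable as an attribute or by its string value, and it can still be passed to the generic JudgmentScorer directly. A small sketch (the JSONCorrectnessScorer default class added elsewhere in this PR presumably pins this member the same way):

from judgeval.constants import APIScorer
from judgeval.scorers import JudgmentScorer

assert APIScorer.JSON_CORRECTNESS.value == "json_correctness"

# The generic scorer accepts the new member like any other
scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.JSON_CORRECTNESS)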

judgeval/run_evaluation.py

Lines changed: 0 additions & 38 deletions
@@ -21,11 +21,8 @@
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
-    APIScorer,
 )
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.playground import CustomFaithfulnessMetric
-from judgeval.judges import TogetherJudge, MixtureOfJudges
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.common.logger import (
     enable_logging,
@@ -356,38 +353,3 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
         if not result.scorers_data:  # none of the scorers could be executed on this example
             info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
     return merged_results
-
-
-if __name__ == "__main__":
-    from judgeval.common.logger import enable_logging, debug, info
-    from judgeval.common.tracer import Tracer
-
-    # TODO comeback and delete this, move this to a demo example
-    # Eval using a proprietary Judgment Scorer
-    from judgeval.judgment_client import JudgmentClient
-
-    example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",  # replace this with your code's actual output
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-    )
-
-    example2 = Example(
-        input="How do I reset my password?",
-        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        name="Password Reset",
-        context=["User Account"],
-        retrieval_context=["Password reset instructions"],
-        tools_called=["authentication"],
-        expected_tools=["authentication"],
-        additional_metadata={"difficulty": "medium"}
-    )
-
-
-    scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
-    scorer2 = JudgmentScorer(threshold=0.5, score_type=APIScorer.HALLUCINATION)
-    c_scorer = CustomFaithfulnessMetric(threshold=0.6)
-
-
-    client = JudgmentClient()
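
The ad-hoc demo that used to live under if __name__ == "__main__" is removed here. An equivalent standalone smoke test with the refactored scorers would look roughly like the sketch below (run_evaluation's keyword arguments are assumed from the call in judgeval/common/tracer.py; the project and run names are hypothetical):

from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer, HallucinationScorer

client = JudgmentClient()

example = Example(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
)

results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.5), HallucinationScorer(threshold=0.5)],
    model="gpt-4o-mini",            # illustrative model name
    metadata={},
    log_results=True,
    project_name="demo_project",    # hypothetical
    eval_run_name="demo_run",       # hypothetical
    override=True,
)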

judgeval/scorers/__init__.py

Lines changed: 26 additions & 1 deletion
@@ -1,5 +1,30 @@
 from judgeval.scorers.base_scorer import JudgmentScorer
 from judgeval.scorers.custom_scorer import CustomScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
+from judgeval.scorers.judgeval_scorers import (
+    ToolCorrectnessScorer,
+    JSONCorrectnessScorer,
+    SummarizationScorer,
+    HallucinationScorer,
+    FaithfulnessScorer,
+    ContextualRelevancyScorer,
+    ContextualPrecisionScorer,
+    ContextualRecallScorer,
+    AnswerRelevancyScorer,
+)

-__all__ = ["JudgmentScorer", "CustomScorer", "PromptScorer", "ClassifierScorer"]
+__all__ = [
+    "JudgmentScorer",
+    "CustomScorer",
+    "PromptScorer",
+    "ClassifierScorer",
+    "ToolCorrectnessScorer",
+    "JSONCorrectnessScorer",
+    "SummarizationScorer",
+    "HallucinationScorer",
+    "FaithfulnessScorer",
+    "ContextualRelevancyScorer",
+    "ContextualPrecisionScorer",
+    "ContextualRecallScorer",
+    "AnswerRelevancyScorer",
+]
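
With these re-exports, downstream code only needs the judgeval.scorers namespace for both the generic and the default scorers; a short sketch:

from judgeval.scorers import JudgmentScorer, FaithfulnessScorer, AnswerRelevancyScorer

scorers = [
    FaithfulnessScorer(threshold=0.5),
    AnswerRelevancyScorer(threshold=0.5),
]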

judgeval/scorers/base_scorer.py

Lines changed: 12 additions & 2 deletions
@@ -16,10 +16,21 @@ class JudgmentScorer(BaseModel):

     Args:
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+        threshold (float): A value between 0 and 1 that determines the scoring threshold
     """
     threshold: float
     score_type: APIScorer

+    @field_validator('threshold')
+    def validate_threshold(cls, v):
+        """
+        Validates that the threshold is between 0 and 1 inclusive.
+        """
+        if not 0 <= v <= 1:
+            error(f"Threshold must be between 0 and 1, got: {v}")
+            raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+        return v
+
     @field_validator('score_type')
     def convert_to_enum_value(cls, v):
         """
@@ -37,5 +48,4 @@ def convert_to_enum_value(cls, v):
         raise ValueError(f"Invalid value for score_type: {v}")

     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
+        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"

judgeval/scorers/judgeval_scorers/__init__.py

Lines changed: 21 additions & 0 deletions

@@ -0,0 +1,21 @@
+from judgeval.scorers.judgeval_scorers.tool_correctness import ToolCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.json_correctness import JSONCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.summarization import SummarizationScorer
+from judgeval.scorers.judgeval_scorers.hallucination import HallucinationScorer
+from judgeval.scorers.judgeval_scorers.faithfulness import FaithfulnessScorer
+from judgeval.scorers.judgeval_scorers.contextual_relevancy import ContextualRelevancyScorer
+from judgeval.scorers.judgeval_scorers.contextual_precision import ContextualPrecisionScorer
+from judgeval.scorers.judgeval_scorers.contextual_recall import ContextualRecallScorer
+from judgeval.scorers.judgeval_scorers.answer_relevancy import AnswerRelevancyScorer
+
+__all__ = [
+    "ToolCorrectnessScorer",
+    "JSONCorrectnessScorer",
+    "SummarizationScorer",
+    "HallucinationScorer",
+    "FaithfulnessScorer",
+    "ContextualRelevancyScorer",
+    "ContextualPrecisionScorer",
+    "ContextualRecallScorer",
+    "AnswerRelevancyScorer",
+]

judgeval/scorers/judgeval_scorers/answer_relevancy.py

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+"""
+`judgeval` answer relevancy scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class AnswerRelevancyScorer(JudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
+
+    @property
+    def __name__(self):
+        return "Answer Relevancy"
