Commit 30addd8

Example Check and User Prompt (#301)
* Example Check and User Prompt
* Test Fixes
* Required Params for Custom Scorers
1 parent cdc2e4a commit 30addd8

File tree

7 files changed: +39 -19 lines changed

7 files changed

+39
-19
lines changed

src/demo/eval_test.py

Lines changed: 6 additions & 6 deletions
@@ -1,6 +1,6 @@
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data.example import Example
-from judgeval.scorers import AnswerRelevancyScorer
+from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer
 from judgeval.common.tracer import Tracer
 
 judgment = JudgmentClient()
@@ -9,7 +9,7 @@
 qa_pairs = [
     ("What is the capital of France?", "Paris"),
     ("What is the largest planet in our solar system?", "Jupiter"),
-    # ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
+    ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
     # ("What is the chemical symbol for gold?", "Au"),
     # ("What is the square root of 144?", "12"),
     # ("Who painted the Mona Lisa?", "Leonardo da Vinci"),
@@ -61,10 +61,10 @@
 
 # Create a list of Example objects
 examples = [Example(input=question, actual_output=answer) for question, answer in qa_pairs]
-for example in examples:
-    print(example.model_dump())
+
+
 judgment.run_evaluation(
     examples=examples,
-    scorers=[AnswerRelevancyScorer(threshold=0.5)],
-    append=True
+    scorers=[AnswerRelevancyScorer(threshold=0.5), FaithfulnessScorer(threshold=0.5)],
+    override=True
 )

src/e2etests/test_all_scorers.py

Lines changed: 3 additions & 2 deletions
@@ -26,7 +26,7 @@
 )
 
 from judgeval.data import Example
-
+from judgeval.data.example import ExampleParams
 
 def test_ac_scorer(client: JudgmentClient):
 
@@ -682,7 +682,8 @@ def _success_check(self, **kwargs) -> bool:
        threshold=0.5, # Expect positive sentiment (3 or higher on 1-5 scale)
        include_reason=True,
        strict_mode=False,
-        verbose_mode=True
+        verbose_mode=True,
+        required_params=[ExampleParams.INPUT, ExampleParams.ACTUAL_OUTPUT]
    )
 
    # Run evaluation

src/e2etests/test_eval_operations.py

Lines changed: 3 additions & 4 deletions
@@ -55,7 +55,7 @@ def run_eval_helper(self, client: JudgmentClient, project_name: str, eval_run_nam
        )
 
        scorer = FaithfulnessScorer(threshold=0.5)
-        scorer2 = HallucinationScorer(threshold=0.5)
+        scorer2 = AnswerRelevancyScorer(threshold=0.5)
 
        client.run_evaluation(
            examples=[example1, example2],
@@ -164,15 +164,14 @@ async def test_assert_test(self, client: JudgmentClient):
            actual_output="No, the room is too small.",
        )
 
-        scorer = FaithfulnessScorer(threshold=0.5)
-        scorer1 = AnswerRelevancyScorer(threshold=0.5)
+        scorer = AnswerRelevancyScorer(threshold=0.5)
 
        with pytest.raises(AssertionError):
            await client.assert_test(
                eval_run_name="test_eval",
                project_name="test_project",
                examples=[example, example1, example2],
-                scorers=[scorer, scorer1],
+                scorers=[scorer],
                model="Qwen/Qwen2.5-72B-Instruct-Turbo",
                override=True
            )

src/judgeval/run_evaluation.py

Lines changed: 16 additions & 2 deletions
@@ -1,6 +1,7 @@
 import asyncio
 import requests
 import time
+import json
 import sys
 import itertools
 import threading
@@ -362,14 +363,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
+    prompt_user = False
     for scorer in scorers:
         for example in examples:
             missing_params = []
             for param in scorer.required_params:
                 if getattr(example, param.value) is None:
-                    missing_params.append(f"'{param.value}'")
+                    missing_params.append(f"{param.value}")
             if missing_params:
-                print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+                rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+                rprint(f"Missing parameters: {', '.join(missing_params)}")
+                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+                rprint("-"*40)
+                prompt_user = True
+
+    if prompt_user:
+        user_input = input("Do you want to continue? (y/n)")
+        if user_input.lower() != "y":
+            sys.exit(0)
+        else:
+            rprint("[green]Continuing...[/green]")
 
 def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -894,6 +907,7 @@ async def _async_evaluation_workflow():
                f"Processing evaluation '{evaluation_run.eval_name}': "
            )
        else:
+            check_examples(evaluation_run.examples, evaluation_run.scorers)
            if judgment_scorers:
                # Execute evaluation using Judgment API
                info("Starting API evaluation")
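
The check now runs on the client before an evaluation is submitted. A minimal sketch of how it would surface, assuming actual_output is optional on Example and that AnswerRelevancyScorer declares ExampleParams.INPUT and ExampleParams.ACTUAL_OUTPUT in its required_params (neither assumption is shown in this diff):

# Sketch only, not part of the commit.
from judgeval.judgment_client import JudgmentClient
from judgeval.data.example import Example
from judgeval.scorers import AnswerRelevancyScorer

judgment = JudgmentClient()

# actual_output is deliberately left unset, so check_examples() should print the
# rich warning, dump the example as JSON, and ask "Do you want to continue? (y/n)";
# answering "n" exits via sys.exit(0), answering "y" proceeds with the evaluation.
incomplete = Example(input="What is the capital of France?")

judgment.run_evaluation(
    examples=[incomplete],
    scorers=[AnswerRelevancyScorer(threshold=0.5)],
    override=True,
)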

src/judgeval/scorers/judgeval_scorer.py

Lines changed: 4 additions & 1 deletion
@@ -12,7 +12,7 @@
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.constants import UNBOUNDED_SCORERS
-
+from judgeval.data.example import ExampleParams
 class JudgevalScorer:
     """
     Base class for scorers in `judgeval`.
@@ -39,6 +39,7 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None # The cost of running the scorer
     verbose_logs: Optional[str] = None # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+    required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
     error: Optional[str] = None
     success: Optional[bool] = None
 
@@ -51,6 +52,7 @@ def __init__(
        reason: Optional[str] = None,
        success: Optional[bool] = None,
        evaluation_model: Optional[str] = None,
+        required_params: Optional[List[ExampleParams]] = None,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose_mode: bool = True,
@@ -87,6 +89,7 @@ def __init__(
        self.evaluation_cost = evaluation_cost
        self.verbose_logs = verbose_logs
        self.additional_metadata = additional_metadata
+        self.required_params = required_params
 
    def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
        """

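check_examples() consumes the new attribute generically: each ExampleParams entry's .value is treated as an Example field name and looked up with getattr. A small sketch of that contract, assuming the enum values mirror the Example field names and that actual_output may be left unset:

from judgeval.data.example import Example, ExampleParams

example = Example(input="What is the capital of France?")  # actual_output left unset

required = [ExampleParams.INPUT, ExampleParams.ACTUAL_OUTPUT]
missing = [p.value for p in required if getattr(example, p.value, None) is None]
print(missing)  # expected: ['actual_output']
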
src/judgeval/scorers/prompt_scorer.py

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,7 @@
 from pydantic import BaseModel, model_serializer, Field
 
 from judgeval.data import Example
+from judgeval.data.example import ExampleParams
 from judgeval.scorers import JudgevalScorer
 from judgeval.scorers.utils import (
     scorer_progress_meter,
@@ -64,6 +65,7 @@ def __init__(
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
+        required_params: Optional[List[ExampleParams]] = None,
    ):
        # Initialize BaseModel first
        BaseModel.__init__(
@@ -85,6 +87,7 @@ def __init__(
            async_mode=async_mode,
            strict_mode=strict_mode,
            verbose_mode=verbose_mode,
+            required_params=required_params,
        )
 
    def score_example(

src/tests/notification/test_notification_integration.py

Lines changed: 4 additions & 4 deletions
@@ -291,7 +291,7 @@ def mock_post_side_effect(url, *args, **kwargs):
        rule = Rule(
            name="Faithfulness Rule",
            conditions=[
-                Condition(metric=FaithfulnessScorer(threshold=0.7))
+                Condition(metric=AnswerRelevancyScorer(threshold=0.7))
            ],
            combine_type="all",
            notification=notification
@@ -300,7 +300,7 @@ def mock_post_side_effect(url, *args, **kwargs):
        # Run evaluation
        result = client.run_evaluation(
            examples=[example],
-            scorers=[FaithfulnessScorer(threshold=0.7)],
+            scorers=[AnswerRelevancyScorer(threshold=0.7)],
            model="gpt-3.5-turbo",
            rules=[rule]
        )
@@ -402,7 +402,7 @@ def mock_post_side_effect(url, *args, **kwargs):
        rule = Rule(
            name="Faithfulness Rule",
            conditions=[
-                Condition(metric=FaithfulnessScorer(threshold=0.7))
+                Condition(metric=AnswerRelevancyScorer(threshold=0.7))
            ],
            combine_type="all",
            notification=notification
@@ -411,7 +411,7 @@ def mock_post_side_effect(url, *args, **kwargs):
        # Run evaluation
        result = client.run_evaluation(
            examples=[example],
-            scorers=[FaithfulnessScorer(threshold=0.7)],
+            scorers=[AnswerRelevancyScorer(threshold=0.7)],
            model="gpt-3.5-turbo",
            rules=[rule]
        )
