Commit 30addd8

Example Check and User Prompt (#301)
* Example Check and User Prompt
* Test Fixes
* Required Params for Custom Scorers
1 parent cdc2e4a commit 30addd8

File tree

7 files changed: +39 -19 lines changed

7 files changed

+39
-19
lines changed

src/demo/eval_test.py

Lines changed: 6 additions & 6 deletions
@@ -1,6 +1,6 @@
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data.example import Example
-from judgeval.scorers import AnswerRelevancyScorer
+from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer
 from judgeval.common.tracer import Tracer
 
 judgment = JudgmentClient()
@@ -9,7 +9,7 @@
 qa_pairs = [
     ("What is the capital of France?", "Paris"),
     ("What is the largest planet in our solar system?", "Jupiter"),
-    # ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
+    ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
     # ("What is the chemical symbol for gold?", "Au"),
     # ("What is the square root of 144?", "12"),
     # ("Who painted the Mona Lisa?", "Leonardo da Vinci"),
@@ -61,10 +61,10 @@
 
 # Create a list of Example objects
 examples = [Example(input=question, actual_output=answer) for question, answer in qa_pairs]
-for example in examples:
-    print(example.model_dump())
+
+
 judgment.run_evaluation(
     examples=examples,
-    scorers=[AnswerRelevancyScorer(threshold=0.5)],
-    append=True
+    scorers=[AnswerRelevancyScorer(threshold=0.5), FaithfulnessScorer(threshold=0.5)],
+    override=True
 )

src/e2etests/test_all_scorers.py

Lines changed: 3 additions & 2 deletions
@@ -26,7 +26,7 @@
 )
 
 from judgeval.data import Example
-
+from judgeval.data.example import ExampleParams
 
 def test_ac_scorer(client: JudgmentClient):
 
@@ -682,7 +682,8 @@ def _success_check(self, **kwargs) -> bool:
        threshold=0.5, # Expect positive sentiment (3 or higher on 1-5 scale)
        include_reason=True,
        strict_mode=False,
-        verbose_mode=True
+        verbose_mode=True,
+        required_params=[ExampleParams.INPUT, ExampleParams.ACTUAL_OUTPUT]
    )
 
    # Run evaluation

src/e2etests/test_eval_operations.py

Lines changed: 3 additions & 4 deletions
@@ -55,7 +55,7 @@ def run_eval_helper(self, client: JudgmentClient, project_name: str, eval_run_nam
        )
 
        scorer = FaithfulnessScorer(threshold=0.5)
-        scorer2 = HallucinationScorer(threshold=0.5)
+        scorer2 = AnswerRelevancyScorer(threshold=0.5)
 
        client.run_evaluation(
            examples=[example1, example2],
@@ -164,15 +164,14 @@ async def test_assert_test(self, client: JudgmentClient):
            actual_output="No, the room is too small.",
        )
 
-        scorer = FaithfulnessScorer(threshold=0.5)
-        scorer1 = AnswerRelevancyScorer(threshold=0.5)
+        scorer = AnswerRelevancyScorer(threshold=0.5)
 
        with pytest.raises(AssertionError):
            await client.assert_test(
                eval_run_name="test_eval",
                project_name="test_project",
                examples=[example, example1, example2],
-                scorers=[scorer, scorer1],
+                scorers=[scorer],
                model="Qwen/Qwen2.5-72B-Instruct-Turbo",
                override=True
            )

src/judgeval/run_evaluation.py

Lines changed: 16 additions & 2 deletions
@@ -1,6 +1,7 @@
 import asyncio
 import requests
 import time
+import json
 import sys
 import itertools
 import threading
@@ -362,14 +363,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
+    prompt_user = False
     for scorer in scorers:
         for example in examples:
             missing_params = []
             for param in scorer.required_params:
                 if getattr(example, param.value) is None:
-                    missing_params.append(f"'{param.value}'")
+                    missing_params.append(f"{param.value}")
             if missing_params:
-                print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+                rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+                rprint(f"Missing parameters: {', '.join(missing_params)}")
+                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+                rprint("-"*40)
+                prompt_user = True
+
+    if prompt_user:
+        user_input = input("Do you want to continue? (y/n)")
+        if user_input.lower() != "y":
+            sys.exit(0)
+        else:
+            rprint("[green]Continuing...[/green]")
 
 def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -894,6 +907,7 @@ async def _async_evaluation_workflow():
                f"Processing evaluation '{evaluation_run.eval_name}': "
            )
        else:
+            check_examples(evaluation_run.examples, evaluation_run.scorers)
            if judgment_scorers:
                # Execute evaluation using Judgment API
                info("Starting API evaluation")
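
The check now runs on the client before an evaluation is submitted. A minimal sketch of how it would surface, assuming actual_output is optional on Example and that AnswerRelevancyScorer declares ExampleParams.INPUT and ExampleParams.ACTUAL_OUTPUT in its required_params (neither assumption is shown in this diff):

# Sketch only, not part of the commit.
from judgeval.judgment_client import JudgmentClient
from judgeval.data.example import Example
from judgeval.scorers import AnswerRelevancyScorer

judgment = JudgmentClient()

# actual_output is deliberately left unset, so check_examples() should print the
# rich warning, dump the example as JSON, and ask "Do you want to continue? (y/n)";
# answering "n" exits via sys.exit(0), answering "y" proceeds with the evaluation.
incomplete = Example(input="What is the capital of France?")

judgment.run_evaluation(
    examples=[incomplete],
    scorers=[AnswerRelevancyScorer(threshold=0.5)],
    override=True,
)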

src/judgeval/scorers/judgeval_scorer.py

Lines changed: 4 additions & 1 deletion
@@ -12,7 +12,7 @@
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.constants import UNBOUNDED_SCORERS
-
+from judgeval.data.example import ExampleParams
 class JudgevalScorer:
     """
     Base class for scorers in `judgeval`.
@@ -39,6 +39,7 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None # The cost of running the scorer
     verbose_logs: Optional[str] = None # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+    required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
     error: Optional[str] = None
     success: Optional[bool] = None
 
@@ -51,6 +52,7 @@ def __init__(
        reason: Optional[str] = None,
        success: Optional[bool] = None,
        evaluation_model: Optional[str] = None,
+        required_params: Optional[List[ExampleParams]] = None,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose_mode: bool = True,
@@ -87,6 +89,7 @@ def __init__(
        self.evaluation_cost = evaluation_cost
        self.verbose_logs = verbose_logs
        self.additional_metadata = additional_metadata
+        self.required_params = required_params
 
    def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
        """

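check_examples() consumes the new attribute generically: each ExampleParams entry's .value is treated as an Example field name and looked up with getattr. A small sketch of that contract, assuming the enum values mirror the Example field names and that actual_output may be left unset:

from judgeval.data.example import Example, ExampleParams

example = Example(input="What is the capital of France?")  # actual_output left unset

required = [ExampleParams.INPUT, ExampleParams.ACTUAL_OUTPUT]
missing = [p.value for p in required if getattr(example, p.value, None) is None]
print(missing)  # expected: ['actual_output']
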
src/judgeval/scorers/prompt_scorer.py

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,7 @@
 from pydantic import BaseModel, model_serializer, Field
 
 from judgeval.data import Example
+from judgeval.data.example import ExampleParams
 from judgeval.scorers import JudgevalScorer
 from judgeval.scorers.utils import (
     scorer_progress_meter,
@@ -64,6 +65,7 @@ def __init__(
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
+        required_params: Optional[List[ExampleParams]] = None,
    ):
        # Initialize BaseModel first
        BaseModel.__init__(
@@ -85,6 +87,7 @@ def __init__(
            async_mode=async_mode,
            strict_mode=strict_mode,
            verbose_mode=verbose_mode,
+            required_params=required_params,
        )
 
    def score_example(

src/tests/notification/test_notification_integration.py

Lines changed: 4 additions & 4 deletions
@@ -291,7 +291,7 @@ def mock_post_side_effect(url, *args, **kwargs):
        rule = Rule(
            name="Faithfulness Rule",
            conditions=[
-                Condition(metric=FaithfulnessScorer(threshold=0.7))
+                Condition(metric=AnswerRelevancyScorer(threshold=0.7))
            ],
            combine_type="all",
            notification=notification
@@ -300,7 +300,7 @@ def mock_post_side_effect(url, *args, **kwargs):
        # Run evaluation
        result = client.run_evaluation(
            examples=[example],
-            scorers=[FaithfulnessScorer(threshold=0.7)],
+            scorers=[AnswerRelevancyScorer(threshold=0.7)],
            model="gpt-3.5-turbo",
            rules=[rule]
        )
@@ -402,7 +402,7 @@ def mock_post_side_effect(url, *args, **kwargs):
        rule = Rule(
            name="Faithfulness Rule",
            conditions=[
-                Condition(metric=FaithfulnessScorer(threshold=0.7))
+                Condition(metric=AnswerRelevancyScorer(threshold=0.7))
            ],
            combine_type="all",
            notification=notification
@@ -411,7 +411,7 @@ def mock_post_side_effect(url, *args, **kwargs):
        # Run evaluation
        result = client.run_evaluation(
            examples=[example],
-            scorers=[FaithfulnessScorer(threshold=0.7)],
+            scorers=[AnswerRelevancyScorer(threshold=0.7)],
            model="gpt-3.5-turbo",
            rules=[rule]
        )
