
Commit b6944b3

Merge pull request #36 from JudgmentLabs/refactor_default_judges
Refactor default judges
2 parents 8d83f32 + 03b2287 commit b6944b3

27 files changed (+521 / -69 lines)

Pipfile

Lines changed: 2 additions & 2 deletions
@@ -6,15 +6,15 @@ name = "pypi"
 [packages]
 langfuse = "==2.50.3"
 litellm = "*"
-openai = "==1.47.1"
 python-dotenv = "==1.0.1"
-together = "*"
 fastapi = "*"
 uvicorn = "*"
 deepeval = "*"
 supabase = "*"
 requests = "*"
 pandas = "*"
+openai = "*"
+together = "*"
 anthropic = "*"

 [dev-packages]

e2etests/judgment_client_test.py

Lines changed: 10 additions & 8 deletions
@@ -5,8 +5,10 @@
 import os
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import JudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers import (
+    FaithfulnessScorer,
+    HallucinationScorer,
+)
 from judgeval.judges import TogetherJudge
 from judgeval.playground import CustomFaithfulnessMetric
 from judgeval.data.datasets.dataset import EvalDataset
@@ -53,8 +55,8 @@ def test_run_eval(client: JudgmentClient):
         additional_metadata={"difficulty": "medium"}
     )

-    scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
-    scorer2 = JudgmentScorer(threshold=0.5, score_type=APIScorer.HALLUCINATION)
+    scorer = FaithfulnessScorer(threshold=0.5)
+    scorer2 = HallucinationScorer(threshold=0.5)
     c_scorer = CustomFaithfulnessMetric(threshold=0.6)

     PROJECT_NAME = "test_project_JOSEPH"
@@ -72,7 +74,7 @@ def test_run_eval(client: JudgmentClient):
     )

     results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
-    # print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)
+    print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)

 def test_override_eval(client: JudgmentClient):
     example1 = Example(
@@ -82,7 +84,7 @@ def test_override_eval(client: JudgmentClient):
         trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
     )

-    scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
+    scorer = FaithfulnessScorer(threshold=0.5)

     PROJECT_NAME = "test_eval_run_naming_collisions"
     EVAL_RUN_NAME = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
@@ -171,7 +173,7 @@ def test_evaluate_dataset(client: JudgmentClient):
     dataset = EvalDataset(examples=[example1, example2])
     res = client.evaluate_dataset(
         dataset=dataset,
-        scorers=[JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)],
+        scorers=[FaithfulnessScorer(threshold=0.5)],
         model="QWEN",
         metadata={"batch": "test"},
     )
@@ -180,7 +182,7 @@

 def test_classifier_scorer(client: JudgmentClient):
     classifier_scorer = client.fetch_classifier_scorer("tonescorer-72gl")
-    faithfulness_scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
+    faithfulness_scorer = FaithfulnessScorer(threshold=0.5)

     example1 = Example(
         input="What if these shoes don't fit?",

e2etests/test_tracer.py

Lines changed: 4 additions & 6 deletions
@@ -11,6 +11,7 @@
 # Local imports
 from judgeval.common.tracer import Tracer, wrap
 from judgeval.constants import APIScorer
+from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

 # Initialize the tracer and clients
 judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
@@ -28,13 +29,12 @@ async def make_upper(input: str) -> str:
     """
     output = input.upper()
     await judgment.get_current_trace().async_evaluate(
+        scorers=[FaithfulnessScorer(threshold=0.5)],
         input="What if these shoes don't fit?",
         actual_output="We offer a 30-day full refund at no extra cost.",
         retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
         expected_output="We offer a 30-day full refund at no extra cost.",
         expected_tools=["refund"],
-        score_type=APIScorer.FAITHFULNESS,
-        threshold=0.5,
         model="gpt-4o-mini",
         log_results=True
     )
@@ -45,6 +45,7 @@ async def make_lower(input):
     output = input.lower()

     await judgment.get_current_trace().async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
         input="How do I reset my password?",
         actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
         expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
@@ -53,8 +54,6 @@ async def make_lower(input):
         tools_called=["authentication"],
         expected_tools=["authentication"],
         additional_metadata={"difficulty": "medium"},
-        score_type=APIScorer.ANSWER_RELEVANCY,
-        threshold=0.5,
         model="gpt-4o-mini",
         log_results=True
     )
@@ -68,12 +67,11 @@ def llm_call(input):
 async def answer_user_question(input):
     output = llm_call(input)
     await judgment.get_current_trace().async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
         input=input,
         actual_output=output,
         retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
         expected_output="We offer a 30-day full refund at no extra cost.",
-        score_type=APIScorer.ANSWER_RELEVANCY,
-        threshold=0.5,
         model="gpt-4o-mini",
         log_results=True
     )

judgeval/common/tracer.py

Lines changed: 17 additions & 12 deletions
@@ -7,7 +7,16 @@
 import requests
 import uuid
 from contextlib import contextmanager
-from typing import Optional, Any, List, Literal, Tuple, Generator, TypeAlias, Union
+from typing import (
+    Optional,
+    Any,
+    List,
+    Literal,
+    Tuple,
+    Generator,
+    TypeAlias,
+    Union
+)
 from dataclasses import dataclass, field
 from datetime import datetime
 from openai import OpenAI
@@ -23,7 +32,7 @@
 from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import JudgmentScorer
+from judgeval.scorers import JudgmentScorer, CustomScorer
 from judgeval.data.result import ScoringResult

 # Define type aliases for better code readability and maintainability
@@ -149,6 +158,7 @@ def span(self, name: str):

     async def async_evaluate(
         self,
+        scorers: List[Union[JudgmentScorer, CustomScorer]],
         input: Optional[str] = None,
         actual_output: Optional[str] = None,
         expected_output: Optional[str] = None,
@@ -157,8 +167,6 @@ async def async_evaluate(
         tools_called: Optional[List[str]] = None,
         expected_tools: Optional[List[str]] = None,
         additional_metadata: Optional[Dict[str, Any]] = None,
-        score_type: Optional[str] = None,
-        threshold: Optional[float] = None,
         model: Optional[str] = None,
         log_results: Optional[bool] = False,
     ):
@@ -174,18 +182,15 @@ async def async_evaluate(
             additional_metadata=additional_metadata,
             trace_id=self.trace_id
         )
-        scorer = JudgmentScorer(
-            score_type=score_type,
-            threshold=threshold
-        )
-        _, scoring_results = self.client.run_evaluation(
+        scoring_results = self.client.run_evaluation(
             examples=[example],
-            scorers=[scorer],
+            scorers=scorers,
             model=model,
             metadata={},
             log_results=log_results,
-            project_name="TestSpanLevel",
-            eval_run_name="TestSpanLevel",
+            project_name="TestSpanLevel1",  # TODO this should be dynamic
+            eval_run_name="TestSpanLevel1",
+            override=True,
         )

         self.record_evaluation(scoring_results, start_time)  # Pass start_time to record_evaluation
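
Because async_evaluate now takes fully constructed scorer objects instead of a single score_type/threshold pair, one span-level call can attach several scorers at once. A minimal caller sketch (the scorer choices and field values are illustrative, not taken from this diff; it must run inside an async, traced function as in e2etests/test_tracer.py):

from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

await judgment.get_current_trace().async_evaluate(
    scorers=[FaithfulnessScorer(threshold=0.5), AnswerRelevancyScorer(threshold=0.5)],
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
    model="gpt-4o-mini",
    log_results=True,
)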

judgeval/constants.py

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ class APIScorer(str, Enum):
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
     TOOL_CORRECTNESS = "tool_correctness"
+    JSON_CORRECTNESS = "json_correctness"

     @classmethod
     def _missing_(cls, value):
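
APIScorer is a str-backed Enum, so the new member is addressable as an attribute or by its string value, and it can still be passed to the generic JudgmentScorer directly. A small sketch (the JSONCorrectnessScorer default class added elsewhere in this PR presumably pins this member the same way):

from judgeval.constants import APIScorer
from judgeval.scorers import JudgmentScorer

assert APIScorer.JSON_CORRECTNESS.value == "json_correctness"

# The generic scorer accepts the new member like any other
scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.JSON_CORRECTNESS)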

judgeval/run_evaluation.py

Lines changed: 0 additions & 38 deletions
@@ -21,11 +21,8 @@
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
-    APIScorer,
 )
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.playground import CustomFaithfulnessMetric
-from judgeval.judges import TogetherJudge, MixtureOfJudges
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.common.logger import (
     enable_logging,
@@ -356,38 +353,3 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
         if not result.scorers_data:  # none of the scorers could be executed on this example
             info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
     return merged_results
-
-
-if __name__ == "__main__":
-    from judgeval.common.logger import enable_logging, debug, info
-    from judgeval.common.tracer import Tracer
-
-    # TODO comeback and delete this, move this to a demo example
-    # Eval using a proprietary Judgment Scorer
-    from judgeval.judgment_client import JudgmentClient
-
-    example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",  # replace this with your code's actual output
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-    )
-
-    example2 = Example(
-        input="How do I reset my password?",
-        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        name="Password Reset",
-        context=["User Account"],
-        retrieval_context=["Password reset instructions"],
-        tools_called=["authentication"],
-        expected_tools=["authentication"],
-        additional_metadata={"difficulty": "medium"}
-    )
-
-
-    scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
-    scorer2 = JudgmentScorer(threshold=0.5, score_type=APIScorer.HALLUCINATION)
-    c_scorer = CustomFaithfulnessMetric(threshold=0.6)
-
-
-    client = JudgmentClient()
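
The ad-hoc demo that used to live under if __name__ == "__main__" is removed here. An equivalent standalone smoke test with the refactored scorers would look roughly like the sketch below (run_evaluation's keyword arguments are assumed from the call in judgeval/common/tracer.py; the project and run names are hypothetical):

from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer, HallucinationScorer

client = JudgmentClient()

example = Example(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
)

results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.5), HallucinationScorer(threshold=0.5)],
    model="gpt-4o-mini",            # illustrative model name
    metadata={},
    log_results=True,
    project_name="demo_project",    # hypothetical
    eval_run_name="demo_run",       # hypothetical
    override=True,
)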

judgeval/scorers/__init__.py

Lines changed: 26 additions & 1 deletion
@@ -1,5 +1,30 @@
 from judgeval.scorers.base_scorer import JudgmentScorer
 from judgeval.scorers.custom_scorer import CustomScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
+from judgeval.scorers.judgeval_scorers import (
+    ToolCorrectnessScorer,
+    JSONCorrectnessScorer,
+    SummarizationScorer,
+    HallucinationScorer,
+    FaithfulnessScorer,
+    ContextualRelevancyScorer,
+    ContextualPrecisionScorer,
+    ContextualRecallScorer,
+    AnswerRelevancyScorer,
+)

-__all__ = ["JudgmentScorer", "CustomScorer", "PromptScorer", "ClassifierScorer"]
+__all__ = [
+    "JudgmentScorer",
+    "CustomScorer",
+    "PromptScorer",
+    "ClassifierScorer",
+    "ToolCorrectnessScorer",
+    "JSONCorrectnessScorer",
+    "SummarizationScorer",
+    "HallucinationScorer",
+    "FaithfulnessScorer",
+    "ContextualRelevancyScorer",
+    "ContextualPrecisionScorer",
+    "ContextualRecallScorer",
+    "AnswerRelevancyScorer",
+]
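
With these re-exports, downstream code only needs the judgeval.scorers namespace for both the generic and the default scorers; a short sketch:

from judgeval.scorers import JudgmentScorer, FaithfulnessScorer, AnswerRelevancyScorer

scorers = [
    FaithfulnessScorer(threshold=0.5),
    AnswerRelevancyScorer(threshold=0.5),
]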

judgeval/scorers/base_scorer.py

Lines changed: 12 additions & 2 deletions
@@ -16,10 +16,21 @@ class JudgmentScorer(BaseModel):

     Args:
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+        threshold (float): A value between 0 and 1 that determines the scoring threshold
     """
     threshold: float
     score_type: APIScorer

+    @field_validator('threshold')
+    def validate_threshold(cls, v):
+        """
+        Validates that the threshold is between 0 and 1 inclusive.
+        """
+        if not 0 <= v <= 1:
+            error(f"Threshold must be between 0 and 1, got: {v}")
+            raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+        return v
+
     @field_validator('score_type')
     def convert_to_enum_value(cls, v):
         """
@@ -37,5 +48,4 @@ def convert_to_enum_value(cls, v):
         raise ValueError(f"Invalid value for score_type: {v}")

     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
+        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"

judgeval/scorers/judgeval_scorers/__init__.py

Lines changed: 21 additions & 0 deletions

@@ -0,0 +1,21 @@
+from judgeval.scorers.judgeval_scorers.tool_correctness import ToolCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.json_correctness import JSONCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.summarization import SummarizationScorer
+from judgeval.scorers.judgeval_scorers.hallucination import HallucinationScorer
+from judgeval.scorers.judgeval_scorers.faithfulness import FaithfulnessScorer
+from judgeval.scorers.judgeval_scorers.contextual_relevancy import ContextualRelevancyScorer
+from judgeval.scorers.judgeval_scorers.contextual_precision import ContextualPrecisionScorer
+from judgeval.scorers.judgeval_scorers.contextual_recall import ContextualRecallScorer
+from judgeval.scorers.judgeval_scorers.answer_relevancy import AnswerRelevancyScorer
+
+__all__ = [
+    "ToolCorrectnessScorer",
+    "JSONCorrectnessScorer",
+    "SummarizationScorer",
+    "HallucinationScorer",
+    "FaithfulnessScorer",
+    "ContextualRelevancyScorer",
+    "ContextualPrecisionScorer",
+    "ContextualRecallScorer",
+    "AnswerRelevancyScorer",
+]

judgeval/scorers/judgeval_scorers/answer_relevancy.py

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+"""
+`judgeval` answer relevancy scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class AnswerRelevancyScorer(JudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
+
+    @property
+    def __name__(self):
+        return "Answer Relevancy"
