
Refactor default judges #36


Merged
merged 22 commits into from Jan 13, 2025

Commits (changes shown are from 20 of the 22 commits)
efa9ad5
Experiment with new default scorer interface by adding JSON correctness
SecroLoL Jan 10, 2025
2012865
Wrap FaithfulnessScorer into individual class
SecroLoL Jan 10, 2025
bae3e87
Wrap AnswerRelevancyScorer into individual class
SecroLoL Jan 10, 2025
efe53af
Add ContextualPrecision wrapper for its own class
SecroLoL Jan 10, 2025
059bf71
Add ContextualRecall wrapper for its own class
SecroLoL Jan 10, 2025
3633a3e
Add ToolCorrectnessScorer wrapper for its own class
SecroLoL Jan 10, 2025
3ae5fb1
Add ContextualRelevancy wrapper for its own class
SecroLoL Jan 10, 2025
c2cb7b2
Add Summarization wrapper for its own class
SecroLoL Jan 10, 2025
900b503
Add HallucinationScorer wrapper for its own class
SecroLoL Jan 10, 2025
49c7d38
Remove test segment of code file (we can just use client test file in…
SecroLoL Jan 10, 2025
b007f43
Update __init__ files of the scorers/ and judgeval_scorers/ dirs for …
SecroLoL Jan 10, 2025
2451329
Restrict threshold to between 0 <= x <= 1 on init
SecroLoL Jan 10, 2025
b35822e
Add UT for AnswerRelevancyScorer
SecroLoL Jan 10, 2025
6eff5cd
Add UT for all new wrapped default scorers
SecroLoL Jan 11, 2025
6d9a907
Edit JSONCorrectnessScorer init because it has an extra field that ne…
SecroLoL Jan 11, 2025
5e636ef
Update e2e tests with new wrapped default scorer syntax
SecroLoL Jan 11, 2025
ecf7530
Remove unused imports
SecroLoL Jan 11, 2025
dcc79aa
Generalize span level async evaluation to run with any scorer, custom…
SecroLoL Jan 11, 2025
b1e0dc1
Update Pipfile
SecroLoL Jan 11, 2025
08fb199
Update tracer test script with new default scorer
SecroLoL Jan 11, 2025
ad1300d
Remove dev packages from standard packages in Pipfile
SecroLoL Jan 13, 2025
03b2287
Uncomment testing calls so all tests are run
SecroLoL Jan 13, 2025
6 changes: 4 additions & 2 deletions Pipfile
@@ -6,16 +6,18 @@ name = "pypi"
[packages]
langfuse = "==2.50.3"
litellm = "*"
openai = "==1.47.1"
python-dotenv = "==1.0.1"
together = "*"
fastapi = "*"
uvicorn = "*"
deepeval = "*"
supabase = "*"
requests = "*"
pandas = "*"
openai = "*"
together = "*"
anthropic = "*"
pytest = "*"
pytest-asyncio = "*"

[dev-packages]
pytest = "*"
58 changes: 30 additions & 28 deletions e2etests/judgment_client_test.py
@@ -5,8 +5,10 @@
import os
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import JudgmentScorer
from judgeval.constants import APIScorer
from judgeval.scorers import (
FaithfulnessScorer,
HallucinationScorer,
)
from judgeval.judges import TogetherJudge
from judgeval.playground import CustomFaithfulnessMetric
from judgeval.data.datasets.dataset import EvalDataset
@@ -53,16 +55,16 @@ def test_run_eval(client: JudgmentClient):
additional_metadata={"difficulty": "medium"}
)

scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
scorer2 = JudgmentScorer(threshold=0.5, score_type=APIScorer.HALLUCINATION)
scorer = FaithfulnessScorer(threshold=0.5)
scorer2 = HallucinationScorer(threshold=0.5)
c_scorer = CustomFaithfulnessMetric(threshold=0.6)

PROJECT_NAME = "test_project_JOSEPH"
EVAL_RUN_NAME = "yomadude"

_ = client.run_evaluation(
examples=[example1, example2],
scorers=[scorer, c_scorer],
scorers=[scorer2],
model="QWEN",
metadata={"batch": "test"},
project_name=PROJECT_NAME,
@@ -72,7 +74,7 @@ def test_run_eval(client: JudgmentClient):
)

results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
# print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)
print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)

def test_override_eval(client: JudgmentClient):
example1 = Example(
@@ -82,7 +84,7 @@ def test_override_eval(client: JudgmentClient):
trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
)

scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
scorer = FaithfulnessScorer(threshold=0.5)

PROJECT_NAME = "test_eval_run_naming_collisions"
EVAL_RUN_NAME = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
@@ -171,7 +173,7 @@ def test_evaluate_dataset(client: JudgmentClient):
dataset = EvalDataset(examples=[example1, example2])
res = client.evaluate_dataset(
dataset=dataset,
scorers=[JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)],
scorers=[FaithfulnessScorer(threshold=0.5)],
model="QWEN",
metadata={"batch": "test"},
)
@@ -180,7 +182,7 @@ def test_classifier_scorer(client: JudgmentClient):

def test_classifier_scorer(client: JudgmentClient):
classifier_scorer = client.fetch_classifier_scorer("tonescorer-72gl")
faithfulness_scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
faithfulness_scorer = FaithfulnessScorer(threshold=0.5)

example1 = Example(
input="What if these shoes don't fit?",
@@ -199,32 +201,32 @@ def test_classifier_scorer(client: JudgmentClient):
# Test client functionality
client = get_client()
ui_client = get_ui_client()
print("Client initialized successfully")
print("*" * 40)
# print("Client initialized successfully")
# print("*" * 40)

print("Testing dataset creation, pushing, and pulling")
test_dataset(ui_client)
print("Dataset creation, pushing, and pulling successful")
print("*" * 40)
# print("Testing dataset creation, pushing, and pulling")
# test_dataset(ui_client)
# print("Dataset creation, pushing, and pulling successful")
# print("*" * 40)

print("Testing evaluation run")
test_run_eval(ui_client)
print("Evaluation run successful")
print("*" * 40)

print("Testing evaluation run override")
test_override_eval(client)
print("Evaluation run override successful")
print("*" * 40)
# print("Testing evaluation run override")
# test_override_eval(client)
# print("Evaluation run override successful")
# print("*" * 40)

print("Testing dataset evaluation")
test_evaluate_dataset(ui_client)
print("Dataset evaluation successful")
print("*" * 40)
# print("Testing dataset evaluation")
# test_evaluate_dataset(ui_client)
# print("Dataset evaluation successful")
# print("*" * 40)

print("Testing classifier scorer")
test_classifier_scorer(ui_client)
print("Classifier scorer test successful")
print("*" * 40)
# print("Testing classifier scorer")
# test_classifier_scorer(ui_client)
# print("Classifier scorer test successful")
# print("*" * 40)

print("All tests passed successfully")
# print("All tests passed successfully")
10 changes: 4 additions & 6 deletions e2etests/test_tracer.py
@@ -11,6 +11,7 @@
# Local imports
from judgeval.common.tracer import Tracer, wrap
from judgeval.constants import APIScorer
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

# Initialize the tracer and clients
judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
@@ -28,13 +29,12 @@ async def make_upper(input: str) -> str:
"""
output = input.upper()
await judgment.get_current_trace().async_evaluate(
scorers=[FaithfulnessScorer(threshold=0.5)],
input="What if these shoes don't fit?",
actual_output="We offer a 30-day full refund at no extra cost.",
retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
expected_output="We offer a 30-day full refund at no extra cost.",
expected_tools=["refund"],
score_type=APIScorer.FAITHFULNESS,
threshold=0.5,
model="gpt-4o-mini",
log_results=True
)
@@ -45,6 +45,7 @@ async def make_lower(input):
output = input.lower()

await judgment.get_current_trace().async_evaluate(
scorers=[AnswerRelevancyScorer(threshold=0.5)],
input="How do I reset my password?",
actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
@@ -53,8 +54,6 @@ async def make_lower(input):
tools_called=["authentication"],
expected_tools=["authentication"],
additional_metadata={"difficulty": "medium"},
score_type=APIScorer.ANSWER_RELEVANCY,
threshold=0.5,
model="gpt-4o-mini",
log_results=True
)
@@ -68,12 +67,11 @@ def llm_call(input):
async def answer_user_question(input):
output = llm_call(input)
await judgment.get_current_trace().async_evaluate(
scorers=[AnswerRelevancyScorer(threshold=0.5)],
input=input,
actual_output=output,
retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
expected_output="We offer a 30-day full refund at no extra cost.",
score_type=APIScorer.ANSWER_RELEVANCY,
threshold=0.5,
model="gpt-4o-mini",
log_results=True
)
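With the generalized span-level evaluation, async_evaluate now takes a list of scorers instead of a single score_type/threshold pair, so default and custom scorers can be mixed. A hedged sketch of a call inside a traced span (scorer choices and thresholds are illustrative, and it assumes CustomFaithfulnessMetric from judgeval.playground is a CustomScorer subclass, as the client test above suggests):

from judgeval.scorers import FaithfulnessScorer
from judgeval.playground import CustomFaithfulnessMetric

await judgment.get_current_trace().async_evaluate(
    scorers=[FaithfulnessScorer(threshold=0.5), CustomFaithfulnessMetric(threshold=0.6)],
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
    model="gpt-4o-mini",
    log_results=True,
)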
29 changes: 17 additions & 12 deletions judgeval/common/tracer.py
@@ -7,7 +7,16 @@
import requests
import uuid
from contextlib import contextmanager
from typing import Optional, Any, List, Literal, Tuple, Generator, TypeAlias, Union
from typing import (
Optional,
Any,
List,
Literal,
Tuple,
Generator,
TypeAlias,
Union
)
from dataclasses import dataclass, field
from datetime import datetime
from openai import OpenAI
@@ -23,7 +32,7 @@
from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import JudgmentScorer
from judgeval.scorers import JudgmentScorer, CustomScorer
from judgeval.data.result import ScoringResult

# Define type aliases for better code readability and maintainability
@@ -149,6 +158,7 @@ def span(self, name: str):

async def async_evaluate(
self,
scorers: List[Union[JudgmentScorer, CustomScorer]],
input: Optional[str] = None,
actual_output: Optional[str] = None,
expected_output: Optional[str] = None,
@@ -157,8 +167,6 @@ async def async_evaluate(
tools_called: Optional[List[str]] = None,
expected_tools: Optional[List[str]] = None,
additional_metadata: Optional[Dict[str, Any]] = None,
score_type: Optional[str] = None,
threshold: Optional[float] = None,
model: Optional[str] = None,
log_results: Optional[bool] = False,
):
@@ -174,18 +182,15 @@
additional_metadata=additional_metadata,
trace_id=self.trace_id
)
scorer = JudgmentScorer(
score_type=score_type,
threshold=threshold
)
_, scoring_results = self.client.run_evaluation(
scoring_results = self.client.run_evaluation(
examples=[example],
scorers=[scorer],
scorers=scorers,
model=model,
metadata={},
log_results=log_results,
project_name="TestSpanLevel",
eval_run_name="TestSpanLevel",
project_name="TestSpanLevel1", # TODO this should be dynamic
[Review comment — Collaborator] I will handle this, I'll share my thoughts in Slack. In my multi-step eval PR, I added a project and trace name, which will tie in nicely to generate automatic eval run names (to improve UX while remaining clear).

[Review comment — Contributor Author] Nice B)

eval_run_name="TestSpanLevel1",
override=True,
)

self.record_evaluation(scoring_results, start_time) # Pass start_time to record_evaluation
1 change: 1 addition & 0 deletions judgeval/constants.py
@@ -20,6 +20,7 @@ class APIScorer(str, Enum):
CONTEXTUAL_RELEVANCY = "contextual_relevancy"
CONTEXTUAL_PRECISION = "contextual_precision"
TOOL_CORRECTNESS = "tool_correctness"
JSON_CORRECTNESS = "json_correctness"

@classmethod
def _missing_(cls, value):
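The new JSON_CORRECTNESS value backs the JSONCorrectnessScorer wrapper added in this PR. Per the commit notes, that scorer takes one extra field beyond the threshold; the sketch below assumes that field is a Pydantic schema, and the json_schema parameter name is hypothetical:

from pydantic import BaseModel
from judgeval.scorers import JSONCorrectnessScorer

class RefundPolicy(BaseModel):  # hypothetical schema used only for illustration
    days: int
    full_refund: bool

scorer = JSONCorrectnessScorer(threshold=1.0, json_schema=RefundPolicy)  # parameter name is an assumption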
38 changes: 0 additions & 38 deletions judgeval/run_evaluation.py
@@ -21,11 +21,8 @@
ROOT_API,
JUDGMENT_EVAL_API_URL,
JUDGMENT_EVAL_LOG_API_URL,
APIScorer,
)
from judgeval.common.exceptions import JudgmentAPIError
from judgeval.playground import CustomFaithfulnessMetric
from judgeval.judges import TogetherJudge, MixtureOfJudges
from judgeval.evaluation_run import EvaluationRun
from judgeval.common.logger import (
enable_logging,
@@ -356,38 +353,3 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
if not result.scorers_data: # none of the scorers could be executed on this example
info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
return merged_results


if __name__ == "__main__":
from judgeval.common.logger import enable_logging, debug, info
from judgeval.common.tracer import Tracer

# TODO comeback and delete this, move this to a demo example
# Eval using a proprietary Judgment Scorer
from judgeval.judgment_client import JudgmentClient

example1 = Example(
input="What if these shoes don't fit?",
actual_output="We offer a 30-day full refund at no extra cost.", # replace this with your code's actual output
retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
)

example2 = Example(
input="How do I reset my password?",
actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
name="Password Reset",
context=["User Account"],
retrieval_context=["Password reset instructions"],
tools_called=["authentication"],
expected_tools=["authentication"],
additional_metadata={"difficulty": "medium"}
)


scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
scorer2 = JudgmentScorer(threshold=0.5, score_type=APIScorer.HALLUCINATION)
c_scorer = CustomFaithfulnessMetric(threshold=0.6)


client = JudgmentClient()
27 changes: 26 additions & 1 deletion judgeval/scorers/__init__.py
@@ -1,5 +1,30 @@
from judgeval.scorers.base_scorer import JudgmentScorer
from judgeval.scorers.custom_scorer import CustomScorer
from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
from judgeval.scorers.judgeval_scorers import (
ToolCorrectnessScorer,
JSONCorrectnessScorer,
SummarizationScorer,
HallucinationScorer,
FaithfulnessScorer,
ContextualRelevancyScorer,
ContextualPrecisionScorer,
ContextualRecallScorer,
AnswerRelevancyScorer,
)

__all__ = ["JudgmentScorer", "CustomScorer", "PromptScorer", "ClassifierScorer"]
__all__ = [
"JudgmentScorer",
"CustomScorer",
"PromptScorer",
"ClassifierScorer",
"ToolCorrectnessScorer",
"JSONCorrectnessScorer",
"SummarizationScorer",
"HallucinationScorer",
"FaithfulnessScorer",
"ContextualRelevancyScorer",
"ContextualPrecisionScorer",
"ContextualRecallScorer",
"AnswerRelevancyScorer",
]
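With the expanded __all__, every wrapped default scorer is importable directly from judgeval.scorers. A quick sketch of the public surface (assuming each wrapper takes the same threshold argument the tests above show for the other scorers):

from judgeval.scorers import (
    AnswerRelevancyScorer,
    ContextualPrecisionScorer,
    ContextualRecallScorer,
    ContextualRelevancyScorer,
    FaithfulnessScorer,
    HallucinationScorer,
    SummarizationScorer,
    ToolCorrectnessScorer,
)

scorers = [AnswerRelevancyScorer(threshold=0.5), ToolCorrectnessScorer(threshold=0.5)]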
14 changes: 12 additions & 2 deletions judgeval/scorers/base_scorer.py
@@ -16,10 +16,21 @@ class JudgmentScorer(BaseModel):

Args:
score_type (APIScorer): The Judgment metric to use for scoring `Example`s
threshold (float): A value between 0 and 1 that determines the scoring threshold
"""
threshold: float
score_type: APIScorer

@field_validator('threshold')
def validate_threshold(cls, v):
"""
Validates that the threshold is between 0 and 1 inclusive.
"""
if not 0 <= v <= 1:
error(f"Threshold must be between 0 and 1, got: {v}")
raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
return v

@field_validator('score_type')
def convert_to_enum_value(cls, v):
"""
@@ -37,5 +48,4 @@ def convert_to_enum_value(cls, v):
raise ValueError(f"Invalid value for score_type: {v}")

def __str__(self):
return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"

return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"