Refactor default judges #36
Changes from 20 commits
@@ -7,7 +7,16 @@
 import requests
 import uuid
 from contextlib import contextmanager
-from typing import Optional, Any, List, Literal, Tuple, Generator, TypeAlias, Union
+from typing import (
+    Optional,
+    Any,
+    List,
+    Literal,
+    Tuple,
+    Generator,
+    TypeAlias,
+    Union
+)
 from dataclasses import dataclass, field
 from datetime import datetime
 from openai import OpenAI
@@ -23,7 +32,7 @@
 from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import JudgmentScorer
+from judgeval.scorers import JudgmentScorer, CustomScorer
 from judgeval.data.result import ScoringResult

 # Define type aliases for better code readability and maintainability
@@ -149,6 +158,7 @@ def span(self, name: str):

     async def async_evaluate(
         self,
+        scorers: List[Union[JudgmentScorer, CustomScorer]],
         input: Optional[str] = None,
         actual_output: Optional[str] = None,
         expected_output: Optional[str] = None,
@@ -157,8 +167,6 @@ async def async_evaluate(
         tools_called: Optional[List[str]] = None,
         expected_tools: Optional[List[str]] = None,
         additional_metadata: Optional[Dict[str, Any]] = None,
-        score_type: Optional[str] = None,
-        threshold: Optional[float] = None,
         model: Optional[str] = None,
         log_results: Optional[bool] = False,
     ):
@@ -174,18 +182,15 @@ async def async_evaluate(
             additional_metadata=additional_metadata,
             trace_id=self.trace_id
         )
-        scorer = JudgmentScorer(
-            score_type=score_type,
-            threshold=threshold
-        )
-        _, scoring_results = self.client.run_evaluation(
+        scoring_results = self.client.run_evaluation(
             examples=[example],
-            scorers=[scorer],
+            scorers=scorers,
             model=model,
             metadata={},
             log_results=log_results,
-            project_name="TestSpanLevel",
-            eval_run_name="TestSpanLevel",
+            project_name="TestSpanLevel1",  # TODO this should be dynamic
Review comment: I will handle this; I'll share my thoughts in Slack. In my multi-step eval PR, I added a project and trace name, which will tie in nicely for generating automatic eval run names (improving UX while remaining clear).

Reply: Nice B)
+            eval_run_name="TestSpanLevel1",
             override=True,
         )

         self.record_evaluation(scoring_results, start_time)  # Pass start_time to record_evaluation
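For context on the refactor above, here is a minimal, hedged sketch of how the new async_evaluate signature might be called from traced code: scorer objects are now constructed by the caller and passed in as a list, instead of the method building a single JudgmentScorer from score_type and threshold. The span object, the `threshold` keyword on the default scorers, and the model name are assumptions for illustration, not part of this diff.

```python
# Illustrative sketch only (not from this PR): assumes a trace/span object that
# exposes the refactored async_evaluate, and assumes the default scorers accept
# a `threshold` keyword argument.
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

async def check_response(trace_span, question: str, answer: str):
    # Callers now pass ready-made scorer instances; several scorers can be
    # combined in a single call.
    await trace_span.async_evaluate(
        scorers=[
            FaithfulnessScorer(threshold=0.7),      # hypothetical constructor args
            AnswerRelevancyScorer(threshold=0.5),   # hypothetical constructor args
        ],
        input=question,
        actual_output=answer,
        model="gpt-4o-mini",    # illustrative model name
        log_results=True,
    )
```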
@@ -1,5 +1,30 @@
 from judgeval.scorers.base_scorer import JudgmentScorer
 from judgeval.scorers.custom_scorer import CustomScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
+from judgeval.scorers.judgeval_scorers import (
+    ToolCorrectnessScorer,
+    JSONCorrectnessScorer,
+    SummarizationScorer,
+    HallucinationScorer,
+    FaithfulnessScorer,
+    ContextualRelevancyScorer,
+    ContextualPrecisionScorer,
+    ContextualRecallScorer,
+    AnswerRelevancyScorer,
+)

-__all__ = ["JudgmentScorer", "CustomScorer", "PromptScorer", "ClassifierScorer"]
+__all__ = [
+    "JudgmentScorer",
+    "CustomScorer",
+    "PromptScorer",
+    "ClassifierScorer",
+    "ToolCorrectnessScorer",
+    "JSONCorrectnessScorer",
+    "SummarizationScorer",
+    "HallucinationScorer",
+    "FaithfulnessScorer",
+    "ContextualRelevancyScorer",
+    "ContextualPrecisionScorer",
+    "ContextualRecallScorer",
+    "AnswerRelevancyScorer",
+]
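To show what the expanded exports enable, here is a hedged sketch of running an evaluation with the newly re-exported default scorers through JudgmentClient.run_evaluation, mirroring the parameters visible in the first file's diff. Client construction, scorer constructor defaults, and the Example field set are assumptions for illustration.

```python
# Illustrative sketch only: default judges imported straight from
# judgeval.scorers, then passed to run_evaluation as in the refactored tracer.
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer, HallucinationScorer

client = JudgmentClient()  # assumes credentials are resolved from the environment

example = Example(
    input="Summarize the support ticket.",
    actual_output="The user cannot log in after resetting their password.",
)

results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(), HallucinationScorer()],  # assumed default construction
    model="gpt-4o-mini",             # illustrative model name
    metadata={},
    log_results=False,
    project_name="ExampleProject",   # illustrative; the diff notes this should become dynamic
    eval_run_name="ExampleRun",
    override=True,
)
```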