Add UT for Data and Scorers library #26

Merged
merged 30 commits into main from alex/add-unit-tests on Dec 27, 2024
Changes from 27 commits

Commits
5ec6372
remove post_init method from Example because it won't be called anywa…
SecroLoL Dec 24, 2024
3f09924
Merge remote-tracking branch 'origin/joseph/add-unit-tests' into alex…
SecroLoL Dec 24, 2024
2dcacd1
Add UT for data/example.py
SecroLoL Dec 24, 2024
c4ed183
Refactor to remove field variables from ProcessExample
SecroLoL Dec 24, 2024
93b349b
Add UT for API Example
SecroLoL Dec 24, 2024
adcada4
Add UT for data/result.py (ScoringResult)
SecroLoL Dec 24, 2024
73c070c
Add UT for scorer_data.py script
SecroLoL Dec 24, 2024
6ab90b3
Add UT for data/datasets/dataset.py
SecroLoL Dec 24, 2024
ed65e1b
Assert that mock endpoints are called once
SecroLoL Dec 24, 2024
f6dfa1b
Add UT for data/datasets/ground_truth.py
SecroLoL Dec 24, 2024
38b74c9
Write UT for data/datasets/test_dataset_utils.py
SecroLoL Dec 24, 2024
4d99ded
Add UT for scorers/
SecroLoL Dec 25, 2024
266626d
Remove import for Example type hinting in custom_scorer.py to avoid c…
SecroLoL Dec 25, 2024
f3c5006
Add UT for score.py functions a_safe_score_example and score_task
SecroLoL Dec 25, 2024
b27e15f
Add UT for score.py, TODO fix two of the failing test cases
SecroLoL Dec 25, 2024
16c77d9
Edit execute_with_semaphore() helper to prevent error propagation.
SecroLoL Dec 25, 2024
b5c3b15
Add safe attribute checking for skipped scorers.
SecroLoL Dec 25, 2024
c3f209c
Fix two buggy tests for scoring utils functions.
SecroLoL Dec 25, 2024
18f7e2f
Slight bugfixes for the PromptScorer class: change _show_indicator to …
SecroLoL Dec 25, 2024
99706a5
Add UT for PromptScorer code.
SecroLoL Dec 25, 2024
a34569b
Add UT for custom_scorer.py
SecroLoL Dec 25, 2024
3bfe9f9
Handle lowercase values for APIScorer initialization
SecroLoL Dec 25, 2024
6dcfbb0
Add UT for base_scorer.py
SecroLoL Dec 25, 2024
1bf081f
Add stricter typechecking in the ground_truth_to_examples and example…
SecroLoL Dec 25, 2024
b830cb1
Debug some UTs in the datasets utils
SecroLoL Dec 25, 2024
e226ff9
Comment out the problematic logger unit testing.
JCamyre Dec 27, 2024
95319e8
Merge branch 'main' into alex/add-unit-tests
JCamyre Dec 27, 2024
e805737
Update Pipfile to include missing packages. Rename TestScorer in unit…
JCamyre Dec 27, 2024
37545f7
Merge branch 'alex/add-unit-tests' of https://github.com/JudgmentLabs…
JCamyre Dec 27, 2024
50e2c9a
Merge branch 'main' of https://github.com/JudgmentLabs/judgeval into …
JCamyre Dec 27, 2024
10 changes: 8 additions & 2 deletions judgeval/constants.py
@@ -5,7 +5,7 @@
 from enum import Enum
 import litellm
 
-class APIScorer(Enum):
+class APIScorer(str, Enum):
     """
     Collection of proprietary scorers implemented by Judgment.
 
@@ -20,7 +20,13 @@ class APIScorer(Enum):
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
     TOOL_CORRECTNESS = "tool_correctness"
 
+    @classmethod
+    def _missing_(cls, value):
+        # Handle case-insensitive lookup
+        for member in cls:
+            if member.value == value.lower():
+                return member
 
 ROOT_API = "http://127.0.0.1:8000"
 # ROOT_API = "https://api.judgmentlabs.ai" # TODO replace this with the actual API root
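Together, the str mixin and the _missing_ hook make APIScorer members compare equal to plain strings and resolve case-insensitively. A minimal standalone sketch of the pattern (members trimmed to two for brevity):

    from enum import Enum

    class APIScorer(str, Enum):
        FAITHFULNESS = "faithfulness"
        TOOL_CORRECTNESS = "tool_correctness"

        @classmethod
        def _missing_(cls, value):
            # Enum calls this when the plain value lookup fails; retry lowercased.
            for member in cls:
                if member.value == value.lower():
                    return member
            # Falling off the end returns None, so Enum raises its usual ValueError.

    print(APIScorer("FAITHFULNESS"))                 # APIScorer.FAITHFULNESS, via _missing_
    print(APIScorer.FAITHFULNESS == "faithfulness")  # True, thanks to the str mixin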
56 changes: 26 additions & 30 deletions judgeval/data/api_example.py
@@ -13,28 +13,24 @@ class ProcessExample(BaseModel):
     """
     name: str
     input: Optional[str] = None
-    actual_output: Optional[str] = Field(None, alias="actualOutput")
-    expected_output: Optional[str] = Field(None, alias="expectedOutput")
-    context: Optional[list] = Field(None)
-    retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
-    tools_called: Optional[list] = Field(None, alias="toolsCalled")
-    expected_tools: Optional[list] = Field(None, alias="expectedTools")
+    actual_output: Optional[str] = None
+    expected_output: Optional[str] = None
+    context: Optional[list] = None
+    retrieval_context: Optional[list] = None
+    tools_called: Optional[list] = None
+    expected_tools: Optional[list] = None
 
     # make these optional, not all test cases in a conversation will be evaluated
-    success: Union[bool, None] = Field(None)
-    scorers_data: Union[List[ScorerData], None] = Field(
-        None, alias="scorersData"
-    )
-    run_duration: Union[float, None] = Field(None, alias="runDuration")
-    evaluation_cost: Union[float, None] = Field(None, alias="evaluationCost")
+    success: Optional[bool] = None
+    scorers_data: Optional[List[ScorerData]] = None
+    run_duration: Optional[float] = None
+    evaluation_cost: Optional[float] = None
 
-    order: Union[int, None] = Field(None)
+    order: Optional[int] = None
     # These should map 1 to 1 from golden
-    additional_metadata: Optional[Dict] = Field(
-        None, alias="additionalMetadata"
-    )
-    comments: Optional[str] = Field(None)
-    trace_id: Optional[str] = Field(None)
+    additional_metadata: Optional[Dict] = None
+    comments: Optional[str] = None
+    trace_id: Optional[str] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     def update_scorer_data(self, scorer_data: ScorerData):
@@ -65,12 +61,12 @@ def update_run_duration(self, run_duration: float):
     @model_validator(mode="before")
     def check_input(cls, values: Dict[str, Any]):
         input = values.get("input")
-        actual_output = values.get("actualOutput")
+        actual_output = values.get("actual_output")
 
         if (input is None or actual_output is None):
-            error(f"Validation error: Required fields missing. input={input}, actualOutput={actual_output}")
+            error(f"Validation error: Required fields missing. input={input}, actual_output={actual_output}")
             raise ValueError(
-                "'input' and 'actualOutput' must be provided."
+                "'input' and 'actual_output' must be provided."
             )
 
         return values
@@ -97,18 +93,18 @@ def create_process_example(
     process_ex = ProcessExample(
         name=name,
         input=example.input,
-        actualOutput=example.actual_output,
-        expectedOutput=example.expected_output,
+        actual_output=example.actual_output,
+        expected_output=example.expected_output,
         context=example.context,
-        retrievalContext=example.retrieval_context,
-        toolsCalled=example.tools_called,
-        expectedTools=example.expected_tools,
+        retrieval_context=example.retrieval_context,
+        tools_called=example.tools_called,
+        expected_tools=example.expected_tools,
         success=success,
-        scorersData=scorers_data,
-        runDuration=None,
-        evaluationCost=None,
+        scorers_data=scorers_data,
+        run_duration=None,
+        evaluation_cost=None,
         order=order,
-        additionalMetadata=example.additional_metadata,
+        additional_metadata=example.additional_metadata,
         trace_id=example.trace_id
     )
     return process_ex
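With the camelCase aliases removed, construction and validation both use snake_case keys end to end. A short usage sketch (the import path is assumed from the file layout):

    from pydantic import ValidationError
    from judgeval.data.api_example import ProcessExample  # import path assumed

    # Construction now uses the field names directly, no aliases needed.
    ok = ProcessExample(name="demo", input="What is 2+2?", actual_output="4")

    try:
        ProcessExample(name="demo", input="What is 2+2?")  # actual_output missing
    except ValidationError as e:
        # check_input's ValueError surfaces as a pydantic ValidationError with
        # the updated message: "'input' and 'actual_output' must be provided."
        print(e)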
9 changes: 9 additions & 0 deletions judgeval/data/datasets/utils.py
@@ -14,6 +14,11 @@ def examples_to_ground_truths(examples: List[Example]) -> List[GroundTruthExample]:
     Returns:
         List[GroundTruthExample]: A list of `GroundTruthExample` objects.
     """
+
+    if not isinstance(examples, list):
+        raise TypeError("Input should be a list of `Example` objects")
+
+    ground_truths = []
     ground_truths = []
     for e in examples:
         g_truth = {
@@ -45,6 +50,10 @@ def ground_truths_to_examples(
     Returns:
         List[Example]: A list of `Example` objects.
     """
+
+    if not isinstance(ground_truths, list):
+        raise TypeError("Input should be a list of `GroundTruthExample` objects")
+
     examples = []
    for index, ground_truth in enumerate(ground_truths):
         e = Example(
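The new isinstance guards turn a confusing downstream failure (iterating over a string or a bare Example) into an immediate, descriptive TypeError. A quick sketch (import path assumed):

    from judgeval.data.datasets.utils import examples_to_ground_truths  # import path assumed

    try:
        examples_to_ground_truths("not a list")  # a single string, not a list of Examples
    except TypeError as e:
        print(e)  # Input should be a list of `Example` objects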
35 changes: 0 additions & 35 deletions judgeval/data/example.py
@@ -37,41 +37,6 @@ class Example(BaseModel):
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
 
-    def __post_init__(self):
-        # Ensure `context` is None or a list of strings
-        if self.context is not None:
-            if not isinstance(self.context, list) or not all(
-                isinstance(item, str) for item in self.context
-            ):
-                raise TypeError("'context' must be None or a list of strings")
-
-        # Ensure `retrieval_context` is None or a list of strings
-        if self.retrieval_context is not None:
-            if not isinstance(self.retrieval_context, list) or not all(
-                isinstance(item, str) for item in self.retrieval_context
-            ):
-                raise TypeError(
-                    "'retrieval_context' must be None or a list of strings"
-                )
-
-        # Ensure `tools_called` is None or a list of strings
-        if self.tools_called is not None:
-            if not isinstance(self.tools_called, list) or not all(
-                isinstance(item, str) for item in self.tools_called
-            ):
-                raise TypeError(
-                    "'tools_called' must be None or a list of strings"
-                )
-
-        # Ensure `expected_tools` is None or a list of strings
-        if self.expected_tools is not None:
-            if not isinstance(self.expected_tools, list) or not all(
-                isinstance(item, str) for item in self.expected_tools
-            ):
-                raise TypeError(
-                    "'expected_tools' must be None or a list of strings"
-                )
-
     def __init__(self, **data):
         super().__init__(**data)
         # Set timestamp if not provided
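The deletion is safe because __post_init__ is a dataclasses hook that pydantic's BaseModel never calls, so these checks were dead code. If equivalent validation were wanted, one option is a pydantic v2 field validator; this is a hedged sketch of that alternative, not what the PR does:

    from typing import List, Optional
    from pydantic import BaseModel, field_validator

    class Example(BaseModel):
        context: Optional[List[str]] = None

        @field_validator("context", mode="before")
        @classmethod
        def check_context(cls, v):
            # Same intent as the removed check. Note that in pydantic v2 only
            # ValueError/AssertionError are wrapped; a TypeError propagates as-is.
            if v is not None and (
                not isinstance(v, list) or not all(isinstance(i, str) for i in v)
            ):
                raise TypeError("'context' must be None or a list of strings")
            return v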
5 changes: 2 additions & 3 deletions judgeval/scorers/custom_scorer.py
@@ -9,7 +9,6 @@
 from abc import abstractmethod
 
 from judgeval.common.logger import debug, info, warning, error
-from judgeval.data import Example
 from judgeval.judges import judgevalJudge
 from judgeval.judges.utils import create_judge
 
@@ -84,7 +83,7 @@ def _add_model(self, model: Optional[Union[str, List[str], judgevalJudge]] = None
            self.evaluation_model = self.model.get_model_name()
 
     @abstractmethod
-    def score_example(self, example: Example, *args, **kwargs) -> float:
+    def score_example(self, example, *args, **kwargs) -> float:
         """
         Measures the score on a single example
         """
@@ -93,7 +92,7 @@ def score_example(self, example: Example, *args, **kwargs) -> float:
         raise NotImplementedError("You must implement the `score` method in your custom scorer")
 
     @abstractmethod
-    async def a_score_example(self, example: Example, *args, **kwargs) -> float:
+    async def a_score_example(self, example, *args, **kwargs) -> float:
         """
         Asynchronously measures the score on a single example
         """
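Removing the runtime import breaks the circular dependency at the cost of the annotation. A common alternative, sketched here only as an option rather than what the PR chose, keeps the hint visible to type checkers via a TYPE_CHECKING guard:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by static type checkers, so no import cycle at runtime.
        from judgeval.data import Example

    class CustomScorer:
        def score_example(self, example: "Example", *args, **kwargs) -> float:
            raise NotImplementedError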
4 changes: 2 additions & 2 deletions judgeval/scorers/prompt_scorer.py
@@ -68,7 +68,7 @@ def score_example(
         """
         Synchronous method for scoring an example using the prompt criteria.
         """
-        with scorer_progress_meter(self, _show_indicator=_show_indicator):
+        with scorer_progress_meter(self, display_meter=_show_indicator):
             if self.async_mode:
                 loop = get_or_create_event_loop()
                 loop.run_until_complete(
@@ -217,7 +217,7 @@ def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict):
         # create formatting string for schema enforcement
         # schema is a map between key and type of the value
         for key, key_type in schema.items():
-            SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type}), '
+            SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type.__name__}), '
         SCHEMA_ENFORCEMENT_PROMPT = SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}"  # remove trailing comma and space
         judge_prompt[0]["content"] += SCHEMA_ENFORCEMENT_PROMPT
         return judge_prompt
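The key_type.__name__ fix matters because interpolating a type object directly renders its repr rather than its name. A standalone illustration:

    schema = {"score": float, "reason": str}

    before = ", ".join(f'"{k}": <{k}> ({t})' for k, t in schema.items())
    after = ", ".join(f'"{k}": <{k}> ({t.__name__})' for k, t in schema.items())

    print(before)  # "score": <score> (<class 'float'>), "reason": <reason> (<class 'str'>)
    print(after)   # "score": <score> (float), "reason": <reason> (str)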
13 changes: 10 additions & 3 deletions judgeval/scorers/score.py
@@ -273,8 +273,15 @@ async def a_execute_scoring(
     semaphore = asyncio.Semaphore(max_concurrent)
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
-        async with semaphore:
-            return await func(*args, **kwargs)
+        try:
+            async with semaphore:
+                return await func(*args, **kwargs)
+        except Exception as e:
+            error(f"Error executing function: {e}")
+            if kwargs.get('ignore_errors', False):
+                # Return None when ignoring errors
+                return None
+            raise
 
     if verbose_mode is not None:
         for scorer in scorers:
@@ -406,7 +413,7 @@ async def a_eval_examples_helper(
     # the results and update the process example with the scorer data
     for scorer in scorers:
         # At this point, the scorer has been executed and already contains data.
-        if scorer.skipped:
+        if getattr(scorer, 'skipped', False):
             continue
 
         scorer_data = create_scorer_data(scorer)  # Fetch scorer data from completed scorer evaluation
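The reworked wrapper logs each failure and, when ignore_errors is set, returns None instead of letting one failed coroutine take down the whole asyncio.gather batch; the getattr change below it likewise tolerates scorers that never set a skipped attribute. A self-contained sketch of the wrapper's behavior (the flaky/ok helpers are illustrative, not part of the PR):

    import asyncio

    async def flaky(**kwargs):
        raise RuntimeError("boom")

    async def ok(**kwargs):
        return 42

    async def main():
        semaphore = asyncio.Semaphore(2)

        async def execute_with_semaphore(func, *args, **kwargs):
            try:
                async with semaphore:
                    return await func(*args, **kwargs)
            except Exception as e:
                print(f"Error executing function: {e}")
                if kwargs.get("ignore_errors", False):
                    return None  # swallow the error so gather() keeps going
                raise

        # Note: ignore_errors rides along in **kwargs into func itself, so the
        # wrapped function must tolerate that keyword (as flaky/ok do here).
        results = await asyncio.gather(
            execute_with_semaphore(flaky, ignore_errors=True),
            execute_with_semaphore(ok, ignore_errors=True),
        )
        print(results)  # [None, 42]

    asyncio.run(main())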