diff --git a/Pipfile b/Pipfile index a0a69601..ba30d8f0 100644 --- a/Pipfile +++ b/Pipfile @@ -14,6 +14,8 @@ uvicorn = "*" deepeval = "*" supabase = "*" requests = "*" +pandas = "*" +anthropic = "*" [dev-packages] pytest = "*" diff --git a/judgeval/constants.py b/judgeval/constants.py index 1fc1c082..788e8347 100644 --- a/judgeval/constants.py +++ b/judgeval/constants.py @@ -5,7 +5,7 @@ from enum import Enum import litellm -class APIScorer(Enum): +class APIScorer(str, Enum): """ Collection of proprietary scorers implemented by Judgment. @@ -20,7 +20,15 @@ class APIScorer(Enum): CONTEXTUAL_RELEVANCY = "contextual_relevancy" CONTEXTUAL_PRECISION = "contextual_precision" TOOL_CORRECTNESS = "tool_correctness" - + + @classmethod + def _missing_(cls, value): + # Handle case-insensitive lookup for string values; anything else falls through + if isinstance(value, str): + for member in cls: + if member.value == value.lower(): + return member + return None ROOT_API = "http://127.0.0.1:8000" # ROOT_API = "https://api.judgmentlabs.ai" # TODO replace this with the actual API root diff --git a/judgeval/data/api_example.py b/judgeval/data/api_example.py index 6a03cce5..48ae1170 100644 --- a/judgeval/data/api_example.py +++ b/judgeval/data/api_example.py @@ -13,28 +13,24 @@ class ProcessExample(BaseModel): """ name: str input: Optional[str] = None - actual_output: Optional[str] = Field(None, alias="actualOutput") - expected_output: Optional[str] = Field(None, alias="expectedOutput") - context: Optional[list] = Field(None) - retrieval_context: Optional[list] = Field(None, alias="retrievalContext") - tools_called: Optional[list] = Field(None, alias="toolsCalled") - expected_tools: Optional[list] = Field(None, alias="expectedTools") + actual_output: Optional[str] = None + expected_output: Optional[str] = None + context: Optional[list] = None + retrieval_context: Optional[list] = None + tools_called: Optional[list] = None + expected_tools: Optional[list] = None # make these optional, not all test cases in a conversation will be evaluated - success: Union[bool, None] = Field(None) - scorers_data: Union[List[ScorerData], None] = Field( - None, alias="scorersData" - ) - run_duration: Union[float, None] = Field(None, alias="runDuration") - evaluation_cost: Union[float, None] = Field(None, alias="evaluationCost") + success: Optional[bool] = None + scorers_data: Optional[List[ScorerData]] = None + run_duration: Optional[float] = None + evaluation_cost: Optional[float] = None - order: Union[int, None] = Field(None) + order: Optional[int] = None # These should map 1 to 1 from golden - additional_metadata: Optional[Dict] = Field( - None, alias="additionalMetadata" - ) - comments: Optional[str] = Field(None) - trace_id: Optional[str] = Field(None) + additional_metadata: Optional[Dict] = None + comments: Optional[str] = None + trace_id: Optional[str] = None model_config = ConfigDict(arbitrary_types_allowed=True) def update_scorer_data(self, scorer_data: ScorerData): @@ -65,12 +61,12 @@ def update_run_duration(self, run_duration: float): @model_validator(mode="before") def check_input(cls, values: Dict[str, Any]): input = values.get("input") - actual_output = values.get("actualOutput") + actual_output = values.get("actual_output") if (input is None or actual_output is None): - error(f"Validation error: Required fields missing. input={input}, actualOutput={actual_output}") + error(f"Validation error: Required fields missing. input={input}, actual_output={actual_output}") raise ValueError( - "'input' and 'actualOutput' must be provided." + "'input' and 'actual_output' must be provided." 
) return values @@ -97,18 +93,18 @@ def create_process_example( process_ex = ProcessExample( name=name, input=example.input, - actualOutput=example.actual_output, - expectedOutput=example.expected_output, + actual_output=example.actual_output, + expected_output=example.expected_output, context=example.context, - retrievalContext=example.retrieval_context, - toolsCalled=example.tools_called, - expectedTools=example.expected_tools, + retrieval_context=example.retrieval_context, + tools_called=example.tools_called, + expected_tools=example.expected_tools, success=success, - scorersData=scorers_data, - runDuration=None, - evaluationCost=None, + scorers_data=scorers_data, + run_duration=None, + evaluation_cost=None, order=order, - additionalMetadata=example.additional_metadata, + additional_metadata=example.additional_metadata, trace_id=example.trace_id ) return process_ex diff --git a/judgeval/data/datasets/utils.py b/judgeval/data/datasets/utils.py index ca844a78..b1558f17 100644 --- a/judgeval/data/datasets/utils.py +++ b/judgeval/data/datasets/utils.py @@ -14,6 +14,11 @@ def examples_to_ground_truths(examples: List[Example]) -> List[GroundTruthExampl Returns: List[GroundTruthExample]: A list of `GroundTruthExample` objects. """ + + if not isinstance(examples, list): + raise TypeError("Input should be a list of `Example` objects") + + ground_truths = [] ground_truths = [] for e in examples: g_truth = { @@ -45,6 +50,10 @@ def ground_truths_to_examples( Returns: List[Example]: A list of `Example` objects. """ + + if not isinstance(ground_truths, list): + raise TypeError("Input should be a list of `GroundTruthExample` objects") + examples = [] for index, ground_truth in enumerate(ground_truths): e = Example( diff --git a/judgeval/data/example.py b/judgeval/data/example.py index 3df12064..38238f7a 100644 --- a/judgeval/data/example.py +++ b/judgeval/data/example.py @@ -37,41 +37,6 @@ class Example(BaseModel): timestamp: Optional[str] = None trace_id: Optional[str] = None - def __post_init__(self): - # Ensure `context` is None or a list of strings - if self.context is not None: - if not isinstance(self.context, list) or not all( - isinstance(item, str) for item in self.context - ): - raise TypeError("'context' must be None or a list of strings") - - # Ensure `retrieval_context` is None or a list of strings - if self.retrieval_context is not None: - if not isinstance(self.retrieval_context, list) or not all( - isinstance(item, str) for item in self.retrieval_context - ): - raise TypeError( - "'retrieval_context' must be None or a list of strings" - ) - - # Ensure `tools_called` is None or a list of strings - if self.tools_called is not None: - if not isinstance(self.tools_called, list) or not all( - isinstance(item, str) for item in self.tools_called - ): - raise TypeError( - "'tools_called' must be None or a list of strings" - ) - - # Ensure `expected_tools` is None or a list of strings - if self.expected_tools is not None: - if not isinstance(self.expected_tools, list) or not all( - isinstance(item, str) for item in self.expected_tools - ): - raise TypeError( - "'expected_tools' must be None or a list of strings" - ) - def __init__(self, **data): super().__init__(**data) # Set timestamp if not provided diff --git a/judgeval/scorers/custom_scorer.py b/judgeval/scorers/custom_scorer.py index 9c9a9944..75816e7d 100644 --- a/judgeval/scorers/custom_scorer.py +++ b/judgeval/scorers/custom_scorer.py @@ -9,7 +9,6 @@ from abc import abstractmethod from judgeval.common.logger import debug, info, warning, 
error -from judgeval.data import Example from judgeval.judges import judgevalJudge from judgeval.judges.utils import create_judge @@ -84,7 +83,7 @@ def _add_model(self, model: Optional[Union[str, List[str], judgevalJudge]] = Non self.evaluation_model = self.model.get_model_name() @abstractmethod - def score_example(self, example: Example, *args, **kwargs) -> float: + def score_example(self, example, *args, **kwargs) -> float: """ Measures the score on a single example """ @@ -93,7 +92,7 @@ def score_example(self, example: Example, *args, **kwargs) -> float: raise NotImplementedError("You must implement the `score` method in your custom scorer") @abstractmethod - async def a_score_example(self, example: Example, *args, **kwargs) -> float: + async def a_score_example(self, example, *args, **kwargs) -> float: """ Asynchronously measures the score on a single example """ diff --git a/judgeval/scorers/prompt_scorer.py b/judgeval/scorers/prompt_scorer.py index a8d259a7..6ba77172 100644 --- a/judgeval/scorers/prompt_scorer.py +++ b/judgeval/scorers/prompt_scorer.py @@ -68,7 +68,7 @@ def score_example( """ Synchronous method for scoring an example using the prompt criteria. """ - with scorer_progress_meter(self, _show_indicator=_show_indicator): + with scorer_progress_meter(self, display_meter=_show_indicator): if self.async_mode: loop = get_or_create_event_loop() loop.run_until_complete( @@ -217,7 +217,7 @@ def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict): # create formatting string for schema enforcement # schema is a map between key and type of the value for key, key_type in schema.items(): - SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type}), ' + SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type.__name__}), ' SCHEMA_ENFORCEMENT_PROMPT = SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}" # remove trailing comma and space judge_prompt[0]["content"] += SCHEMA_ENFORCEMENT_PROMPT return judge_prompt diff --git a/judgeval/scorers/score.py b/judgeval/scorers/score.py index 84af194f..b16352e8 100644 --- a/judgeval/scorers/score.py +++ b/judgeval/scorers/score.py @@ -273,8 +273,15 @@ async def a_execute_scoring( semaphore = asyncio.Semaphore(max_concurrent) async def execute_with_semaphore(func: Callable, *args, **kwargs): - async with semaphore: - return await func(*args, **kwargs) + try: + async with semaphore: + return await func(*args, **kwargs) + except Exception as e: + error(f"Error executing function: {e}") + if kwargs.get('ignore_errors', False): + # Return None when ignoring errors + return None + raise if verbose_mode is not None: for scorer in scorers: @@ -406,7 +413,7 @@ async def a_eval_examples_helper( # the results and update the process example with the scorer data for scorer in scorers: # At this point, the scorer has been executed and already contains data. 
- if scorer.skipped: + if getattr(scorer, 'skipped', False): continue scorer_data = create_scorer_data(scorer) # Fetch scorer data from completed scorer evaluation diff --git a/tests/data/datasets/test_dataset.py b/tests/data/datasets/test_dataset.py new file mode 100644 index 00000000..32a307fa --- /dev/null +++ b/tests/data/datasets/test_dataset.py @@ -0,0 +1,177 @@ +import pytest +import json +import pandas as pd +from unittest.mock import Mock, patch, mock_open +from judgeval.data.datasets.dataset import EvalDataset +from judgeval.data import Example +from judgeval.data.datasets.ground_truth import GroundTruthExample + +@pytest.fixture +def sample_example(): + return Example( + input="test input", + actual_output="test output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1", "tool2"], + name="test example" + ) + +@pytest.fixture +def sample_ground_truth(): + return GroundTruthExample( + input="test input", + expected_output="expected output", + context=["context1"], + retrieval_context=["retrieval1"], + additional_metadata={"key": "value"}, + tools_called=["tool1"], + expected_tools=["tool1"], + comments="test comment", + source_file="test.py" + ) + +@pytest.fixture +def dataset(): + return EvalDataset(judgment_api_key="test_key") + +def test_init(): + dataset = EvalDataset(judgment_api_key="test_key") + assert dataset.judgment_api_key == "test_key" + assert dataset.ground_truths == [] + assert dataset.examples == [] + assert dataset._alias is None + assert dataset._id is None + +def test_add_example(dataset, sample_example): + dataset.add_example(sample_example) + assert len(dataset.examples) == 1 + assert dataset.examples[0] == sample_example + +def test_add_ground_truth(dataset, sample_ground_truth): + dataset.add_ground_truth(sample_ground_truth) + assert len(dataset.ground_truths) == 1 + assert dataset.ground_truths[0] == sample_ground_truth + +@patch('requests.post') +def test_push_success(mock_post, dataset, sample_example): + # Setup mock response + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"_alias": "test_alias", "_id": "test_id"} + mock_post.return_value = mock_response + + # Add example and push + dataset.add_example(sample_example) + result = dataset.push("test_alias") + + assert result is True + assert dataset._alias == "test_alias" + assert dataset._id == "test_id" + mock_post.assert_called_once() + +@patch('requests.post') +def test_push_server_error(mock_post, dataset): + mock_response = Mock() + mock_response.status_code = 500 + mock_post.return_value = mock_response + + result = dataset.push("test_alias") + assert result is False + + mock_post.assert_called_once() + +@patch('requests.post') +def test_pull_success(mock_post, dataset): + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "ground_truths": [{"input": "test", "expected_output": "test"}], + "examples": [{"input": "test", "actual_output": "test"}], + "_alias": "test_alias", + "_id": "test_id" + } + mock_post.return_value = mock_response + + dataset.pull("test_alias") + assert len(dataset.ground_truths) == 1 + assert len(dataset.examples) == 1 + assert dataset._alias == "test_alias" + assert dataset._id == "test_id" + +@patch('builtins.open', new_callable=mock_open) +def test_add_from_json(mock_file, dataset): + json_data = { + "examples": [{"input": "test", 
"actual_output": "test"}], + "ground_truths": [{"input": "test", "expected_output": "test"}] + } + mock_file.return_value.__enter__.return_value.read.return_value = json.dumps(json_data) + + dataset.add_from_json("test.json") + assert len(dataset.examples) == 1 + assert len(dataset.ground_truths) == 1 + +@patch('pandas.read_csv') +def test_add_from_csv(mock_read_csv, dataset): + mock_df = pd.DataFrame({ + 'input': ['test1', 'test2'], + 'actual_output': ['output1', 'output2'], + 'expected_output': ['expected1', 'expected2'], + 'context': ['ctx1', 'ctx2'], + 'retrieval_context': ['ret1', 'ret2'], + 'additional_metadata': ['{}', '{}'], + 'tools_called': ['tool1', 'tool2'], + 'expected_tools': ['tool1', 'tool2'], + 'name': ['name1', None], + 'comments': [None, 'comment2'], + 'source_file': [None, 'file2'], + 'example': [True, False] + }) + mock_read_csv.return_value = mock_df + + dataset.add_from_csv("test.csv") + assert len(dataset.examples) == 1 + assert len(dataset.ground_truths) == 1 + +def test_save_as_json(dataset, sample_example, tmp_path): + dataset.add_example(sample_example) + save_path = tmp_path / "test_dir" + dataset.save_as("json", str(save_path), "test_save") + + assert (save_path / "test_save.json").exists() + with open(save_path / "test_save.json") as f: + saved_data = json.load(f) + assert "examples" in saved_data + assert "ground_truths" in saved_data + +def test_save_as_csv(dataset, sample_example, tmp_path): + dataset.add_example(sample_example) + save_path = tmp_path / "test_dir" + dataset.save_as("csv", str(save_path), "test_save") + + assert (save_path / "test_save.csv").exists() + df = pd.read_csv(save_path / "test_save.csv") + assert len(df) == 1 + assert "input" in df.columns + +def test_save_as_invalid_type(dataset): + with pytest.raises(TypeError): + dataset.save_as("invalid", "test_dir") + +def test_iter_and_len(dataset, sample_example): + dataset.add_example(sample_example) + assert len(dataset) == 1 + examples = list(dataset) + assert len(examples) == 1 + assert examples[0] == sample_example + +def test_str_representation(dataset, sample_example, sample_ground_truth): + dataset.add_example(sample_example) + dataset.add_ground_truth(sample_ground_truth) + str_rep = str(dataset) + assert "EvalDataset" in str_rep + assert "ground_truths" in str_rep + assert "examples" in str_rep diff --git a/tests/data/datasets/test_dataset_utils.py b/tests/data/datasets/test_dataset_utils.py new file mode 100644 index 00000000..736c15a4 --- /dev/null +++ b/tests/data/datasets/test_dataset_utils.py @@ -0,0 +1,110 @@ +import pytest +from typing import List + +from judgeval.data import Example +from judgeval.data.datasets.ground_truth import GroundTruthExample +from judgeval.data.datasets.utils import examples_to_ground_truths, ground_truths_to_examples + + +@pytest.fixture +def sample_example() -> Example: + return Example( + input="test input", + actual_output="actual result", + expected_output="expected result", + context=["some context"], + retrieval_context=["retrieval info"], + tools_called=["tool1", "tool2"], + expected_tools=["tool1"], + additional_metadata={"key": "value"}, + ) + +@pytest.fixture +def sample_ground_truth() -> GroundTruthExample: + return GroundTruthExample( + input="test input", + actual_output="actual result", + expected_output="expected result", + context=["some context"], + retrieval_context=["retrieval info"], + tools_called=["tool1", "tool2"], + expected_tools=["tool1"], + additional_metadata={"key": "value"}, + comments="test comment" + ) + + +class 
TestExamplesToGroundTruths: + def test_empty_list(self): + """Test conversion of empty list.""" + result = examples_to_ground_truths([]) + assert isinstance(result, list) + assert len(result) == 0 + + def test_single_example(self, sample_example): + """Test conversion of a single example.""" + result = examples_to_ground_truths([sample_example]) + assert len(result) == 1 + assert isinstance(result[0], GroundTruthExample) + assert result[0].input == sample_example.input + assert result[0].actual_output == sample_example.actual_output + assert result[0].expected_output == sample_example.expected_output + + def test_multiple_examples(self, sample_example): + """Test conversion of multiple examples.""" + examples = [sample_example, sample_example] + result = examples_to_ground_truths(examples) + assert len(result) == 2 + assert all(isinstance(gt, GroundTruthExample) for gt in result) + + def test_none_input(self): + """Test handling of None input.""" + with pytest.raises(TypeError): + examples_to_ground_truths(None) + + def test_invalid_input_type(self): + """Test handling of invalid input type.""" + with pytest.raises(TypeError): + examples_to_ground_truths("not a list") + + +class TestGroundTruthsToExamples: + def test_empty_list(self): + """Test conversion of empty list.""" + result = ground_truths_to_examples([]) + assert isinstance(result, list) + assert len(result) == 0 + + def test_single_ground_truth(self, sample_ground_truth): + """Test conversion of a single ground truth.""" + result = ground_truths_to_examples([sample_ground_truth]) + assert len(result) == 1 + assert isinstance(result[0], Example) + assert result[0].input == sample_ground_truth.input + assert result[0].actual_output == sample_ground_truth.actual_output + assert result[0].expected_output == sample_ground_truth.expected_output + + def test_multiple_ground_truths(self, sample_ground_truth): + """Test conversion of multiple ground truths.""" + ground_truths = [sample_ground_truth, sample_ground_truth] + result = ground_truths_to_examples(ground_truths) + assert len(result) == 2 + assert all(isinstance(ex, Example) for ex in result) + + def test_none_input(self): + """Test handling of None input.""" + with pytest.raises(TypeError): + ground_truths_to_examples(None) + + def test_invalid_input_type(self): + """Test handling of invalid input type.""" + with pytest.raises(TypeError): + ground_truths_to_examples("not a list") + + def test_preserves_metadata(self, sample_ground_truth): + """Test that all metadata is preserved during conversion.""" + result = ground_truths_to_examples([sample_ground_truth])[0] + assert result.additional_metadata == sample_ground_truth.additional_metadata + assert result.tools_called == sample_ground_truth.tools_called + assert result.expected_tools == sample_ground_truth.expected_tools + \ No newline at end of file diff --git a/tests/data/datasets/test_ground_truth.py b/tests/data/datasets/test_ground_truth.py new file mode 100644 index 00000000..58f58c52 --- /dev/null +++ b/tests/data/datasets/test_ground_truth.py @@ -0,0 +1,128 @@ +import pytest +from judgeval.data.datasets.ground_truth import GroundTruthExample + + +def test_ground_truth_example_minimal(): + """Test creation with only required field (input)""" + example = GroundTruthExample(input="test input") + assert example.input == "test input" + assert example.actual_output is None + assert example.expected_output is None + + +def test_ground_truth_example_full(): + """Test creation with all fields populated""" + example = 
GroundTruthExample( + input="test input", + actual_output="actual result", + expected_output="expected result", + context=["context1", "context2"], + retrieval_context=["retrieved1", "retrieved2"], + additional_metadata={"key": "value"}, + comments="test comment", + tools_called=["tool1", "tool2"], + expected_tools=["expected_tool1"], + source_file="test.txt" + ) + + assert example.input == "test input" + assert example.actual_output == "actual result" + assert example.expected_output == "expected result" + assert example.context == ["context1", "context2"] + assert example.retrieval_context == ["retrieved1", "retrieved2"] + assert example.additional_metadata == {"key": "value"} + assert example.comments == "test comment" + assert example.tools_called == ["tool1", "tool2"] + assert example.expected_tools == ["expected_tool1"] + assert example.source_file == "test.txt" + + +def test_ground_truth_example_to_dict(): + """Test the to_dict method returns correct dictionary""" + example = GroundTruthExample( + input="test input", + actual_output="actual result", + comments="test comment" + ) + + expected_dict = { + "input": "test input", + "actual_output": "actual result", + "expected_output": None, + "context": None, + "retrieval_context": None, + "additional_metadata": None, + "comments": "test comment", + "tools_called": None, + "expected_tools": None, + "source_file": None, + } + + assert example.to_dict() == expected_dict + + +def test_ground_truth_example_str_representation(): + """Test the string representation of the class""" + example = GroundTruthExample( + input="test input", + actual_output="actual result" + ) + + expected_str = ( + "GroundTruthExample(" + "input=test input, " + "actual_output=actual result, " + "expected_output=None, " + "context=None, " + "retrieval_context=None, " + "additional_metadata=None, " + "comments=None, " + "tools_called=None, " + "expected_tools=None, " + "source_file=None)" + ) + + assert str(example) == expected_str + + +def test_ground_truth_example_missing_input(): + """Test that creating instance without required 'input' field raises error""" + with pytest.raises(ValueError): + GroundTruthExample() + + +def test_ground_truth_example_invalid_types(): + """Test that invalid types raise validation errors""" + with pytest.raises(ValueError): + GroundTruthExample(input="test", context="not a list") + + with pytest.raises(ValueError): + GroundTruthExample(input="test", tools_called="not a list") + + with pytest.raises(ValueError): + GroundTruthExample(input="test", additional_metadata="not a dict") + + +def test_ground_truth_example_empty_lists(): + """Test that empty lists are valid for list fields""" + example = GroundTruthExample( + input="test", + context=[], + retrieval_context=[], + tools_called=[], + expected_tools=[] + ) + assert example.context == [] + assert example.retrieval_context == [] + assert example.tools_called == [] + assert example.expected_tools == [] + + +def test_ground_truth_example_empty_dict(): + """Test that empty dict is valid for additional_metadata""" + example = GroundTruthExample( + input="test", + additional_metadata={} + ) + assert example.additional_metadata == {} + \ No newline at end of file diff --git a/tests/data/test_api_example.py b/tests/data/test_api_example.py new file mode 100644 index 00000000..3e992497 --- /dev/null +++ b/tests/data/test_api_example.py @@ -0,0 +1,153 @@ +import pytest +from judgeval.data.api_example import ProcessExample, create_process_example +from judgeval.data.example import Example +from 
judgeval.data.scorer_data import ScorerData + +# Test data fixtures +@pytest.fixture +def basic_example(): + return Example( + name="test_case", + input="test input", + actual_output="actual output", + expected_output="expected output" + ) + +@pytest.fixture +def basic_scorer_data(): + return ScorerData( + name="test_scorer", + threshold=1.0, + success=True, + score=1.0, + metadata={"key": "value"} + ) + +class TestProcessExample: + def test_create_basic_process_example(self): + """Test creating a basic ProcessExample with required fields""" + process_ex = ProcessExample( + name="test", + input="test input", + actual_output="test output" + ) + assert process_ex.name == "test" + assert process_ex.input == "test input" + assert process_ex.actual_output == "test output" + + def test_validation_error_missing_input(self): + """Test validation error when input is missing""" + with pytest.raises(ValueError) as exc_info: + ProcessExample( + name="test", + actual_output="test output" + ) + assert "'input' and 'actual_output' must be provided" in str(exc_info.value) + + def test_validation_error_missing_actual_output(self): + """Test validation error when actual_output is missing""" + with pytest.raises(ValueError) as exc_info: + ProcessExample( + name="test", + input="test input" + ) + assert "'input' and 'actual_output' must be provided" in str(exc_info.value) + + def test_update_scorer_data_initial(self, basic_scorer_data): + """Test updating scorer data for the first time""" + process_ex = ProcessExample( + name="test", + input="test input", + actual_output="test output" + ) + process_ex.update_scorer_data(basic_scorer_data) + + assert process_ex.success == True + assert len(process_ex.scorers_data) == 1 + assert process_ex.scorers_data[0] == basic_scorer_data + + def test_update_scorer_data_multiple(self, basic_scorer_data): + """Test updating scorer data multiple times""" + process_ex = ProcessExample( + name="test", + input="test input", + actual_output="test output" + ) + + # Add first scorer + process_ex.update_scorer_data(basic_scorer_data) + + # Add second scorer with failure + failed_scorer = ScorerData( + name="failed_scorer", + threshold=1.0, + success=False, + score=0.0, + metadata={} + ) + process_ex.update_scorer_data(failed_scorer) + + assert process_ex.success == False + assert len(process_ex.scorers_data) == 2 + assert process_ex.scorers_data[1] == failed_scorer + + def test_update_run_duration(self): + """Test updating run duration""" + process_ex = ProcessExample( + name="test", + input="test input", + actual_output="test output" + ) + process_ex.update_run_duration(1.5) + assert process_ex.run_duration == 1.5 + +class TestCreateProcessExample: + def test_create_process_example_basic(self, basic_example): + """Test creating ProcessExample from basic Example""" + process_ex = create_process_example(basic_example) + + assert process_ex.name == "test_case" + assert process_ex.input == "test input" + assert process_ex.actual_output == "actual output" + assert process_ex.expected_output == "expected output" + assert process_ex.success == True + assert process_ex.scorers_data == [] + assert process_ex.run_duration is None + assert process_ex.evaluation_cost is None + + def test_create_process_example_no_name(self): + """Test creating ProcessExample from Example without name""" + example = Example( + input="test input", + actual_output="actual output" + ) + process_ex = create_process_example(example) + + assert process_ex.name == "Test Case Placeholder" + assert process_ex.input == 
"test input" + assert process_ex.actual_output == "actual output" + + def test_create_process_example_with_all_fields(self): + """Test creating ProcessExample with all possible fields""" + example = Example( + name="full_test", + input="test input", + actual_output="actual output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1", "retrieval2"], + tools_called=["tool1", "tool2"], + expected_tools=["expected_tool1"], + additional_metadata={"key": "value"}, + trace_id="trace123" + ) + + process_ex = create_process_example(example) + + assert process_ex.name == "full_test" + assert process_ex.context == ["context1", "context2"] + assert process_ex.retrieval_context == ["retrieval1", "retrieval2"] + assert process_ex.tools_called == ["tool1", "tool2"] + assert process_ex.expected_tools == ["expected_tool1"] + assert process_ex.additional_metadata == {"key": "value"} + assert process_ex.trace_id == "trace123" diff --git a/tests/data/test_example.py b/tests/data/test_example.py new file mode 100644 index 00000000..6a31b80e --- /dev/null +++ b/tests/data/test_example.py @@ -0,0 +1,133 @@ +""" +Unit tests for the Example class +""" + +import pytest +from datetime import datetime +from pydantic import ValidationError +from judgeval.data.example import Example + + +def test_basic_example_creation(): + example = Example( + input="test input", + actual_output="test output" + ) + assert example.input == "test input" + assert example.actual_output == "test output" + assert example.expected_output is None + assert example.timestamp is not None + # Verify timestamp format + datetime.strptime(example.timestamp, "%Y%m%d_%H%M%S") + + +def test_full_example_creation(): + example = Example( + input="test input", + actual_output="test output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1", "retrieval2"], + additional_metadata={"key": "value"}, + tools_called=["tool1", "tool2"], + expected_tools=["expected_tool1"], + name="test example", + example_id="123", + timestamp="20240101_120000", + trace_id="trace123" + ) + + assert example.input == "test input" + assert example.actual_output == "test output" + assert example.expected_output == "expected output" + assert example.context == ["context1", "context2"] + assert example.retrieval_context == ["retrieval1", "retrieval2"] + assert example.additional_metadata == {"key": "value"} + assert example.tools_called == ["tool1", "tool2"] + assert example.expected_tools == ["expected_tool1"] + assert example.name == "test example" + assert example.example_id == "123" + assert example.timestamp == "20240101_120000" + assert example.trace_id == "trace123" + + +def test_to_dict(): + example = Example( + input="test input", + actual_output="test output", + name="test example" + ) + + example_dict = example.to_dict() + assert example_dict["input"] == "test input" + assert example_dict["actual_output"] == "test output" + assert example_dict["name"] == "test example" + assert "timestamp" in example_dict + + +def test_string_representation(): + example = Example( + input="test input", + actual_output="test output" + ) + + str_repr = str(example) + assert "input=test input" in str_repr + assert "actual_output=test output" in str_repr + + +# Error cases + +def test_missing_input(): + with pytest.raises(ValidationError): + Example(actual_output="test output") + + +def test_missing_actual_output(): + with pytest.raises(ValidationError): + Example(input="test input") + 
+ +def test_invalid_context_type(): + with pytest.raises(ValidationError): + Example( + input="test", + actual_output="test", + context="invalid context type" # Should be list of strings + ) + + +def test_invalid_context_content(): + with pytest.raises(ValidationError): + Example( + input="test", + actual_output="test", + context=["valid", 123] # Should be all strings + ) + + +def test_invalid_retrieval_context(): + with pytest.raises(ValidationError): + Example( + input="test", + actual_output="test", + retrieval_context=[1, 2, 3] # Should be list of strings + ) + + +def test_invalid_tools_called(): + with pytest.raises(ValidationError): + Example( + input="test", + actual_output="test", + tools_called={"tool1": "value"} # Should be list of strings + ) + + +def test_invalid_expected_tools(): + with pytest.raises(ValidationError): + Example( + input="test", + actual_output="test", + expected_tools=[1, "tool2"] # Should be list of strings + ) diff --git a/tests/data/test_result.py b/tests/data/test_result.py new file mode 100644 index 00000000..afe60440 --- /dev/null +++ b/tests/data/test_result.py @@ -0,0 +1,121 @@ +import pytest +from judgeval.data.result import ScoringResult, generate_scoring_result +from judgeval.data.api_example import ProcessExample +from judgeval.data.scorer_data import ScorerData + +@pytest.fixture +def sample_scorer_data(): + return ScorerData( + name="test_scorer", + threshold=1.0, + success=True, + score=0.8, + metadata={"key": "value"} + ) + +@pytest.fixture +def sample_process_example(sample_scorer_data): + return ProcessExample( + name="test_example", + input="test input", + actual_output="actual output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1"], + success=True, + scorers_data=[sample_scorer_data] + ) + +class TestScoringResult: + def test_basic_initialization(self): + """Test basic initialization with minimal required fields""" + result = ScoringResult(success=True, scorers_data=[]) + assert result.success is True + assert result.scorers_data == [] + assert result.input is None + assert result.actual_output is None + + def test_full_initialization(self, sample_scorer_data): + """Test initialization with all fields""" + result = ScoringResult( + success=True, + scorers_data=[sample_scorer_data], + input="test input", + actual_output="actual output", + expected_output="expected output", + context=["context"], + retrieval_context=["retrieval"], + trace_id="trace123" + ) + + assert result.success is True + assert len(result.scorers_data) == 1 + assert result.input == "test input" + assert result.actual_output == "actual output" + assert result.expected_output == "expected output" + assert result.context == ["context"] + assert result.retrieval_context == ["retrieval"] + assert result.trace_id == "trace123" + + def test_to_dict_conversion(self, sample_scorer_data): + """Test conversion to dictionary""" + result = ScoringResult( + success=True, + scorers_data=[sample_scorer_data], + input="test" + ) + + dict_result = result.to_dict() + assert isinstance(dict_result, dict) + assert dict_result["success"] is True + assert len(dict_result["scorers_data"]) == 1 + assert dict_result["input"] == "test" + assert dict_result["actual_output"] is None + + def test_to_dict_with_none_scorers(self): + """Test conversion to dictionary when scorers_data is None""" + result = ScoringResult(success=False, scorers_data=None) + dict_result = result.to_dict() + assert dict_result["scorers_data"] is None + + def 
test_string_representation(self, sample_scorer_data): + """Test string representation of ScoringResult""" + result = ScoringResult(success=True, scorers_data=[sample_scorer_data]) + str_result = str(result) + assert "ScoringResult" in str_result + assert "success=True" in str_result + +class TestGenerateScoringResult: + def test_generate_from_process_example(self, sample_process_example): + """Test generating ScoringResult from ProcessExample""" + result = generate_scoring_result(sample_process_example) + + assert isinstance(result, ScoringResult) + assert result.success == sample_process_example.success + assert result.input == sample_process_example.input + assert result.actual_output == sample_process_example.actual_output + assert result.expected_output == sample_process_example.expected_output + assert result.context == sample_process_example.context + assert result.retrieval_context == sample_process_example.retrieval_context + assert result.trace_id == sample_process_example.trace_id + + def test_generate_with_minimal_process_example(self): + """Test generating ScoringResult from minimal ProcessExample""" + minimal_example = ProcessExample( + name="minimal", + input="test", + actual_output="output", + success=True, + scorers_data=[] + ) + + result = generate_scoring_result(minimal_example) + assert isinstance(result, ScoringResult) + assert result.success is True + assert result.scorers_data == [] + assert result.input == "test" + assert result.actual_output == "output" + assert result.expected_output is None + assert result.context is None + assert result.retrieval_context is None + assert result.trace_id is None diff --git a/tests/data/test_scorer_data.py b/tests/data/test_scorer_data.py new file mode 100644 index 00000000..1f1e7829 --- /dev/null +++ b/tests/data/test_scorer_data.py @@ -0,0 +1,294 @@ +import pytest +from typing import Dict, Optional + +from judgeval.data.scorer_data import ScorerData, create_scorer_data +from judgeval.scorers.custom_scorer import CustomScorer + + +class MockCustomScorer(CustomScorer): + """Mock implementation of CustomScorer for testing""" + def __init__( + self, + score_type: str = "mock_scorer", + threshold: float = 0.7, + score: Optional[float] = None, + score_breakdown: Optional[Dict] = None, + reason: Optional[str] = None, + success: Optional[bool] = None, + evaluation_model: Optional[str] = "gpt-4", + strict_mode: bool = False, + error: Optional[str] = None, + evaluation_cost: Optional[float] = None, + verbose_logs: Optional[str] = None, + additional_metadata: Optional[Dict] = None + ): + super().__init__( + score_type=score_type, + threshold=threshold, + score=score, + score_breakdown=score_breakdown, + reason=reason, + success=success, + evaluation_model=evaluation_model, + strict_mode=strict_mode, + error=error, + evaluation_cost=evaluation_cost, + verbose_logs=verbose_logs, + additional_metadata=additional_metadata + ) + self.__name__ = score_type + + def score_example(self, example, *args, **kwargs): + pass + + async def a_score_example(self, example, *args, **kwargs): + pass + + def success_check(self) -> bool: + return self.score >= self.threshold if self.score is not None else False + + +@pytest.fixture +def successful_scorer(): + """ + Fixture for a scorer that executes successfully and stores the results of the evaluation + """ + return MockCustomScorer( + score_type="test_scorer", + threshold=0.7, + score=0.8, + reason="Test passed successfully", + evaluation_model="gpt-4", + strict_mode=True, + evaluation_cost=0.1, + 
verbose_logs="Detailed test logs", + additional_metadata={"key": "value"} + ) + + +@pytest.fixture +def failed_scorer(): + """ + Fixture for a scorer that does not pass its threshold expectation + """ + return MockCustomScorer( + score_type="test_scorer", + threshold=0.7, + score=0.6, + reason="Test failed", + evaluation_model="gpt-4", + strict_mode=True, + evaluation_cost=0.1, + verbose_logs="Detailed test logs" + ) + + +@pytest.fixture +def error_scorer(): + """ + Fixture for a scorer that encounters an error during execution + """ + return MockCustomScorer( + score_type="test_scorer", + threshold=0.7, + error="Test execution failed", + evaluation_model="gpt-4", + evaluation_cost=0.1, + verbose_logs="Error logs" + ) + + +def test_scorer_data_successful_case(successful_scorer): + """Test ScorerData creation for a successful evaluation""" + scorer_data = create_scorer_data(successful_scorer) + + assert scorer_data.name == "test_scorer" + assert scorer_data.threshold == 0.7 + assert scorer_data.score == 0.8 + assert scorer_data.success is True + assert scorer_data.reason == "Test passed successfully" + assert scorer_data.strict_mode is True + assert scorer_data.evaluation_model == "gpt-4" + assert scorer_data.error is None + assert scorer_data.evaluation_cost == 0.1 + assert scorer_data.verbose_logs == "Detailed test logs" + assert scorer_data.additional_metadata == {"key": "value"} + + +def test_scorer_data_failed_case(failed_scorer): + """Test ScorerData creation for a failed evaluation""" + scorer_data = create_scorer_data(failed_scorer) + + assert scorer_data.name == "test_scorer" + assert scorer_data.threshold == 0.7 + assert scorer_data.score == 0.6 + assert scorer_data.success is False + assert scorer_data.reason == "Test failed" + assert scorer_data.error is None + + +def test_scorer_data_error_case(error_scorer): + """Test ScorerData creation when an error occurs""" + scorer_data = create_scorer_data(error_scorer) + + assert scorer_data.name == "test_scorer" + assert scorer_data.threshold == 0.7 + assert scorer_data.score is None + assert scorer_data.success is False + assert scorer_data.reason is None + assert scorer_data.error == "Test execution failed" + + +def test_scorer_data_to_dict(successful_scorer): + """Test the to_dict method of ScorerData""" + scorer_data = create_scorer_data(successful_scorer) + data_dict = scorer_data.to_dict() + + assert isinstance(data_dict, dict) + assert data_dict["name"] == "test_scorer" + assert data_dict["threshold"] == 0.7 + assert data_dict["score"] == 0.8 + assert data_dict["success"] is True + assert data_dict["reason"] == "Test passed successfully" + assert data_dict["strict_mode"] is True + assert data_dict["evaluation_model"] == "gpt-4" + assert data_dict["error"] is None + assert data_dict["evaluation_cost"] == 0.1 + assert data_dict["verbose_logs"] == "Detailed test logs" + assert data_dict["additional_metadata"] == {"key": "value"} + + +def test_scorer_data_direct_creation(): + """Test direct creation of ScorerData object""" + scorer_data = ScorerData( + name="direct_test", + threshold=0.5, + success=True, + score=0.75, + reason="Direct creation test", + strict_mode=True, + evaluation_model="gpt-4", + error=None, + evaluation_cost=0.2, + verbose_logs="Test logs", + additional_metadata={"test": "data"} + ) + + assert scorer_data.name == "direct_test" + assert scorer_data.threshold == 0.5 + assert scorer_data.success is True + assert scorer_data.score == 0.75 + + +def test_scorer_data_minimal_creation(): + """Test creation of ScorerData 
with minimal required fields""" + scorer_data = ScorerData( + name="minimal_test", + threshold=0.5, + success=True + ) + + assert scorer_data.name == "minimal_test" + assert scorer_data.threshold == 0.5 + assert scorer_data.success is True + assert scorer_data.score is None + assert scorer_data.reason is None + assert scorer_data.strict_mode is None + assert scorer_data.evaluation_model is None + assert scorer_data.error is None + assert scorer_data.evaluation_cost is None + assert scorer_data.verbose_logs is None + assert scorer_data.additional_metadata is None + + +def test_scorer_data_to_dict_minimal(): + """Test to_dict method with minimal required fields""" + scorer_data = ScorerData( + name="minimal_test", + threshold=0.5, + success=True + ) + data_dict = scorer_data.to_dict() + + assert isinstance(data_dict, dict) + assert data_dict["name"] == "minimal_test" + assert data_dict["threshold"] == 0.5 + assert data_dict["success"] is True + assert data_dict["score"] is None + assert data_dict["reason"] is None + assert data_dict["strict_mode"] is None + assert data_dict["evaluation_model"] is None + assert data_dict["error"] is None + assert data_dict["evaluation_cost"] is None + assert data_dict["verbose_logs"] is None + assert data_dict["additional_metadata"] is None + +def test_scorer_data_to_dict_with_list_model(): + """Test to_dict method when evaluation_model is a list""" + scorer_data = ScorerData( + name="list_model_test", + threshold=0.5, + success=True, + evaluation_model=["gpt-4", "gpt-3.5-turbo"] + ) + data_dict = scorer_data.to_dict() + + assert isinstance(data_dict["evaluation_model"], list) + assert data_dict["evaluation_model"] == ["gpt-4", "gpt-3.5-turbo"] + +def test_scorer_data_to_dict_with_error(): + """Test to_dict method with error information""" + scorer_data = ScorerData( + name="error_test", + threshold=0.5, + success=False, + error="Test error message" + ) + data_dict = scorer_data.to_dict() + + assert data_dict["error"] == "Test error message" + assert data_dict["success"] is False + assert data_dict["score"] is None + + +def test_scorer_data_to_dict_all_parameters(): + """Test to_dict method with all possible parameters set""" + test_metadata = { + "model_tokens": 150, + "completion_tokens": 50, + "custom_field": "custom_value" + } + + scorer_data = ScorerData( + name="full_test", + threshold=0.75, + success=True, + score=0.85, + reason="Comprehensive test case", + strict_mode=True, + evaluation_model=["gpt-4", "gpt-3.5-turbo"], + error=None, + evaluation_cost=0.123, + verbose_logs="Detailed execution logs\nwith multiple lines", + additional_metadata=test_metadata + ) + data_dict = scorer_data.to_dict() + + # Verify all fields are present and have correct values + assert isinstance(data_dict, dict) + assert data_dict["name"] == "full_test" + assert data_dict["threshold"] == 0.75 + assert data_dict["success"] is True + assert data_dict["score"] == 0.85 + assert data_dict["reason"] == "Comprehensive test case" + assert data_dict["strict_mode"] is True + assert data_dict["evaluation_model"] == ["gpt-4", "gpt-3.5-turbo"] + assert data_dict["error"] is None + assert data_dict["evaluation_cost"] == 0.123 + assert data_dict["verbose_logs"] == "Detailed execution logs\nwith multiple lines" + assert data_dict["additional_metadata"] == test_metadata + + # Verify the metadata dictionary contains all expected fields + assert data_dict["additional_metadata"]["model_tokens"] == 150 + assert data_dict["additional_metadata"]["completion_tokens"] == 50 + assert 
data_dict["additional_metadata"]["custom_field"] == "custom_value" diff --git a/tests/scorers/test_base_scorer.py b/tests/scorers/test_base_scorer.py new file mode 100644 index 00000000..d369997d --- /dev/null +++ b/tests/scorers/test_base_scorer.py @@ -0,0 +1,65 @@ +import pytest +from pydantic import ValidationError + +from judgeval.scorers.base_scorer import JudgmentScorer +from judgeval.constants import APIScorer + +@pytest.fixture +def valid_scorer_params(): + return { + "threshold": 0.8, + "score_type": APIScorer.FAITHFULNESS + } + +def test_judgment_scorer_creation_with_enum(): + """Test creating JudgmentScorer with APIScorer enum value""" + scorer = JudgmentScorer(threshold=0.8, score_type=APIScorer.FAITHFULNESS) + assert scorer.threshold == 0.8 + assert scorer.score_type == "faithfulness" + +def test_judgment_scorer_creation_with_string(): + """Test creating JudgmentScorer with string value""" + scorer = JudgmentScorer(threshold=0.8, score_type="faithfulness") + assert scorer.threshold == 0.8 + assert scorer.score_type == "faithfulness" + +def test_judgment_scorer_creation_with_uppercase_string(): + """Test creating JudgmentScorer with uppercase string value""" + scorer = JudgmentScorer(threshold=0.8, score_type="FAITHFULNESS") + assert scorer.threshold == 0.8 + assert scorer.score_type == "faithfulness" + +def test_judgment_scorer_str_representation(): + """Test the string representation of JudgmentScorer""" + scorer = JudgmentScorer(threshold=0.8, score_type=APIScorer.FAITHFULNESS) + expected_str = "JudgmentScorer(score_type=faithfulness, threshold=0.8)" + assert str(scorer) == expected_str + +@pytest.mark.parametrize("invalid_score_type", [ + 123, # integer + None, # None + True, # boolean + ["faithfulness"], # list + {"type": "faithfulness"}, # dict +]) +def test_judgment_scorer_invalid_score_type(invalid_score_type): + """Test creating JudgmentScorer with invalid score_type values""" + with pytest.raises(ValidationError) as exc_info: + JudgmentScorer(threshold=0.8, score_type=invalid_score_type) + + assert "Input should be" in str(exc_info.value) + +def test_judgment_scorer_invalid_string_value(): + """Test creating JudgmentScorer with invalid string value""" + with pytest.raises(ValidationError): + JudgmentScorer(threshold=0.8, score_type="INVALID_METRIC") + +def test_judgment_scorer_threshold_validation(): + """Test threshold validation""" + # Test float values + scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS) + assert scorer.threshold == 0.5 + + # Test integer values (should be converted to float) + scorer = JudgmentScorer(threshold=1, score_type=APIScorer.FAITHFULNESS) + assert scorer.threshold == 1.0 diff --git a/tests/scorers/test_custom_scorer.py b/tests/scorers/test_custom_scorer.py new file mode 100644 index 00000000..c01b12a9 --- /dev/null +++ b/tests/scorers/test_custom_scorer.py @@ -0,0 +1,152 @@ +import asyncio +import pytest +from unittest.mock import Mock, patch +from typing import Dict, Optional + +from judgeval.scorers.custom_scorer import CustomScorer +from judgeval.judges import judgevalJudge +from judgeval.common.exceptions import InvalidJudgeModelError + +class MockJudge(judgevalJudge): + """Mock implementation of judgevalJudge for testing""" + def load_model(self, *args, **kwargs): + return Mock() + + def generate(self, *args, **kwargs) -> str: + return "mock response" + + async def a_generate(self, *args, **kwargs) -> str: + return "mock async response" + + def get_model_name(self, *args, **kwargs) -> str: + return "mock-model" 
+ +class SampleScorer(CustomScorer): + """Concrete implementation of CustomScorer for testing""" + def score_example(self, example, *args, **kwargs) -> float: + return 0.8 + + async def a_score_example(self, example, *args, **kwargs) -> float: + return 0.9 + + def success_check(self) -> bool: + return self.score >= self.threshold if self.score is not None else False + +@pytest.fixture +def basic_scorer(): + return SampleScorer( + score_type="test_scorer", + threshold=0.7 + ) + +@pytest.fixture +def mock_judge(): + return MockJudge(model_name="mock-model") + +class TestCustomScorer: + def test_initialization(self): + """Test basic initialization with minimal parameters""" + scorer = SampleScorer(score_type="test", threshold=0.5) + assert scorer.score_type == "test" + assert scorer.threshold == 0.5 + assert scorer.score is None + assert scorer.async_mode is True + assert scorer.verbose_mode is True + + def test_initialization_with_all_params(self): + """Test initialization with all optional parameters""" + additional_metadata = {"key": "value"} + scorer = SampleScorer( + score_type="test", + threshold=0.5, + score=0.8, + score_breakdown={"detail": 0.8}, + reason="test reason", + success=True, + evaluation_model="gpt-4", + strict_mode=True, + async_mode=False, + verbose_mode=False, + include_reason=True, + error=None, + evaluation_cost=0.01, + verbose_logs="test logs", + additional_metadata=additional_metadata + ) + + assert scorer.score == 0.8 + assert scorer.score_breakdown == {"detail": 0.8} + assert scorer.reason == "test reason" + assert scorer.success is True + assert scorer.strict_mode is True + assert scorer.async_mode is False + assert scorer.additional_metadata == additional_metadata + + @patch('judgeval.scorers.custom_scorer.create_judge') + def test_add_model_success(self, mock_create_judge, mock_judge, basic_scorer): + """Test successful model addition""" + mock_create_judge.return_value = (mock_judge, True) + + scorer = basic_scorer + scorer._add_model("mock-model") + + assert scorer.evaluation_model == "mock-model" + assert scorer.using_native_model is True + mock_create_judge.assert_called_once_with("mock-model") + + @patch('judgeval.scorers.custom_scorer.create_judge') + def test_add_model_error(self, mock_create_judge, basic_scorer): + """Test model addition with invalid model""" + mock_create_judge.side_effect = InvalidJudgeModelError("Invalid model") + + scorer = basic_scorer + with pytest.raises(InvalidJudgeModelError): + scorer._add_model("invalid-model") + + def test_score_example_implementation(self, basic_scorer): + """Test score_example returns expected value""" + score = basic_scorer.score_example({"test": "example"}) + assert score == 0.8 + + @pytest.mark.asyncio + async def test_a_score_example_implementation(self, basic_scorer): + """Test async score_example returns expected value""" + score = await basic_scorer.a_score_example({"test": "example"}) + assert score == 0.9 + + def test_success_check_implementation(self, basic_scorer): + """Test success_check with various scores""" + # Test with score above threshold + basic_scorer.score = 0.8 + assert basic_scorer.success_check() is True + + # Test with score below threshold + basic_scorer.score = 0.6 + assert basic_scorer.success_check() is False + + # Test with no score + basic_scorer.score = None + assert basic_scorer.success_check() is False + + def test_str_representation(self, basic_scorer): + """Test string representation of scorer""" + str_rep = str(basic_scorer) + assert "CustomScorer" in str_rep + assert 
"test_scorer" in str_rep + assert "0.7" in str_rep # threshold value + + def test_abstract_methods_base_class(self): + """Test that abstract methods raise NotImplementedError when not implemented""" + class IncompleteScorer(CustomScorer): + pass + + scorer = IncompleteScorer(score_type="test", threshold=0.5) + + with pytest.raises(NotImplementedError): + scorer.score_example({}) + + with pytest.raises(NotImplementedError): + asyncio.run(scorer.a_score_example({})) + + with pytest.raises(NotImplementedError): + scorer.success_check() diff --git a/tests/scorers/test_prompt_scorer.py b/tests/scorers/test_prompt_scorer.py new file mode 100644 index 00000000..4bdf3e89 --- /dev/null +++ b/tests/scorers/test_prompt_scorer.py @@ -0,0 +1,156 @@ +import pytest +from unittest.mock import MagicMock, AsyncMock +from typing import List, Dict + +from judgeval.data import Example +from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer + +# Test fixtures +@pytest.fixture +def example(): + return Example( + input="This is a test input", + actual_output="This is a test response", + expected_output="Expected response", + context=["Some context"], + retrieval_context=["Retrieved context"], + tools_called=["tool1", "tool2"], + expected_tools=["tool1"] + ) + +@pytest.fixture +def mock_model(): + model = MagicMock() + model.generate = MagicMock(return_value='{"score": 0.8, "reason": "Test reason"}') + model.a_generate = AsyncMock(return_value='{"score": 0.8, "reason": "Test reason"}') + return model + +# Simple implementation of PromptScorer for testing +class SampleScorer(PromptScorer): + def build_measure_prompt(self, example: Example) -> List[dict]: + return [ + {"role": "system", "content": "Test system prompt"}, + {"role": "user", "content": f"Response: {example.actual_output}"} + ] + + def build_schema(self) -> dict: + return {"score": float, "reason": str} + + def process_response(self, response: dict): + return response["score"], response["reason"] + + def success_check(self, **kwargs) -> bool: + return self.result >= self.threshold + +# Tests for PromptScorer +class TestPromptScorer: + def test_init(self): + scorer = SampleScorer("test_scorer") + assert scorer.name == "test_scorer" + assert scorer.threshold == 0.5 + assert scorer.include_reason is True + assert scorer.async_mode is True + + def test_init_strict_mode(self): + scorer = SampleScorer("test_scorer", strict_mode=True) + assert scorer.threshold == 1 + + def test_enforce_prompt_format(self): + scorer = SampleScorer("test_scorer") + prompt = [{"role": "system", "content": "Base prompt"}] + schema = {"score": float, "reason": str} + + formatted = scorer.enforce_prompt_format(prompt, schema) + assert "JSON format" in formatted[0]["content"] + assert '"score": (float)' in formatted[0]["content"] + assert '"reason": (str)' in formatted[0]["content"] + + def test_enforce_prompt_format_invalid_input(self): + scorer = SampleScorer("test_scorer") + with pytest.raises(TypeError): + scorer.enforce_prompt_format("invalid", {}) + + @pytest.mark.asyncio + async def test_a_score_example(self, example, mock_model): + scorer = SampleScorer("test_scorer") + scorer.model = mock_model + + result = await scorer.a_score_example(example, _show_indicator=False) + assert result == 0.8 + assert scorer.reason == "Test reason" + + def test_score_example_sync(self, example, mock_model): + scorer = SampleScorer("test_scorer", async_mode=False) + scorer.model = mock_model + + result = scorer.score_example(example, _show_indicator=False) + assert result == 
0.8 + assert scorer.reason == "Test reason" + +# Tests for ClassifierScorer +class TestClassifierScorer: + @pytest.fixture + def classifier_conversation(self): + return [ + {"role": "system", "content": "Evaluate if {{actual_output}} is positive"}, + {"role": "user", "content": "Please analyze."} + ] + + @pytest.fixture + def classifier_options(self): + return {"positive": 1.0, "negative": 0.0} + + def test_classifier_init(self, classifier_conversation, classifier_options): + scorer = ClassifierScorer( + "test_classifier", + classifier_conversation, + classifier_options + ) + assert scorer.conversation == classifier_conversation + assert scorer.options == classifier_options + + def test_build_measure_prompt(self, example, classifier_conversation, classifier_options): + scorer = ClassifierScorer( + "test_classifier", + classifier_conversation, + classifier_options + ) + + prompt = scorer.build_measure_prompt(example) + assert "This is a test response" in prompt[0]["content"] + + def test_process_response(self, classifier_conversation, classifier_options): + scorer = ClassifierScorer( + "test_classifier", + classifier_conversation, + classifier_options + ) + + response = {"choice": "positive", "reason": "Test reason"} + score, reason = scorer.process_response(response) + assert score == 1.0 + assert reason == "Test reason" + + def test_process_response_invalid_choice(self, classifier_conversation, classifier_options): + scorer = ClassifierScorer( + "test_classifier", + classifier_conversation, + classifier_options + ) + + response = {"choice": "invalid", "reason": "Test reason"} + with pytest.raises(ValueError): + scorer.process_response(response) + + def test_success_check(self, classifier_conversation, classifier_options): + scorer = ClassifierScorer( + "test_classifier", + classifier_conversation, + classifier_options + ) + + scorer.score = 1.0 + assert scorer.success_check() is True + + scorer.score = 0.0 + assert scorer.success_check() is False diff --git a/tests/scorers/test_score.py b/tests/scorers/test_score.py new file mode 100644 index 00000000..08354fd9 --- /dev/null +++ b/tests/scorers/test_score.py @@ -0,0 +1,974 @@ +import pytest +from unittest.mock import AsyncMock, Mock, patch +from rich.progress import Progress, SpinnerColumn, TextColumn +import asyncio + +from judgeval.scorers.score import (safe_a_score_example, + score_task, + score_with_indicator, + a_execute_scoring, + a_eval_examples_helper) +from judgeval.scorers import CustomScorer +from judgeval.data import Example, ScoringResult, ProcessExample, ScorerData +from judgeval.common.exceptions import MissingTestCaseParamsError + + +class MockCustomScorer(CustomScorer): + def score_example(self, example, *args, **kwargs): + pass + + async def a_score_example(self, example, *args, **kwargs): + pass + + def success_check(self): + return True + + +@pytest.fixture +def example(): + return Example( + input="test input", + actual_output="test output", + example_id="test_id" + ) + + +@pytest.fixture +def basic_scorer(): + return MockCustomScorer( + score_type="test_scorer", + threshold=0.5 + ) + + +@pytest.fixture +def scorers(basic_scorer): + """Fixture providing a list of test scorers""" + return [ + MockCustomScorer(score_type="test_scorer", threshold=0.5), + MockCustomScorer(score_type="test_scorer", threshold=0.5) + ] + + +@pytest.fixture +def progress(): + return Progress( + SpinnerColumn(style="rgb(106,0,255)"), + TextColumn("[progress.description]{task.description}"), + transient=True + ) + + +@pytest.mark.asyncio +async 
def test_successful_scoring(example, basic_scorer): + """Test basic successful scoring case""" + basic_scorer.a_score_example = AsyncMock() + + await safe_a_score_example( + scorer=basic_scorer, + example=example, + ignore_errors=True, + skip_on_missing_params=True + ) + + basic_scorer.a_score_example.assert_called_once_with(example, _show_indicator=False) + assert basic_scorer.error is None + assert not hasattr(basic_scorer, 'skipped') or not basic_scorer.skipped + + +@pytest.mark.asyncio +async def test_missing_params_with_skip(example, basic_scorer): + """Test handling of MissingTestCaseParamsError when skip_on_missing_params is True""" + async def mock_score(*args, **kwargs): + raise MissingTestCaseParamsError("Missing required params") + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + await safe_a_score_example( + scorer=basic_scorer, + example=example, + ignore_errors=True, + skip_on_missing_params=True + ) + + assert basic_scorer.skipped is True + assert basic_scorer.error is None + + +@pytest.mark.asyncio +async def test_missing_params_with_ignore_errors(example, basic_scorer): + """Test handling of MissingTestCaseParamsError when ignore_errors is True but not skipping""" + async def mock_score(*args, **kwargs): + raise MissingTestCaseParamsError("Missing required params") + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + await safe_a_score_example( + scorer=basic_scorer, + example=example, + ignore_errors=True, + skip_on_missing_params=False + ) + + assert basic_scorer.error == "Missing required params" + assert basic_scorer.success is False + + +@pytest.mark.asyncio +async def test_missing_params_raises_error(example, basic_scorer): + """Test that MissingTestCaseParamsError is raised when appropriate""" + async def mock_score(*args, **kwargs): + raise MissingTestCaseParamsError("Missing required params") + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + with pytest.raises(MissingTestCaseParamsError): + await safe_a_score_example( + scorer=basic_scorer, + example=example, + ignore_errors=False, + skip_on_missing_params=False + ) + + +@pytest.mark.asyncio +async def test_type_error_handling(example, basic_scorer): + """Test handling of TypeError when _show_indicator is not accepted""" + calls = [] + + async def mock_score(*args, **kwargs): + calls.append(kwargs) + if '_show_indicator' in kwargs: + raise TypeError("_show_indicator not accepted") + return True + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + await safe_a_score_example( + scorer=basic_scorer, + example=example, + ignore_errors=True, + skip_on_missing_params=True + ) + + assert len(calls) == 2 # Should try twice - once with _show_indicator, once without + assert '_show_indicator' in calls[0] # First attempt includes _show_indicator + assert '_show_indicator' not in calls[1] # Second attempt doesn't include _show_indicator + + +@pytest.mark.asyncio +async def test_general_exception_with_ignore(example, basic_scorer): + """Test handling of general exceptions when ignore_errors is True""" + async def mock_score(*args, **kwargs): + raise ValueError("Test error") + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + await safe_a_score_example( + scorer=basic_scorer, + example=example, + ignore_errors=True, + skip_on_missing_params=True + ) + + assert basic_scorer.error == "Test error" + assert basic_scorer.success is False + + +@pytest.mark.asyncio +async def test_general_exception_raises(example, basic_scorer): + 
"""Test that general exceptions are raised when ignore_errors is False""" + async def mock_score(*args, **kwargs): + raise ValueError("Test error") + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + with pytest.raises(ValueError): + await safe_a_score_example( + scorer=basic_scorer, + example=example, + ignore_errors=False, + skip_on_missing_params=True + ) + + +@pytest.mark.asyncio +async def test_error_with_missing_params(example, basic_scorer): + """Test handling of TypeError followed by MissingTestCaseParamsError""" + calls = [] + + async def mock_score(*args, **kwargs): + calls.append(kwargs) + if '_show_indicator' in kwargs: + raise TypeError("_show_indicator not accepted") + raise MissingTestCaseParamsError("Missing params") + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + await safe_a_score_example( + scorer=basic_scorer, + example=example, + ignore_errors=True, + skip_on_missing_params=True + ) + + assert basic_scorer.skipped is True + assert len(calls) == 2 + + +@pytest.mark.asyncio +async def test_task_successful_scoring(example, basic_scorer, progress): + """Test basic successful scoring case with progress tracking""" + task_id = progress.add_task(description="Test Task", total=100) + basic_scorer.a_score_example = AsyncMock() + + with progress: + await score_task( + task_id=task_id, + progress=progress, + scorer=basic_scorer, + example=example + ) + + basic_scorer.a_score_example.assert_called_once_with(example, _show_indicator=False) + assert progress.tasks[task_id].completed == 100 + assert "Completed" in progress.tasks[task_id].description + + +@pytest.mark.asyncio +async def test_task_missing_params_with_skip(example, basic_scorer, progress): + """Test handling of MissingTestCaseParamsError when skip_on_missing_params is True""" + task_id = progress.add_task(description="Test Task", total=100) + + async def mock_score(*args, **kwargs): + raise MissingTestCaseParamsError("Missing required params") + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + with progress: + await score_task( + task_id=task_id, + progress=progress, + scorer=basic_scorer, + example=example, + skip_on_missing_params=True + ) + + assert basic_scorer.skipped is True + assert not progress.tasks[task_id].completed # Task should not be marked as complete + + +@pytest.mark.asyncio +async def test_task_missing_params_with_ignore_errors(example, basic_scorer, progress): + """Test handling of MissingTestCaseParamsError when ignore_errors is True""" + task_id = progress.add_task(description="Test Task", total=100) + + async def mock_score(*args, **kwargs): + raise MissingTestCaseParamsError("Missing required params") + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + with progress: + await score_task( + task_id=task_id, + progress=progress, + scorer=basic_scorer, + example=example, + skip_on_missing_params=False, + ignore_errors=True + ) + + assert basic_scorer.error == "Missing required params" + assert basic_scorer.success is False + assert progress.tasks[task_id].completed == 100 + assert "Failed" in progress.tasks[task_id].description + + +@pytest.mark.asyncio +async def test_task_missing_params_raises_error(example, basic_scorer, progress): + """Test that MissingTestCaseParamsError is raised when appropriate""" + task_id = progress.add_task(description="Test Task", total=100) + + async def mock_score(*args, **kwargs): + raise MissingTestCaseParamsError("Missing required params") + + basic_scorer.a_score_example = 
AsyncMock(side_effect=mock_score) + + with pytest.raises(MissingTestCaseParamsError): + with progress: + await score_task( + task_id=task_id, + progress=progress, + scorer=basic_scorer, + example=example, + skip_on_missing_params=False, + ignore_errors=False + ) + + +@pytest.mark.asyncio +async def test_task_type_error_handling(example, basic_scorer, progress): + """Test handling of TypeError when _show_indicator is not accepted""" + task_id = progress.add_task(description="Test Task", total=100) + calls = [] + + async def mock_score(*args, **kwargs): + calls.append(kwargs) + if '_show_indicator' in kwargs: + raise TypeError("_show_indicator not accepted") + return True + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + with progress: + await score_task( + task_id=task_id, + progress=progress, + scorer=basic_scorer, + example=example + ) + + assert len(calls) == 2 # Should try twice - once with _show_indicator, once without + assert progress.tasks[task_id].completed == 100 + assert "Completed" in progress.tasks[task_id].description + + +@pytest.mark.asyncio +async def test_task_general_exception_with_ignore(example, basic_scorer, progress): + """Test handling of general exceptions when ignore_errors is True""" + task_id = progress.add_task(description="Test Task", total=100) + + async def mock_score(*args, **kwargs): + raise ValueError("Test error") + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + with progress: + await score_task( + task_id=task_id, + progress=progress, + scorer=basic_scorer, + example=example, + ignore_errors=True + ) + + assert basic_scorer.error == "Test error" + assert basic_scorer.success is False + assert progress.tasks[task_id].completed == 100 + assert "Failed" in progress.tasks[task_id].description + + +@pytest.mark.asyncio +async def test_task_general_exception_raises(example, basic_scorer, progress): + """Test that general exceptions are raised when ignore_errors is False""" + task_id = progress.add_task(description="Test Task", total=100) + + async def mock_score(*args, **kwargs): + raise ValueError("Test error") + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + with pytest.raises(ValueError): + with progress: + await score_task( + task_id=task_id, + progress=progress, + scorer=basic_scorer, + example=example, + ignore_errors=False + ) + + +@pytest.mark.asyncio +async def test_task_progress_timing(example, basic_scorer, progress): + """Test that timing information is correctly added to progress description""" + task_id = progress.add_task(description="Test Task", total=100) + + async def mock_score(*args, **kwargs): + await asyncio.sleep(0.1) # Simulate some work + return True + + basic_scorer.a_score_example = AsyncMock(side_effect=mock_score) + + with progress: + await score_task( + task_id=task_id, + progress=progress, + scorer=basic_scorer, + example=example + ) + + assert "(" in progress.tasks[task_id].description + assert "s)" in progress.tasks[task_id].description # Should show timing + + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.safe_a_score_example') +@patch('judgeval.scorers.score.score_task') +async def test_score_with_indicator_no_show(mock_score_task, mock_safe_score, example, scorers): + """Test scoring without showing the indicator""" + mock_safe_score.return_value = AsyncMock()() + + await score_with_indicator( + scorers=scorers, + example=example, + ignore_errors=True, + skip_on_missing_params=True, + show_indicator=False + ) + + assert mock_safe_score.call_count 
== 2 # Called once for each scorer + assert mock_score_task.call_count == 0 # Should not be called when show_indicator is False + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.Progress') +@patch('judgeval.scorers.score.score_task') +@patch('judgeval.scorers.score.scorer_console_msg') +async def test_score_with_indicator_show(mock_console_msg, mock_score_task, mock_progress, example, scorers): + """Test scoring with progress indicator""" + mock_progress_instance = Mock() + mock_progress.return_value.__enter__.return_value = mock_progress_instance + mock_progress_instance.add_task.return_value = 1 + mock_score_task.return_value = AsyncMock()() + mock_console_msg.return_value = "Test Progress Message" + + await score_with_indicator( + scorers=scorers, + example=example, + ignore_errors=True, + skip_on_missing_params=True, + show_indicator=True + ) + + assert mock_progress_instance.add_task.call_count == 2 # Called once for each scorer + assert mock_score_task.call_count == 2 # Called once for each scorer + +@pytest.mark.asyncio +async def test_score_with_indicator_error_handling(example, scorers): + """Test error handling during scoring""" + # Make first scorer raise an error + async def mock_error(*args, **kwargs): + raise ValueError("Test error") + + async def mock_success(*args, **kwargs): + # Simulate successful scoring + scorers[1].success = True + return True + + scorers[0].a_score_example = AsyncMock(side_effect=mock_error) + scorers[1].a_score_example = AsyncMock(side_effect=mock_success) + + await score_with_indicator( + scorers=scorers, + example=example, + ignore_errors=True, + skip_on_missing_params=True, + show_indicator=False + ) + + assert scorers[0].error == "Test error" + assert scorers[0].success is False + assert scorers[1].error is None + assert scorers[1].success is True + +@pytest.mark.asyncio +async def test_score_with_indicator_missing_params(example, scorers): + """Test handling of missing parameters""" + async def mock_missing_params(*args, **kwargs): + raise MissingTestCaseParamsError("Missing params") + + # Set up mock for first scorer to raise error + scorers[0].a_score_example = AsyncMock(side_effect=mock_missing_params) + # Set up mock for second scorer to succeed + scorers[1].a_score_example = AsyncMock(return_value=True) + + await score_with_indicator( + scorers=scorers, + example=example, + ignore_errors=True, + skip_on_missing_params=True, + show_indicator=False + ) + + assert scorers[0].skipped is True + assert not hasattr(scorers[1], 'skipped') # Second scorer should not be skipped, so attribute shouldn't exist + +@pytest.mark.asyncio +async def test_score_with_indicator_raises_error(example, scorers): + """Test that errors are raised when ignore_errors is False""" + async def mock_error(*args, **kwargs): + raise ValueError("Test error") + + scorers[0].a_score_example = AsyncMock(side_effect=mock_error) + + with pytest.raises(ValueError): + await score_with_indicator( + scorers=scorers, + example=example, + ignore_errors=False, # Errors should be raised + skip_on_missing_params=True, + show_indicator=False + ) + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.Progress') +async def test_score_with_indicator_empty_scorers(mock_progress, example): + """Test handling of empty scorers list""" + await score_with_indicator( + scorers=[], + example=example, + ignore_errors=True, + skip_on_missing_params=True, + show_indicator=False + ) + + mock_progress.assert_not_called() + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.Progress') +async def 
test_score_with_indicator_concurrent_execution(mock_progress, example, scorers): + """Test that scorers are executed concurrently""" + completed_order = [] + + async def mock_delayed_score(*args, **kwargs): + await asyncio.sleep(0.1) # First scorer + completed_order.append(1) + + async def mock_quick_score(*args, **kwargs): + completed_order.append(2) # Second scorer + + # Create two separate scorer instances instead of using the same one twice + scorer1 = MockCustomScorer(score_type="test_scorer", threshold=0.5) + scorer2 = MockCustomScorer(score_type="test_scorer", threshold=0.5) + + scorer1.a_score_example = AsyncMock(side_effect=mock_delayed_score) + scorer2.a_score_example = AsyncMock(side_effect=mock_quick_score) + + await score_with_indicator( + scorers=[scorer1, scorer2], # Use the new separate instances + example=example, + ignore_errors=True, + skip_on_missing_params=True, + show_indicator=False + ) + + # Second scorer should complete before first scorer due to delay + assert completed_order == [2, 1] + + +@pytest.fixture +def mock_example(): + return Example( + input="test input", + actual_output="test output", + example_id="test_id", + timestamp="20241225_000004" + ) + +@pytest.fixture +def mock_examples(): + return [ + Example(input=f"test input {i}", + actual_output=f"test output {i}", + example_id=f"test_id_{i}", + timestamp="20241225_000004") + for i in range(3) + ] + +@pytest.fixture +def mock_scorer(): + class MockScorer(CustomScorer): + def __init__(self): + self.success = None + self.error = None + self.skipped = False + self.verbose_mode = False + self._add_model = Mock() + + return MockScorer() + +@pytest.fixture +def mock_scoring_result(): + return Mock(spec=ScoringResult) + +# Tests +@pytest.mark.asyncio +@patch('judgeval.scorers.score.clone_scorers') +@patch('judgeval.scorers.score.a_eval_examples_helper') +async def test_basic_execution(mock_helper, mock_clone_scorers, mock_examples, mock_scorer, mock_scoring_result): + """Test basic execution with single scorer and multiple examples""" + # Setup mocks + mock_clone_scorers.return_value = [mock_scorer] + mock_helper.return_value = None + + results = await a_execute_scoring( + examples=mock_examples, + scorers=[mock_scorer], + show_indicator=False + ) + + assert len(results) == len(mock_examples) + assert mock_helper.call_count == len(mock_examples) + assert mock_clone_scorers.call_count == len(mock_examples) + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.clone_scorers') +@patch('judgeval.scorers.score.a_eval_examples_helper') +async def test_empty_scorers(mock_helper, mock_clone_scorers, mock_examples): + """Test execution with no scorers""" + results = await a_execute_scoring( + examples=mock_examples, + scorers=[], + show_indicator=False + ) + + assert len(results) == len(mock_examples) + mock_helper.assert_not_called() + mock_clone_scorers.assert_not_called() + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.clone_scorers') +@patch('judgeval.scorers.score.a_eval_examples_helper') +async def test_empty_examples(mock_helper, mock_clone_scorers, mock_scorer): + """Test execution with no examples""" + results = await a_execute_scoring( + examples=[], + scorers=[mock_scorer], + show_indicator=False + ) + + assert len(results) == 0 + mock_helper.assert_not_called() + mock_clone_scorers.assert_not_called() + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.clone_scorers') +@patch('judgeval.scorers.score.a_eval_examples_helper') +async def test_error_handling(mock_helper, mock_clone_scorers, 
mock_examples, mock_scorer): + """Test error handling when helper raises exception""" + mock_clone_scorers.return_value = [mock_scorer] + mock_helper.side_effect = ValueError("Test error") + + # Test with ignore_errors=True + results = await a_execute_scoring( + examples=mock_examples, + scorers=[mock_scorer], + ignore_errors=True, + skip_on_missing_params=True, + show_indicator=False, + _use_bar_indicator=False + ) + + # Add assertions to verify error was handled + assert len(results) == len(mock_examples) + assert all(result is None for result in results) # Results should be None when errors are ignored + + # Test with ignore_errors=False + with pytest.raises(ValueError): + await a_execute_scoring( + examples=mock_examples, + scorers=[mock_scorer], + ignore_errors=False, + skip_on_missing_params=True, + show_indicator=False, + _use_bar_indicator=False + ) + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.clone_scorers') +@patch('judgeval.scorers.score.a_eval_examples_helper') +async def test_max_concurrent_limit(mock_helper, mock_clone_scorers, mock_examples, mock_scorer): + """Test concurrent execution limit""" + mock_clone_scorers.return_value = [mock_scorer] + + async def delayed_execution(*args, **kwargs): + await asyncio.sleep(0.1) + return None + + mock_helper.side_effect = delayed_execution + + start_time = asyncio.get_event_loop().time() + + await a_execute_scoring( + examples=mock_examples, + scorers=[mock_scorer], + max_concurrent=1, # Force sequential execution + show_indicator=False + ) + + end_time = asyncio.get_event_loop().time() + duration = end_time - start_time + + # Duration should be at least (num_examples * 0.1) seconds due to sequential execution + assert duration >= len(mock_examples) * 0.1 + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.clone_scorers') +@patch('judgeval.scorers.score.a_eval_examples_helper') +async def test_throttle_value(mock_helper, mock_clone_scorers, mock_examples, mock_scorer): + """Test throttling between tasks""" + mock_clone_scorers.return_value = [mock_scorer] + start_time = asyncio.get_event_loop().time() + + await a_execute_scoring( + examples=mock_examples, + scorers=[mock_scorer], + throttle_value=0.1, + show_indicator=False + ) + + end_time = asyncio.get_event_loop().time() + duration = end_time - start_time + + # Duration should be at least (num_examples - 1) * throttle_value + assert duration >= (len(mock_examples) - 1) * 0.1 + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.clone_scorers') +@patch('judgeval.scorers.score.a_eval_examples_helper') +@patch('judgeval.scorers.score.tqdm_asyncio') +async def test_progress_indicator(mock_tqdm, mock_helper, mock_clone_scorers, mock_examples, mock_scorer): + """Test progress indicator functionality""" + mock_clone_scorers.return_value = [mock_scorer] + + await a_execute_scoring( + examples=mock_examples, + scorers=[mock_scorer], + show_indicator=True, + _use_bar_indicator=True + ) + + assert mock_tqdm.called + mock_helper.assert_called() + +@pytest.mark.asyncio +@patch('judgeval.scorers.score.clone_scorers') +@patch('judgeval.scorers.score.a_eval_examples_helper') +async def test_model_assignment(mock_helper, mock_clone_scorers, mock_examples, mock_scorer): + """Test model assignment to scorers""" + mock_clone_scorers.return_value = [mock_scorer] + test_model = "test_model" + + await a_execute_scoring( + examples=mock_examples, + scorers=[mock_scorer], + model=test_model, + show_indicator=False + ) + + mock_scorer._add_model.assert_called_once_with(test_model) + 
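# --- Illustrative sketch: the concurrency pattern these tests assume ----------
# test_max_concurrent_limit and test_throttle_value above only make timing
# assertions, so they implicitly assume that a_execute_scoring bounds
# parallelism (for example with a semaphore) and sleeps `throttle_value`
# seconds between task launches. The hypothetical helper below shows that
# pattern in isolation; the real implementation in judgeval.scorers.score may
# be structured differently.
import asyncio
from typing import Awaitable, Callable, List, TypeVar

T = TypeVar("T")

async def bounded_gather(
    factories: List[Callable[[], Awaitable[T]]],
    max_concurrent: int = 100,
    throttle_value: float = 0.0,
) -> List[T]:
    semaphore = asyncio.Semaphore(max_concurrent)

    async def run(factory: Callable[[], Awaitable[T]]) -> T:
        # At most `max_concurrent` coroutines hold the semaphore at once,
        # which is why max_concurrent=1 forces sequential execution in
        # test_max_concurrent_limit.
        async with semaphore:
            return await factory()

    scheduled = []
    for factory in factories:
        scheduled.append(asyncio.create_task(run(factory)))
        if throttle_value:
            # Spacing out launches adds at least
            # (len(factories) - 1) * throttle_value seconds of wall time,
            # which is the lower bound test_throttle_value checks.
            await asyncio.sleep(throttle_value)
    return await asyncio.gather(*scheduled)
# ------------------------------------------------------------------------------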
+@pytest.mark.asyncio +@patch('judgeval.scorers.score.clone_scorers') +@patch('judgeval.scorers.score.a_eval_examples_helper') +async def test_verbose_mode_setting(mock_helper, mock_clone_scorers, mock_examples, mock_scorer): + """Test verbose mode is properly set on scorers""" + mock_clone_scorers.return_value = [mock_scorer] + + await a_execute_scoring( + examples=mock_examples, + scorers=[mock_scorer], + verbose_mode=True, + show_indicator=False + ) + + assert mock_scorer.verbose_mode is True + + +@pytest.fixture +def mock_example(): + """Create a mock Example object""" + return Example( + name="test_example", + input="test input", + actual_output="test output", + expected_output="expected output", + context=["context1", "context2"], + retrieval_context=["retrieval1"], + trace_id="test_trace_123" + ) + +@pytest.fixture +def mock_scorer(): + """Create a mock CustomScorer""" + scorer = Mock(spec=CustomScorer) + scorer.__name__ = "MockScorer" + scorer.threshold = 0.8 + scorer.strict_mode = True + scorer.evaluation_model = "test-model" + scorer.score = 0.9 + scorer.reason = "Test reason" + scorer.success_check.return_value = True + scorer.evaluation_cost = 0.1 + scorer.verbose_logs = "Test logs" + scorer.additional_metadata = {"key": "value"} + scorer.skipped = False + scorer.error = None + return scorer + +@pytest.fixture +def mock_scoring_results(): + """Create a mock list to store ScoringResults""" + return [None] * 3 # List with 3 None elements + +@pytest.fixture +def mock_process_example(mock_example): + """Create a mock ProcessExample""" + return ProcessExample( + name=mock_example.name, + input=mock_example.input, + actual_output=mock_example.actual_output, + expected_output=mock_example.expected_output, + context=mock_example.context, + retrieval_context=mock_example.retrieval_context, + trace_id=mock_example.trace_id + ) + +@pytest.mark.asyncio +async def test_a_eval_examples_helper_success( + mock_example, + mock_scorer, + mock_scoring_results, + mock_process_example +): + """Test successful execution of a_eval_examples_helper""" + + # Create list of scorers + scorers = [mock_scorer] + + # Mock the external functions + with patch('judgeval.scorers.score.create_process_example', return_value=mock_process_example) as mock_create_process, \ + patch('judgeval.scorers.score.score_with_indicator', new_callable=AsyncMock) as mock_score_with_indicator, \ + patch('judgeval.scorers.score.create_scorer_data') as mock_create_scorer_data, \ + patch('judgeval.scorers.score.generate_scoring_result') as mock_generate_result: + + # Setup mock returns + mock_scorer_data = ScorerData( + name=mock_scorer.__name__, + threshold=mock_scorer.threshold, + success=True, + score=mock_scorer.score, + reason=mock_scorer.reason, + strict_mode=mock_scorer.strict_mode, + evaluation_model=mock_scorer.evaluation_model, + error=None, + evaluation_cost=mock_scorer.evaluation_cost, + verbose_logs=mock_scorer.verbose_logs, + additional_metadata=mock_scorer.additional_metadata + ) + mock_create_scorer_data.return_value = mock_scorer_data + + mock_scoring_result = ScoringResult( + success=True, + scorers_data=[mock_scorer_data], + input=mock_example.input, + actual_output=mock_example.actual_output, + expected_output=mock_example.expected_output, + context=mock_example.context, + retrieval_context=mock_example.retrieval_context, + trace_id=mock_example.trace_id + ) + mock_generate_result.return_value = mock_scoring_result + + # Execute the function + await a_eval_examples_helper( + scorers=scorers, + 
example=mock_example, + scoring_results=mock_scoring_results, + score_index=0, + ignore_errors=True, + skip_on_missing_params=True, + show_indicator=True, + _use_bar_indicator=False, + pbar=None + ) + + # Verify the calls + mock_create_process.assert_called_once_with(mock_example) + mock_score_with_indicator.assert_called_once_with( + scorers=scorers, + example=mock_example, + skip_on_missing_params=True, + ignore_errors=True, + show_indicator=True + ) + mock_create_scorer_data.assert_called_once_with(mock_scorer) + mock_generate_result.assert_called_once_with(mock_process_example) + + # Verify the result was stored correctly + assert mock_scoring_results[0] == mock_scoring_result + +@pytest.mark.asyncio +async def test_a_eval_examples_helper_with_skipped_scorer( + mock_example, + mock_scorer, + mock_scoring_results, + mock_process_example +): + """Test execution when scorer is skipped""" + + scorers = [mock_scorer] + + with patch('judgeval.scorers.score.create_process_example', return_value=mock_process_example) as mock_create_process, \ + patch('judgeval.scorers.score.score_with_indicator', new_callable=AsyncMock) as mock_score_with_indicator, \ + patch('judgeval.scorers.score.create_scorer_data') as mock_create_scorer_data, \ + patch('judgeval.scorers.score.generate_scoring_result') as mock_generate_result: + + # Mock score_with_indicator to simulate skipped scorer behavior + async def mock_score(*args, **kwargs): + # Set scorer as skipped after score_with_indicator is called + mock_scorer.skipped = True + return None + + mock_score_with_indicator.side_effect = mock_score + + await a_eval_examples_helper( + scorers=scorers, + example=mock_example, + scoring_results=mock_scoring_results, + score_index=1, + ignore_errors=True, + skip_on_missing_params=True, + show_indicator=True, + _use_bar_indicator=False, + pbar=None + ) + + # Verify that create_scorer_data was not called since scorer was skipped + mock_create_scorer_data.assert_not_called() + + # Verify that generate_scoring_result was still called (but with no scorer data) + mock_generate_result.assert_called_once_with(mock_process_example) + +@pytest.mark.asyncio +async def test_a_eval_examples_helper_with_progress_bar( + mock_example, + mock_scorer, + mock_scoring_results, + mock_process_example +): + """Test execution with progress bar""" + + scorers = [mock_scorer] + mock_pbar = Mock() + + with patch('judgeval.scorers.score.create_process_example', return_value=mock_process_example), \ + patch('judgeval.scorers.score.score_with_indicator', new_callable=AsyncMock), \ + patch('judgeval.scorers.score.create_scorer_data'), \ + patch('judgeval.scorers.score.generate_scoring_result'): + + await a_eval_examples_helper( + scorers=scorers, + example=mock_example, + scoring_results=mock_scoring_results, + score_index=2, + ignore_errors=True, + skip_on_missing_params=True, + show_indicator=True, + _use_bar_indicator=True, + pbar=mock_pbar + ) + + # Verify progress bar was updated + mock_pbar.update.assert_called_once_with(1) + diff --git a/tests/scorers/test_scorer_utils.py b/tests/scorers/test_scorer_utils.py new file mode 100644 index 00000000..c10ac0a6 --- /dev/null +++ b/tests/scorers/test_scorer_utils.py @@ -0,0 +1,175 @@ +import pytest +import asyncio +import json +from unittest.mock import MagicMock, patch +from rich.console import Console + +from judgeval.scorers.utils import ( + clone_scorers, + scorer_console_msg, + scorer_progress_meter, + parse_response_json, + print_verbose_logs, + create_verbose_logs, + 
get_or_create_event_loop, +) +from judgeval.scorers import CustomScorer +from judgeval.data import Example + + +class MockCustomScorer(CustomScorer): + """Mock implementation of CustomScorer for testing""" + def __init__(self, **kwargs): + super().__init__( + score_type="mock_scorer", + threshold=0.7, + **kwargs + ) + self.__name__ = "MockScorer" + + def score_example(self, example: Example, *args, **kwargs) -> float: + return 1.0 + + async def a_score_example(self, example: Example, *args, **kwargs) -> float: + return 1.0 + + def success_check(self) -> bool: + return True + + +@pytest.fixture +def mock_scorer(): + return MockCustomScorer( + evaluation_model="gpt-4", + strict_mode=True, + async_mode=True, + verbose_mode=True + ) + + +@pytest.fixture +def mock_scorers(): + return [ + MockCustomScorer(evaluation_model="gpt-4o"), + MockCustomScorer(evaluation_model="gpt-4o") + ] + + +def test_clone_scorers(mock_scorers): + """Test that scorers are properly cloned with all attributes""" + cloned = clone_scorers(mock_scorers) + + assert len(cloned) == len(mock_scorers) + for original, clone in zip(mock_scorers, cloned): + assert type(original) == type(clone) + assert original.score_type == clone.score_type + assert original.threshold == clone.threshold + assert original.evaluation_model == clone.evaluation_model + + +def test_scorer_console_msg(mock_scorer): + """Test console message formatting""" + # Test with default async_mode + msg = scorer_console_msg(mock_scorer) + assert "MockScorer" in msg + assert "gpt-4" in msg + assert "async_mode=True" in msg + + # Test with explicit async_mode + msg = scorer_console_msg(mock_scorer, async_mode=False) + assert "async_mode=False" in msg + + +@pytest.mark.asyncio +async def test_scorer_progress_meter(mock_scorer, capsys): + """Test progress meter display""" + # Test with display_meter=True + with scorer_progress_meter(mock_scorer, display_meter=True): + pass + + # Test with display_meter=False + with scorer_progress_meter(mock_scorer, display_meter=False): + pass + + +def test_parse_response_json_valid(): + """Test parsing valid JSON responses""" + valid_json = '{"score": 0.8, "reason": "test"}' + result = parse_response_json(valid_json) + assert result == {"score": 0.8, "reason": "test"} + + # Test JSON with surrounding text + text_with_json = 'Some text {"score": 0.9} more text' + result = parse_response_json(text_with_json) + assert result == {"score": 0.9} + + +def test_parse_response_json_invalid(mock_scorer): + """ + Test parsing invalid JSON responses, but still completes the JSON parsing without error. + """ + invalid_json = '{"score": 0.8, "reason": "test"' # Missing closing brace + + # the parse_response_json function should add the missing brace and parse the JSON + assert parse_response_json(invalid_json, scorer=mock_scorer) == {"score": 0.8, "reason": "test"} + assert mock_scorer.error is None + +def test_parse_response_json_missing_beginning_brace(mock_scorer): + """ + Test that parse_response_json raises an error when JSON is missing opening brace. 
+ """ + invalid_json = 'score": 0.8, "reason": "test}' # Missing opening brace + + with pytest.raises(ValueError) as exc_info: + parse_response_json(invalid_json, scorer=mock_scorer) + + assert "Evaluation LLM outputted an invalid JSON" in str(exc_info.value) + assert mock_scorer.error is not None + + +def test_create_verbose_logs(mock_scorer, capsys): + """Test verbose logs creation""" + steps = ["Step 1", "Step 2", "Final step"] + logs = create_verbose_logs(mock_scorer, steps) + + assert "Step 1" in logs + assert "Step 2" in logs + + # Check printed output when verbose_mode is True + captured = capsys.readouterr() + assert "MockScorer Verbose Logs" in captured.out + + # Test with verbose_mode=False + mock_scorer.verbose_mode = False + create_verbose_logs(mock_scorer, steps) + captured = capsys.readouterr() + assert captured.out == "" + + +@pytest.mark.asyncio +async def test_get_or_create_event_loop(): + """Test event loop creation and retrieval""" + # Remove the is_running check since the loop will be running under pytest-asyncio + loop = get_or_create_event_loop() + assert isinstance(loop, asyncio.AbstractEventLoop) + + # Test with running loop + async def dummy_task(): + pass + + loop.create_task(dummy_task()) + loop2 = get_or_create_event_loop() + assert loop2 is not None + + assert loop.is_running() + + +def test_print_verbose_logs(capsys): + """Test verbose logs printing""" + metric = "TestMetric" + logs = "Test logs content" + print_verbose_logs(metric, logs) + + captured = capsys.readouterr() + assert "TestMetric Verbose Logs" in captured.out + assert "Test logs content" in captured.out