diff --git a/src/demo/sequence_test.py b/src/demo/sequence_test.py index 343b375c..89c8d3d8 100644 --- a/src/demo/sequence_test.py +++ b/src/demo/sequence_test.py @@ -159,7 +159,7 @@ def generate_itinerary(destination, start_date, end_date): judgment.assert_test( project_name="travel_agent_demo", examples=[example], - scorers=[ToolOrderScorer(threshold=0.5)], + scorers=[ToolOrderScorer()], model="gpt-4.1-mini", function=generate_itinerary, tracer=tracer, diff --git a/src/judgeval/data/example.py b/src/judgeval/data/example.py index d7dd6e7e..deeb9972 100644 --- a/src/judgeval/data/example.py +++ b/src/judgeval/data/example.py @@ -8,6 +8,7 @@ from pydantic import BaseModel, Field, field_validator from enum import Enum from datetime import datetime +from judgeval.data.tool import Tool import time @@ -31,7 +32,7 @@ class Example(BaseModel): retrieval_context: Optional[List[str]] = None additional_metadata: Optional[Dict[str, Any]] = None tools_called: Optional[List[str]] = None - expected_tools: Optional[List[Dict[str, Any]]] = None + expected_tools: Optional[List[Tool]] = None name: Optional[str] = None example_id: str = Field(default_factory=lambda: str(uuid4())) example_index: Optional[int] = None @@ -82,17 +83,17 @@ def validate_expected_output(cls, v): raise ValueError(f"All items in expected_output must be strings but got {v}") return v - @field_validator('expected_tools', mode='before') + @field_validator('expected_tools') @classmethod def validate_expected_tools(cls, v): if v is not None: if not isinstance(v, list): - raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}") + raise ValueError(f"Expected tools must be a list of Tools or None but got {v} of type {type(v)}") - # Check that each item in the list is a dictionary + # Check that each item in the list is a Tool for i, item in enumerate(v): - if not isinstance(item, dict): - raise ValueError(f"Expected tools must be a list of dictionaries, but item at 
class Tool(BaseModel):
    """A tool invocation expected (or observed) during an evaluation run.

    Used by ``Example.expected_tools`` and ``TraceSpan.expected_tools`` to
    describe which tool an agent should call and with what arguments.

    Attributes:
        tool_name: Name of the tool. An empty name triggers a warning rather
            than a hard validation error.
        parameters: Optional mapping of argument names to values for the call.
    """

    tool_name: str
    parameters: Optional[Dict[str, Any]] = None

    @field_validator('tool_name')
    @classmethod
    def validate_tool_name(cls, v):
        """Warn (instead of raising) when the tool name is empty.

        Runs in 'after' mode: pydantic has already enforced ``str``, so only
        the empty-string case can reach the warning here.
        """
        if not v:
            warnings.warn("Tool name is empty or None", UserWarning)
        return v

    @field_validator('parameters', mode='before')
    @classmethod
    def validate_parameters(cls, v):
        """Warn when parameters is not a dict.

        NOTE: this must run in 'before' mode — in the default 'after' mode
        pydantic has already enforced ``Optional[Dict[str, Any]]``, which makes
        the non-dict branch unreachable dead code. In 'before' mode the warning
        fires first; pydantic still rejects the value afterwards.
        """
        if v is not None and not isinstance(v, dict):
            warnings.warn(f"Parameters should be a dictionary, got {type(v)}", UserWarning)
        return v
judgeval.data import Example - +from judgeval.data.tool import Tool def test_basic_example_creation(): example = Example( @@ -30,7 +30,7 @@ def test_full_example_creation(): retrieval_context=["retrieval1", "retrieval2"], additional_metadata={"key": "value"}, tools_called=["tool1", "tool2"], - expected_tools=[{"tool_name": "expected_tool1"}, {"tool_name": "expected_tool2"}], + expected_tools=[Tool(tool_name="expected_tool1"), Tool(tool_name="expected_tool2")], name="test example", example_id="123", timestamp="20240101_120000", @@ -43,7 +43,7 @@ def test_full_example_creation(): assert example.retrieval_context == ["retrieval1", "retrieval2"] assert example.additional_metadata == {"key": "value"} assert example.tools_called == ["tool1", "tool2"] - assert example.expected_tools == [{"tool_name": "expected_tool1"}, {"tool_name": "expected_tool2"}] + assert example.expected_tools == [Tool(tool_name="expected_tool1"), Tool(tool_name="expected_tool2")] assert example.name == "test example" assert example.example_id == "123" assert example.timestamp == "20240101_120000"