diff --git a/src/demo/dataset.py b/src/demo/dataset.py
deleted file mode 100644
index 5f1b5182..00000000
--- a/src/demo/dataset.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from judgeval import JudgmentClient
-from judgeval.data import Example, Sequence
-from judgeval.scorers import DerailmentScorer
-
-client = JudgmentClient()
-
-dataset = client.pull_dataset(alias="test", project_name="travel_agent_demo_test")
-
-client.run_sequence_evaluation(
-    sequences=dataset.sequences,
-    model="gpt-4.1",
-    project_name="travel_agent_demo_test",
-    scorers=[DerailmentScorer(threshold=0.5)],
-    log_results=True,
-    override=True,
-)
\ No newline at end of file
diff --git a/src/demo/demo.py b/src/demo/demo.py
deleted file mode 100644
index d8525f77..00000000
--- a/src/demo/demo.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from judgeval import JudgmentClient
-from judgeval.data import Example, Sequence
-from judgeval.scorers import DerailmentScorer
-
-client = JudgmentClient()
-
-airlines_example = Example(
-    input="Which airlines fly to Tokyo?",
-    actual_output="Japan Airlines, All Nippon Airways, and Chinese Airlines offer direct flights."
-)
-weather_example = Example(
-    input="What is the weather like in Japan?",
-    actual_output="It's cloudy with a high of 75°F and a low of 60°F in Japan."
-)
-airline_sequence = Sequence(
-    name="Flight Details",
-    items=[airlines_example, weather_example],
-)
-
-# Level 1: Top-level sequence
-top_example1 = Example(
-    input="I want to plan a trip to Tokyok.",
-    actual_output="That sounds great! When are you planning to go?"
-)
-top_example2 = Example(
-    input="Can you book a flight for me and anything else I need to know?",
-    actual_output="Sure, I'll help you with flights. hotels. and transportation."
-)
-top_level_sequence = Sequence(
-    name="Travel Planning",
-    items=[top_example1, top_example2, airline_sequence],
-)
-
-other_sequence = Sequence(
-    name="Other",
-    items=[Example(
-        input="What is the weather like in Tokyo?",
-        actual_output="It's cloudy with a high of 75°F and a low of 60°F in Tokyo."
-    )]
-)
-
-results = client.run_sequence_evaluation(
-    scorers=[DerailmentScorer(threshold=1)],
-    sequences=[other_sequence],
-    override=True,
-)
\ No newline at end of file
diff --git a/src/demo/sequence_test.py b/src/demo/sequence_test.py
index 62abdb13..343b375c 100644
--- a/src/demo/sequence_test.py
+++ b/src/demo/sequence_test.py
@@ -15,40 +15,50 @@ tracer = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo")
 
-@tracer.observe(span_type="tool")
+# @tracer.observe(span_type="tool")
 def search_tavily(query):
     """Fetch travel data using Tavily API."""
-    API_KEY = os.getenv("TAVILY_API_KEY")
-    client = TavilyClient(api_key=API_KEY)
-    results = client.search(query, num_results=3)
-    return results
+    # API_KEY = os.getenv("TAVILY_API_KEY")
+    # client = TavilyClient(api_key=API_KEY)
+    # results = client.search(query, num_results=3)
+    # return results
+    return "The weather in Tokyo is sunny with a high of 75°F."
 
-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_attractions(destination):
     """Search for top attractions in the destination."""
     prompt = f"Best tourist attractions in {destination}"
     attractions_search = search_tavily(prompt)
     return attractions_search
 
-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_hotels(destination):
     """Search for hotels in the destination."""
     prompt = f"Best hotels in {destination}"
     hotels_search = search_tavily(prompt)
     return hotels_search
 
-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_flights(destination):
     """Search for flights to the destination."""
     prompt = f"Flights to {destination} from major cities"
     flights_search = search_tavily(prompt)
     return flights_search
 
-# @judgment.observe(span_type="tool")
+@tracer.observe(span_type="tool")
 def get_weather(destination, start_date, end_date):
     """Search for weather information."""
     prompt = f"Weather forecast for {destination} from {start_date} to {end_date}"
     weather_search = search_tavily(prompt)
+    example = Example(
+        input="What is the weather in Tokyo?",
+        actual_output=weather_search
+    )
+    tracer.async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
+        example=example,
+        model="gpt-4o-mini",
+    )
     return weather_search
 
 def research_destination(destination, start_date, end_date):
@@ -84,15 +94,15 @@ def create_travel_plan(destination, start_date, end_date, research_data):
     - Weather: {research_data['weather']}
     """
 
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[
-            {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
-            {"role": "user", "content": prompt}
-        ]
-    ).choices[0].message.content
+    # response = client.chat.completions.create(
+    #     model="gpt-4o",
+    #     messages=[
+    #         {"role": "system", "content": "You are an expert travel planner. Combine both historical and current information to create the best possible itinerary."},
+    #         {"role": "user", "content": prompt}
+    #     ]
+    # ).choices[0].message.content
 
-    return response
+    return "Here is travel plan"
 
 @tracer.observe(span_type="function")
 def generate_itinerary(destination, start_date, end_date):
@@ -100,7 +110,6 @@ def generate_itinerary(destination, start_date, end_date):
     research_data = research_destination(destination, start_date, end_date)
     res = create_travel_plan(destination, start_date, end_date, research_data)
 
-from judgeval.data import Sequence
 from judgeval.scorers import ToolOrderScorer
 from judgeval import JudgmentClient
 
@@ -110,27 +119,29 @@ def generate_itinerary(destination, start_date, end_date):
     input={"destination": "Paris", "start_date": "2025-06-01", "end_date": "2025-06-02"},
     expected_tools=[
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_attractions",
             "parameters": {
-                "query": "Best tourist attractions in Paris"
+                "destination": "Paris"
             }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_hotels",
             "parameters": {
-                "query": "Best hotels in Paris"
+                "destination": "Paris"
             }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_flights",
            "parameters": {
-                "query": "Flights to Paris from major cities"
+                "destination": "Paris"
            }
         },
         {
-            "tool_name": "search_tavily",
+            "tool_name": "get_weather",
             "parameters": {
-                "query": "Weather forecast for Paris from 2025-06-01 to 2025-06-02"
+                "destination": "Paris",
+                "start_date": "2025-06-01",
+                "end_date": "2025-06-02"
             }
         }
     ]
@@ -141,11 +152,12 @@ def generate_itinerary(destination, start_date, end_date):
         {"tool_name": "search_tavily", "parameters": {"query": "Best tourist attractions in Tokyo"}},
         {"tool_name": "search_tavily", "parameters": {"query": "Best hotels in Tokyo"}},
         {"tool_name": "search_tavily", "parameters": {"query": "Flights to Tokyo from major cities"}},
-        {"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-02"}}
+        {"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-03"}}
     ]
 )
 
 judgment.assert_test(
+    project_name="travel_agent_demo",
     examples=[example],
     scorers=[ToolOrderScorer(threshold=0.5)],
     model="gpt-4.1-mini",
diff --git a/src/e2etests/test_all_scorers.py b/src/e2etests/test_all_scorers.py
index 9b8347e0..e2003d0e 100644
--- a/src/e2etests/test_all_scorers.py
+++ b/src/e2etests/test_all_scorers.py
@@ -23,7 +23,7 @@
     ClassifierScorer,
 )
 
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 
 
 def test_ac_scorer(client: JudgmentClient):
@@ -533,31 +533,6 @@ def test_execution_order_scorer(client: JudgmentClient):
         override=True
     )
 
-def test_derailment_scorer(client: JudgmentClient):
-    PROJECT_NAME = "test-project"
-    EVAL_RUN_NAME = "test-run-derailment"
-
-    airlines_example = Example(
-        input="Which airlines fly to Paris?",
-        actual_output="Air France, Delta, and American Airlines offer direct flights."
-    )
-    weather_example = Example(
-        input="What is the weather like in Texas?",
-        actual_output="It's sunny with a high of 75°F in Texas."
-    )
-    airline_sequence = Sequence(
-        name="Flight Details",
-        items=[airlines_example, weather_example],
-    )
-    results = client.run_sequence_evaluation(
-        eval_run_name=EVAL_RUN_NAME,
-        project_name=PROJECT_NAME,
-        sequences=[airline_sequence],
-        scorers=[DerailmentScorer(threshold=0.5)],
-        model="gpt-4.1",
-        log_results=True,
-        override=True,
-    )
 def test_json_scorer(client: JudgmentClient):
     """Test JSON scorer functionality."""
     example1 = Example(
diff --git a/src/e2etests/test_dataset_operations.py b/src/e2etests/test_dataset_operations.py
index 19f9735f..a9caae7f 100644
--- a/src/e2etests/test_dataset_operations.py
+++ b/src/e2etests/test_dataset_operations.py
@@ -8,7 +8,7 @@
 import string
 
 from judgeval.judgment_client import JudgmentClient
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 
 @pytest.fixture(scope="module", autouse=True)
 def setup_and_teardown_module(client: JudgmentClient):
@@ -38,26 +38,6 @@ def test_dataset(self, client: JudgmentClient, project_name: str):
 
         client.delete_dataset(alias="test_dataset_5", project_name=project_name)
 
-    def test_dataset_with_sequence(self, client: JudgmentClient, project_name: str):
-        """Test dataset creation and manipulation with a sequence."""
-        dataset = client.create_dataset()
-        examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
-        sequence = Sequence(
-            name="test_sequence",
-            items=examples
-        )
-        dataset.add_sequence(sequence)
-        client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
-        sequence = dataset.sequences[0]
-        assert sequence.name == "test_sequence", "Sequence should have the correct name"
-        assert len(sequence.items) == 3, "Sequence should have 3 items"
-
-        client.delete_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-
     def test_pull_all_project_dataset_stats(self, client: JudgmentClient, project_name: str):
         """Test pulling statistics for all project datasets."""
         dataset = client.create_dataset()
@@ -132,51 +112,6 @@ def test_append_example_dataset(self, client: JudgmentClient, project_name: str)
         dataset = client.pull_dataset(alias="test_dataset_8", project_name=project_name)
         assert dataset, "Failed to pull dataset"
         assert len(dataset.examples) == 3, "Dataset should have 3 examples"
-
-    def test_append_sequence_dataset(self, client: JudgmentClient, project_name: str):
-        """Test dataset appending."""
-        dataset = client.create_dataset()
-        examples = [Example(input="input 1", actual_output="output 1"), Example(input="input 2", actual_output="output 2"), Example(input="input 3", actual_output="output 3")]
-        sequence = Sequence(
-            name="test_sequence",
-            items=examples
-        )
-        dataset.add_sequence(sequence)
-        client.push_dataset(alias="test_dataset_with_sequence", dataset=dataset, project_name=project_name, overwrite=True)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 1, "Dataset should have 1 sequence"
-        sequence = dataset.sequences[0]
-        assert sequence.name == "test_sequence", "Sequence should have the correct name"
-        assert len(sequence.items) == 3, "Sequence should have 3 items"
-        examples2 = [Example(input="input 4", actual_output="output 4"), Example(input="input 5", actual_output="output 5")]
-        sequence2 = Sequence(
-            name="test_sequence2",
-            items=examples2
-        )
-
-        client.append_sequence_dataset(alias="test_dataset_with_sequence", sequences=[sequence2], project_name=project_name)
-
-        dataset = client.pull_dataset(alias="test_dataset_with_sequence", project_name=project_name)
-        assert dataset.sequences, "Failed to pull dataset"
-        assert len(dataset.sequences) == 2, "Dataset should have 2 sequences"
-
-        test_sequence = None
-        test_sequence2 = None
-        for seq in dataset.sequences:
-            if seq.name == "test_sequence":
-                test_sequence = seq
-            elif seq.name == "test_sequence2":
-                test_sequence2 = seq
-
-        # Verify first sequence
-        assert test_sequence is not None, "Could not find 'test_sequence'"
-        assert len(test_sequence.items) == 3, "Sequence 'test_sequence' should have 3 items"
-
-        # Verify second sequence
-        assert test_sequence2 is not None, "Could not find 'test_sequence2'"
-        assert len(test_sequence2.items) == 2, "Sequence 'test_sequence2' should have 2 items"
 
     def test_export_jsonl(self, client: JudgmentClient, random_name: str, project_name: str):
         """Test JSONL dataset export functionality."""
diff --git a/src/judgeval/common/tracer.py b/src/judgeval/common/tracer.py
index b869e311..5984a2cb 100644
--- a/src/judgeval/common/tracer.py
+++ b/src/judgeval/common/tracer.py
@@ -146,7 +146,7 @@ def fetch_trace(self, trace_id: str):
 
         return response.json()
 
-    def save_trace(self, trace_data: dict):
+    def save_trace(self, trace_data: dict, offline_mode: bool = False):
         """
         Saves a trace to the Judgment Supabase and optionally to S3 if configured.
 
@@ -183,7 +183,7 @@ def save_trace(self, trace_data: dict):
             except Exception as e:
                 warnings.warn(f"Failed to save trace to S3: {str(e)}")
 
-        if "ui_results_url" in response.json():
+        if not offline_mode and "ui_results_url" in response.json():
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
 
@@ -660,11 +660,12 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
             "entries": [span.model_dump() for span in self.trace_spans],
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "overwrite": overwrite,
+            "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
         }
         # --- Log trace data before saving ---
-        self.trace_manager_client.save_trace(trace_data)
+        self.trace_manager_client.save_trace(trace_data, offline_mode=self.tracer.offline_mode)
 
         # upload annotations
         # TODO: batch to the log endpoint
@@ -930,6 +931,7 @@ def __init__(
         s3_aws_access_key_id: Optional[str] = None,
         s3_aws_secret_access_key: Optional[str] = None,
         s3_region_name: Optional[str] = None,
+        offline_mode: bool = False,
         deep_tracing: bool = True  # Deep tracing is enabled by default
         ):
         if not hasattr(self, 'initialized'):
@@ -970,6 +972,7 @@ def __init__(
                     aws_secret_access_key=s3_aws_secret_access_key,
                     region_name=s3_region_name
                 )
+            self.offline_mode: bool = offline_mode
             self.deep_tracing: bool = deep_tracing  # NEW: Store deep tracing setting
 
         elif hasattr(self, 'project_name') and self.project_name != project_name:
diff --git a/src/judgeval/constants.py b/src/judgeval/constants.py
index 2d8d27ca..fab54477 100644
--- a/src/judgeval/constants.py
+++ b/src/judgeval/constants.py
@@ -40,17 +40,15 @@ def _missing_(cls, value):
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
+JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
diff --git a/src/judgeval/data/__init__.py b/src/judgeval/data/__init__.py
index b1baa68a..3baffe3e 100644
--- a/src/judgeval/data/__init__.py
+++ b/src/judgeval/data/__init__.py
@@ -2,7 +2,6 @@
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.sequence import Sequence
 from judgeval.data.trace import Trace, TraceSpan
 
 
@@ -14,7 +13,6 @@
     "create_scorer_data",
     "ScoringResult",
    "generate_scoring_result",
-    "Sequence",
     "Trace",
     "TraceSpan",
 ]
diff --git a/src/judgeval/data/datasets/dataset.py b/src/judgeval/data/datasets/dataset.py
index fd7e49c5..9759ac17 100644
--- a/src/judgeval/data/datasets/dataset.py
+++ b/src/judgeval/data/datasets/dataset.py
@@ -7,13 +7,12 @@
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
 class EvalDataset:
     examples: List[Example]
-    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -22,13 +21,11 @@ def __init__(self,
                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
                  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
                  examples: List[Example] = [],
-                 sequences: List[Sequence] = []
                  ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
-        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -223,10 +220,7 @@ def add_from_yaml(self, file_path: str) -> None:
 
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
-
-    def add_sequence(self, s: Sequence) -> None:
-        self.sequences = self.sequences + [s]
-
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -313,7 +307,6 @@ def __str__(self):
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
-            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
diff --git a/src/judgeval/data/datasets/eval_dataset_client.py b/src/judgeval/data/datasets/eval_dataset_client.py
index 73d2e01b..a84eae9e 100644
--- a/src/judgeval/data/datasets/eval_dataset_client.py
+++ b/src/judgeval/data/datasets/eval_dataset_client.py
@@ -7,14 +7,13 @@
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset
 
 
@@ -59,8 +58,6 @@ def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: O
                 "dataset_alias": alias,
                 "project_name": project_name,
                 "examples": [e.to_dict() for e in dataset.examples],
-                "sequences": [s.model_dump() for s in dataset.sequences],
-                "is_sequence": len(dataset.sequences) > 0,
                 "overwrite": overwrite,
             }
             try:
@@ -151,63 +148,6 @@ def append_examples(self, alias: str, examples: List[Example], project_name: str
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
             return True
-
-    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
-        """
-        Appends the dataset to Judgment platform
-
-        Mock request:
-        dataset = {
-            "alias": alias,
-            "examples": [...],
-            "project_name": project_name
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..."  # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "sequences": [s.model_dump() for s in sequences],
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id
-                    },
-                    verify=True
-                )
-                if response.status_code != 200:
-                    error(f"Server error during append: {response.json()}")
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during append: {err.response.json()}")
-                else:
-                    error(f"HTTP error during append: {err}")
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
 
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
@@ -262,7 +202,6 @@ def pull(self, alias: str, project_name: str) -> EvalDataset:
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
diff --git a/src/judgeval/data/example.py b/src/judgeval/data/example.py
index 373e1946..d7dd6e7e 100644
--- a/src/judgeval/data/example.py
+++ b/src/judgeval/data/example.py
@@ -37,7 +37,6 @@ class Example(BaseModel):
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-    sequence_order: Optional[int] = 0
 
     def __init__(self, **data):
         if 'example_id' not in data:
diff --git a/src/judgeval/data/result.py b/src/judgeval/data/result.py
index 9a681b38..01b0504a 100644
--- a/src/judgeval/data/result.py
+++ b/src/judgeval/data/result.py
@@ -3,7 +3,7 @@
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
-from judgeval.data.sequence import Sequence
+from judgeval.data.trace import TraceSpan
 
 
 class ScoringResult(BaseModel):
@@ -24,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[Sequence, CustomExample, Example]] = None
+    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -49,7 +49,7 @@ def __str__(self) -> str:
 
 
 def generate_scoring_result(
-    data_object: Union[Example, Sequence],
+    data_object: Union[Example, TraceSpan],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
diff --git a/src/judgeval/data/sequence.py b/src/judgeval/data/sequence.py
deleted file mode 100644
index d9a691dd..00000000
--- a/src/judgeval/data/sequence.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from pydantic import BaseModel, Field, field_validator, model_validator
-from typing import List, Optional, Union, Any, Dict
-from judgeval.data.example import Example
-from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
-from uuid import uuid4
-from datetime import datetime, timezone
-
-class Sequence(BaseModel):
-    """
-    A sequence is a list of either Examples or nested Sequence objects.
-    """
-    sequence_id: str = Field(default_factory=lambda: str(uuid4()))
-    name: Optional[str] = "Sequence"
-    created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
-    items: List[Union["Sequence", Example]] = []
-    scorers: Optional[Any] = None
-    parent_sequence_id: Optional[str] = None
-    sequence_order: Optional[int] = 0
-    root_sequence_id: Optional[str] = None
-    inputs: Optional[Dict[str, Any]] = None
-    output: Optional[Any] = None
-    expected_tools: Optional[List[Dict[str, Any]]] = None
-
-    @field_validator("scorers")
-    def validate_scorer(cls, v):
-        for scorer in v or []:
-            if not isinstance(scorer, APIJudgmentScorer) and not isinstance(scorer, JudgevalScorer):
-                raise ValueError(f"Invalid scorer type: {type(scorer)}")
-        return v
-
-    @model_validator(mode="after")
-    def populate_sequence_metadata(self) -> "Sequence":
-        """Recursively set parent_sequence_id, root_sequence_id, and sequence_order."""
-        # If root_sequence_id isn't already set, assign it to self
-        if self.root_sequence_id is None:
-            self.root_sequence_id = self.sequence_id
-
-        for idx, item in enumerate(self.items):
-            item.sequence_order = idx
-            if isinstance(item, Sequence):
-                item.parent_sequence_id = self.sequence_id
-                item.root_sequence_id = self.root_sequence_id
-                item.populate_sequence_metadata()
-        return self
-
-    class Config:
-        arbitrary_types_allowed = True
-
-# Update forward references so that "Sequence" inside items is resolved.
-Sequence.model_rebuild()
diff --git a/src/judgeval/data/trace.py b/src/judgeval/data/trace.py
index 72d02111..6e44298d 100644
--- a/src/judgeval/data/trace.py
+++ b/src/judgeval/data/trace.py
@@ -9,7 +9,7 @@ class TraceSpan(BaseModel):
     trace_id: str
     function: Optional[str] = None
     depth: int
-    created_at: Optional[float] = None
+    created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
@@ -17,6 +17,8 @@ class TraceSpan(BaseModel):
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
+    expected_tools: Optional[List[Dict[str, Any]]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
 
     def model_dump(self, **kwargs):
         return {
@@ -124,6 +126,7 @@ class Trace(BaseModel):
     duration: float
     entries: List[TraceSpan]
     overwrite: bool = False
+    offline_mode: bool = False
     rules: Optional[Dict[str, Any]] = None
     has_notification: Optional[bool] = False
\ No newline at end of file
diff --git a/src/judgeval/data/sequence_run.py b/src/judgeval/data/trace_run.py
similarity index 92%
rename from src/judgeval/data/sequence_run.py
rename to src/judgeval/data/trace_run.py
index 01b11742..f0428361 100644
--- a/src/judgeval/data/sequence_run.py
+++ b/src/judgeval/data/trace_run.py
@@ -1,20 +1,20 @@
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
-from judgeval.data import Sequence
+from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule
 
-class SequenceRun(BaseModel):
+class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
 
     Args:
        project_name (str): The name of the project the evaluation results belong to
        eval_name (str): A name for this evaluation run
-       sequences (List[Sequence]): The sequences to evaluate
+       traces (List[Trace]): The traces to evaluate
        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
        model (str): The model used as a judge when using LLM as a Judge
        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    sequences: Optional[List[Sequence]] = None
+    traces: Optional[List[Trace]] = None
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
diff --git a/src/judgeval/evaluation_run.py b/src/judgeval/evaluation_run.py
index c2a852aa..1f126ab2 100644
--- a/src/judgeval/evaluation_run.py
+++ b/src/judgeval/evaluation_run.py
@@ -79,7 +79,7 @@ def validate_eval_name(cls, v, values):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples', mode='before')
+    @field_validator('examples')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
diff --git a/src/judgeval/judgment_client.py b/src/judgeval/judgment_client.py
index 0dd630b5..e5d0ce29 100644
--- a/src/judgeval/judgment_client.py
+++ b/src/judgeval/judgment_client.py
@@ -12,7 +12,7 @@
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
+    Trace,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -23,9 +23,9 @@
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    run_sequence_eval
+    run_trace_eval
 )
-from judgeval.data.sequence_run import SequenceRun
+from judgeval.data.trace_run import TraceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -105,16 +105,16 @@ def a_run_evaluation(
             rules=rules
         )
 
-    def run_sequence_evaluation(
+    def run_trace_evaluation(
        self,
        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-       sequences: Optional[List[Sequence]] = None,
+       traces: Optional[List[Trace]] = None,
        examples: Optional[List[Example]] = None,
        test_file: Optional[str] = None,
        aggregator: Optional[str] = None,
        project_name: str = "default_project",
-       eval_run_name: str = "default_eval_sequence",
+       eval_run_name: str = "default_eval_trace",
        log_results: bool = True,
        append: bool = False,
        override: bool = False,
@@ -134,16 +134,16 @@
            if examples and not function:
                raise ValueError("Cannot pass in examples without a function")

-           if sequences and function:
-               raise ValueError("Cannot pass in sequences and function")
+           if traces and function:
+               raise ValueError("Cannot pass in traces and function")

-           if examples and sequences:
-               raise ValueError("Cannot pass in both examples and sequences")
+           if examples and traces:
+               raise ValueError("Cannot pass in both examples and traces")

-           sequence_run = SequenceRun(
+           trace_run = TraceRun(
                project_name=project_name,
                eval_name=eval_run_name,
-               sequences=sequences,
+               traces=traces,
                scorers=scorers,
                model=model,
                aggregator=aggregator,
@@ -152,9 +152,9 @@
                judgment_api_key=self.judgment_api_key,
                organization_id=self.organization_id,
            )
-           return run_sequence_eval(sequence_run, override, ignore_errors, function, tracer, examples)
+           return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
        except ValueError as e:
-           raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
+           raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}")
        except Exception as e:
            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
@@ -245,12 +245,6 @@ def append_example_dataset(self, alias: str, examples: List[Example], project_na
         """
         return self.eval_dataset_client.append_examples(alias, examples, project_name)
-
-    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        """
-        Appends a `Sequence` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
-
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -523,7 +517,7 @@ def assert_test(
             raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
 
         if function:
-            results = self.run_sequence_evaluation(
+            results = self.run_trace_evaluation(
                 examples=examples,
                 scorers=scorers,
                 model=model,
diff --git a/src/judgeval/run_evaluation.py b/src/judgeval/run_evaluation.py
index 79927214..4732767c 100644
--- a/src/judgeval/run_evaluation.py
+++ b/src/judgeval/run_evaluation.py
@@ -13,7 +13,6 @@
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
     Trace
 )
 from judgeval.scorers import (
@@ -25,11 +24,10 @@
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
-    JUDGMENT_SEQUENCE_EVAL_API_URL,
+    JUDGMENT_TRACE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
-    JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -39,7 +37,7 @@
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.sequence_run import SequenceRun
+from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
 from langchain_core.callbacks import BaseCallbackHandler
 
@@ -98,20 +96,20 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
-def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
     """
     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
     """
 
     try:
         # submit API request to execute evals
-        payload = sequence_run.model_dump(warnings=False)
+        payload = trace_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_SEQUENCE_EVAL_API_URL,
+            JUDGMENT_TRACE_EVAL_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {sequence_run.judgment_api_key}",
-                "X-Organization-Id": sequence_run.organization_id
+                "Authorization": f"Bearer {trace_run.judgment_api_key}",
+                "X-Organization-Id": trace_run.organization_id
             },
             json=payload,
             verify=True
@@ -282,7 +280,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -327,51 +325,6 @@ def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[Eval
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
-def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
-    """
-    Retrieves a sequence from a trace ID.
-    """
-    """
-    Logs evaluation results to the Judgment API database.
-
-    Args:
-        merged_results (List[ScoringResult]): The results to log
-        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
-
-    Raises:
-        JudgmentAPIError: If there's an API error during logging
-        ValueError: If there's a validation error with the results
-    """
-    try:
-        res = requests.post(
-            JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id
-            },
-            json={
-                "trace_id": trace_id,
-                "trace_span_id": parent_span,
-            },
-            verify=True
-        )
-
-        if not res.ok:
-            response_data = res.json()
-            error_message = response_data.get('detail', 'An unknown error occurred.')
-            error(f"Error {res.status_code}: {error_message}")
-            raise JudgmentAPIError(error_message)
-
-        return Sequence(**res.json())
-    except requests.exceptions.RequestException as e:
-        error(f"Request failed while saving evaluation results to DB: {str(e)}")
-        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-    except Exception as e:
-        error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
     """Run a function with a spinner in the terminal."""
     spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -415,62 +368,59 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
             if missing_params:
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
+def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and sequence_run.log_results and not sequence_run.append:
+    if not override and trace_run.log_results and not trace_run.append:
         check_eval_run_name_exists(
-            sequence_run.eval_name,
-            sequence_run.project_name,
-            sequence_run.judgment_api_key,
-            sequence_run.organization_id
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id
         )
 
-    if sequence_run.append:
+    if trace_run.append:
         # Check that the current experiment, if one exists, has the same type (examples of sequences)
         check_experiment_type(
-            sequence_run.eval_name,
-            sequence_run.project_name,
-            sequence_run.judgment_api_key,
-            sequence_run.organization_id,
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id,
             True
         )
 
     if function and tracer:
-        new_sequences: List[Sequence] = []
+        new_traces: List[Trace] = []
+        tracer.offline_mode = True
         for example in examples:
             if example.input:
                 result = run_with_spinner("Running agent function: ", function, **example.input)
             else:
                 result = run_with_spinner("Running agent function: ", function)
        for i, trace in enumerate(tracer.traces):
-            trace_id = trace['trace_id']
-            parent_span = trace['entries'][0]['span_id']
-            new_sequence = retrieve_sequence_from_trace(trace_id, parent_span, sequence_run.judgment_api_key, sequence_run.organization_id)
-            new_sequence.expected_tools = examples[i].expected_tools
-            new_sequences.append(new_sequence)
-        sequence_run.sequences = new_sequences
-
-        for sequence in sequence_run.sequences:
-            sequence.scorers = sequence_run.scorers
+            # We set the root-level trace span with the expected tools of the Trace
+            trace = Trace(**trace)
+            trace.entries[0].expected_tools = examples[i].expected_tools
+            new_traces.append(trace)
+        trace_run.traces = new_traces
 
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try:  # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
+        response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
         scoring_results = [ScoringResult(**result) for result in response_data["results"]]
         info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
        error(f"An error occurred while executing the Judgment API request: {str(e)}")
        raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
     except ValueError as e:
-        raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+        raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: {str(e)}")
 
     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    # TODO: allow for custom scorer on sequences
-    if sequence_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
+    # TODO: allow for custom scorer on traces
+    if trace_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
         rprint(pretty_str)
 
     return scoring_results
diff --git a/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py b/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
index c4130842..df92966a 100644
--- a/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
+++ b/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
@@ -5,13 +5,15 @@
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from typing import Optional, Dict
 
 class ToolOrderScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float=1.0):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, exact_match: bool=False):
         super().__init__(
             threshold=threshold,
             score_type=APIScorer.TOOL_ORDER,
         )
+        self.kwargs = {"exact_match": exact_match}
 
     @property
     def __name__(self):