diff --git a/Pipfile b/Pipfile index 7a1a9f02..f5fd917a 100644 --- a/Pipfile +++ b/Pipfile @@ -16,6 +16,7 @@ pandas = "*" openai = "*" together = "*" anthropic = "*" +patronus = "*" [dev-packages] pytest = "*" diff --git a/demo/test_competitors.py b/demo/test_competitors.py new file mode 100644 index 00000000..423906ce --- /dev/null +++ b/demo/test_competitors.py @@ -0,0 +1,96 @@ +from dotenv import load_dotenv +from patronus import Client +import os +import asyncio +import time +from openai import OpenAI +from anthropic import Anthropic + +load_dotenv() + +PATRONUS_API_KEY = os.getenv("PATRONUS_API_KEY") + +client = Client(api_key=PATRONUS_API_KEY) + +# Initialize clients +openai_client = OpenAI() +anthropic_client = Anthropic() + +async def make_upper(input: str) -> str: + output = input.upper() + result = client.evaluate( + evaluator="answer-relevance", + criteria="patronus:answer-relevance", + evaluated_model_input=input, + evaluated_model_output=output, + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + return output + +def llm_call(input): + time.sleep(1.3) + return "We have a 30 day full refund policy on shoes." + +async def answer_user_question(input): + output = llm_call(input) + result = client.evaluate( + evaluator="answer-relevance", + criteria="patronus:answer-relevance", + evaluated_model_input=input, + evaluated_model_output=output, + evaluated_model_retrieved_context=["All customers are eligible for a 30 day full refund at no extra cost."], + expected_output="We offer a 30-day full refund at no extra cost.", + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + return output + +async def make_poem(input: str) -> str: + try: + # Using Anthropic API + anthropic_response = anthropic_client.messages.create( + model="claude-3-sonnet-20240229", + messages=[{"role": "user", "content": input}], + max_tokens=30 + ) + anthropic_result = anthropic_response.content[0].text + + result = client.evaluate( + evaluator="answer-relevance", + criteria="patronus:answer-relevance", + evaluated_model_input=input, + evaluated_model_output=anthropic_result, + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + + # Using OpenAI API + openai_response = openai_client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": "Make a short sentence with the input."}, + {"role": "user", "content": input} + ] + ) + openai_result = openai_response.choices[0].message.content + + return f"{anthropic_result} {openai_result}".lower() + + except Exception as e: + print(f"Error generating poem: {e}") + return "" + +async def test_evaluation_mixed(input): + upper = await make_upper(input) + result = await make_poem(upper) + await answer_user_question("What if these shoes don't fit?") + return result + +if __name__ == "__main__": + test_input = "Write a poem about Nissan R32 GTR" + asyncio.run(test_evaluation_mixed(test_input)) + diff --git a/docs/notebooks/prompt_scorer.ipynb b/docs/notebooks/prompt_scorer.ipynb index efe0323c..fb3f0223 100644 --- a/docs/notebooks/prompt_scorer.ipynb +++ b/docs/notebooks/prompt_scorer.ipynb @@ -157,7 +157,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/e2etests/judgment_client_test.py b/e2etests/judgment_client_test.py index 4eb7484b..d3394f94 100644 --- a/e2etests/judgment_client_test.py +++ b/e2etests/judgment_client_test.py @@ -16,6 +16,8 @@ import random import string +from 
judgeval.scorers.prompt_scorer import ClassifierScorer + load_dotenv() def get_client(): @@ -35,36 +37,32 @@ def test_dataset(client: JudgmentClient): print(dataset) def test_run_eval(client: JudgmentClient): + # Single step in our workflow, an outreach Sales Agent example1 = Example( - input="What if these shoes don't fit?", - actual_output="We offer a 30-day full refund at no extra cost.", - retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."], - trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6" + input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.", + actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex", + retrieval_context=["TechCorp launched AI analytics platform in 2024", "Sarah Chen is CEO, ex-Google executive", "Current client base: 50+ enterprise customers"], ) example2 = Example( - input="How do I reset my password?", - actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", - expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", - name="Password Reset", - context=["User Account"], - retrieval_context=["Password reset instructions"], - tools_called=["authentication"], - expected_tools=["authentication"], - additional_metadata={"difficulty": "medium"} + input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.", + actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! 
Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex", + expected_output="A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans", + context=["Business Development"], + retrieval_context=["GreenEnergy Solutions won 2023 sustainability award", "New solar technology 30% more efficient", "Planning European market expansion"], ) scorer = FaithfulnessScorer(threshold=0.5) scorer2 = HallucinationScorer(threshold=0.5) c_scorer = CustomFaithfulnessMetric(threshold=0.6) - PROJECT_NAME = "test_project_JOSEPH" - EVAL_RUN_NAME = "yomadude" + PROJECT_NAME = "OutreachWorkflow" + EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt" - _ = client.run_evaluation( + client.run_evaluation( examples=[example1, example2], - scorers=[scorer, c_scorer], + scorers=[scorer, scorer2], model="QWEN", metadata={"batch": "test"}, project_name=PROJECT_NAME, @@ -146,8 +144,6 @@ def test_override_eval(client: JudgmentClient): if "already exists" not in str(e): raise print(f"Successfully caught expected error: {e}") - - def test_evaluate_dataset(client: JudgmentClient): @@ -194,8 +190,10 @@ def test_classifier_scorer(client: JudgmentClient): examples=[example1], scorers=[classifier_scorer, faithfulness_scorer], model="QWEN", + log_results=True, + eval_run_name="ToneScorerTest", + project_name="ToneScorerTest", ) - print(res) if __name__ == "__main__": # Test client functionality @@ -204,29 +202,29 @@ def test_classifier_scorer(client: JudgmentClient): print("Client initialized successfully") print("*" * 40) - print("Testing dataset creation, pushing, and pulling") - test_dataset(ui_client) - print("Dataset creation, pushing, and pulling successful") - print("*" * 40) + # print("Testing dataset creation, pushing, and pulling") + # test_dataset(ui_client) + # print("Dataset creation, pushing, and pulling successful") + # print("*" * 40) print("Testing evaluation run") test_run_eval(ui_client) print("Evaluation run successful") print("*" * 40) - print("Testing evaluation run override") - test_override_eval(client) - print("Evaluation run override successful") - print("*" * 40) + # print("Testing evaluation run override") + # test_override_eval(client) + # print("Evaluation run override successful") + # print("*" * 40) - print("Testing dataset evaluation") - test_evaluate_dataset(ui_client) - print("Dataset evaluation successful") - print("*" * 40) + # print("Testing dataset evaluation") + # test_evaluate_dataset(ui_client) + # print("Dataset evaluation successful") + # print("*" * 40) - print("Testing classifier scorer") - test_classifier_scorer(ui_client) - print("Classifier scorer test successful") - print("*" * 40) + # print("Testing classifier scorer") + # test_classifier_scorer(ui_client) + # print("Classifier scorer test successful") + # print("*" * 40) print("All tests passed successfully") diff --git a/e2etests/test_prompt_scoring.py b/e2etests/test_prompt_scoring.py index 51f8c9c3..ac535d76 100644 --- a/e2etests/test_prompt_scoring.py +++ b/e2etests/test_prompt_scoring.py @@ -36,7 +36,7 @@ def __init__( ) self.score = 0.0 - def build_measure_prompt(self, example: Example): + def _build_measure_prompt(self, example: Example): SYSTEM_ROLE = ( 'You are a great judge of emotional intelligence. You understand the feelings ' 'and intentions of others. 
You will be tasked with judging whether the following ' @@ -51,16 +51,16 @@ def build_measure_prompt(self, example: Example): ] return conversation - def build_schema(self): + def _build_schema(self): return { "score": int, "reason": str } - def process_response(self, response): + def _process_response(self, response): return response["score"], response["reason"] - def success_check(self): + def _success_check(self): POSITIVITY_THRESHOLD = 3 # we want all model responses to be somewhat positive in tone return self.score <= POSITIVITY_THRESHOLD diff --git a/e2etests/test_tracer.py b/e2etests/test_tracer.py index 2f97280c..d2262377 100644 --- a/e2etests/test_tracer.py +++ b/e2etests/test_tracer.py @@ -14,11 +14,11 @@ from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer # Initialize the tracer and clients -judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY")) +judgment = Tracer(api_key=os.getenv("UI_JUDGMENT_API_KEY")) openai_client = wrap(OpenAI()) anthropic_client = wrap(Anthropic()) -@judgment.observe +@judgment.observe(span_type="tool") async def make_upper(input: str) -> str: """Convert input to uppercase and evaluate using judgment API. @@ -28,6 +28,7 @@ async def make_upper(input: str) -> str: The uppercase version of the input string """ output = input.upper() + await judgment.get_current_trace().async_evaluate( scorers=[FaithfulnessScorer(threshold=0.5)], input="What if these shoes don't fit?", @@ -38,9 +39,10 @@ async def make_upper(input: str) -> str: model="gpt-4o-mini", log_results=True ) + return output -@judgment.observe +@judgment.observe(span_type="tool") async def make_lower(input): output = input.lower() @@ -59,11 +61,12 @@ async def make_lower(input): ) return output -@judgment.observe +@judgment.observe(span_type="llm") def llm_call(input): + time.sleep(1.3) return "We have a 30 day full refund policy on shoes." -@judgment.observe +@judgment.observe(span_type="tool") async def answer_user_question(input): output = llm_call(input) await judgment.get_current_trace().async_evaluate( @@ -77,7 +80,7 @@ async def answer_user_question(input): ) return output -@judgment.observe +@judgment.observe(span_type="tool") async def make_poem(input: str) -> str: """Generate a poem using both Anthropic and OpenAI APIs. 
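Taken together, the e2etests/test_tracer.py changes reduce to the pattern sketched below: decorate a function with a typed span, attach an online evaluation to the current trace from inside it, and run everything in a named, project-scoped trace. This sketch is not part of the patch; the import path for Tracer, the exact keyword set of async_evaluate, and all string values are assumptions drawn from the calls visible in this file.

import asyncio
import os

from judgeval.common.tracer import Tracer  # import path assumed from this repo's layout
from judgeval.scorers import FaithfulnessScorer

judgment = Tracer(api_key=os.getenv("UI_JUDGMENT_API_KEY"))

@judgment.observe(span_type="tool")
async def answer_question(question: str) -> str:
    output = "We have a 30 day full refund policy on shoes."  # stand-in for a real LLM call
    # Attach an online evaluation to the span opened by @observe
    await judgment.get_current_trace().async_evaluate(
        scorers=[FaithfulnessScorer(threshold=0.5)],
        input=question,
        actual_output=output,
        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
        model="gpt-4o-mini",
        log_results=True,
    )
    return output

async def main():
    # Traces are now grouped under a project and can overwrite a same-named trace
    with judgment.trace("refund-demo", project_name="NewPoemBot", overwrite=True):
        await answer_question("What if these shoes don't fit?")

if __name__ == "__main__":
    asyncio.run(main())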
@@ -95,6 +98,15 @@ async def make_poem(input: str) -> str: ) anthropic_result = anthropic_response.content[0].text + await judgment.get_current_trace().async_evaluate( + input=input, + actual_output=anthropic_result, + score_type=APIScorer.ANSWER_RELEVANCY, + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + # Using OpenAI API openai_response = openai_client.chat.completions.create( model="gpt-4o-mini", @@ -112,7 +124,8 @@ async def make_poem(input: str) -> str: return "" async def test_evaluation_mixed(input): - with judgment.trace("test_evaluation") as trace: + PROJECT_NAME = "NewPoemBot" + with judgment.trace("Use-claude", project_name=PROJECT_NAME, overwrite=True) as trace: upper = await make_upper(input) result = await make_poem(upper) await answer_user_question("What if these shoes don't fit?") diff --git a/judgeval/common/tracer.py b/judgeval/common/tracer.py index bc6bf071..34b135cd 100644 --- a/judgeval/common/tracer.py +++ b/judgeval/common/tracer.py @@ -28,6 +28,7 @@ import json import warnings from pydantic import BaseModel +from http import HTTPStatus from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL from judgeval.judgment_client import JudgmentClient @@ -38,7 +39,7 @@ # Define type aliases for better code readability and maintainability ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types - +SpanType = Literal['span', 'tool', 'llm', 'evaluation'] @dataclass class TraceEntry: """Represents a single trace entry with its visual representation. @@ -58,7 +59,8 @@ class TraceEntry: duration: Optional[float] = None # Time taken (for exit/evaluation entries) output: Any = None # Function output value # Use field() for mutable defaults to avoid shared state issues - inputs: dict = field(default_factory=dict) + inputs: dict = field(default_factory=dict) + span_type: SpanType = "span" evaluation_result: Optional[List[ScoringResult]] = field(default=None) def print_entry(self): @@ -93,7 +95,8 @@ def to_dict(self) -> dict: "duration": self.duration, "output": output, "inputs": self.inputs or None, # Convert empty dict to None - "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None + "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None, + "span_type": self.span_type } def _serialize_output(self) -> Any: @@ -112,17 +115,19 @@ def _serialize_output(self) -> Any: class TraceClient: """Client for managing a single trace context""" - def __init__(self, tracer, trace_id: str, name: str): + def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project"): self.tracer = tracer self.trace_id = trace_id self.name = name + self.project_name = project_name self.client: JudgmentClient = tracer.client self.entries: List[TraceEntry] = [] self.start_time = time.time() - self._current_span = None + self.span_type = None + self._current_span: Optional[TraceEntry] = None @contextmanager - def span(self, name: str): + def span(self, name: str, span_type: SpanType = "span"): """Context manager for creating a trace span""" start_time = time.time() @@ -132,7 +137,8 @@ def span(self, name: str): function=name, depth=self.tracer.depth, message=name, - timestamp=start_time + timestamp=start_time, + span_type=span_type )) self.tracer.depth += 1 @@ -152,7 +158,8 @@ def span(self, name: str): depth=self.tracer.depth, 
message=f"← {name}", timestamp=time.time(), - duration=duration + duration=duration, + span_type=span_type )) self._current_span = prev_span @@ -199,6 +206,7 @@ def record_evaluation(self, results: List[ScoringResult], start_time: float): """Record evaluation results for the current span""" if self._current_span: duration = time.time() - start_time # Calculate duration from start_time + self.add_entry(TraceEntry( type="evaluation", function=self._current_span, @@ -206,7 +214,8 @@ def record_evaluation(self, results: List[ScoringResult], start_time: float): message=f"Evaluation results for {self._current_span}", timestamp=time.time(), evaluation_result=results, - duration=duration + duration=duration, + span_type="evaluation" )) def record_input(self, inputs: dict): @@ -218,7 +227,8 @@ def record_input(self, inputs: dict): depth=self.tracer.depth, message=f"Inputs to {self._current_span}", timestamp=time.time(), - inputs=inputs + inputs=inputs, + span_type=self.span_type )) async def _update_coroutine_output(self, entry: TraceEntry, coroutine: Any): @@ -240,7 +250,8 @@ def record_output(self, output: Any): depth=self.tracer.depth, message=f"Output from {self._current_span}", timestamp=time.time(), - output="" if inspect.iscoroutine(output) else output + output="" if inspect.iscoroutine(output) else output, + span_type=self.span_type ) self.add_entry(entry) @@ -266,45 +277,39 @@ def get_duration(self) -> float: def condense_trace(self, entries: List[dict]) -> List[dict]: """ - Condenses trace entries into a single entry for each function. - - Groups entries by function call and combines them into a single entry with: - - depth: deepest depth for this function call - - duration: time from first to last timestamp - - function: function name - - inputs: non-None inputs - - output: non-None outputs - - evaluation_result: evaluation results - - timestamp: first timestamp of the function call + Condenses trace entries into a single entry for each function call. 
""" condensed = [] - current_func = None - current_entry = None + active_functions = [] # Stack to track nested function calls + function_entries = {} # Store entries for each function for entry in entries: + function = entry["function"] + if entry["type"] == "enter": - # Start of new function call - current_func = entry["function"] - current_entry = { + # Initialize new function entry + function_entries[function] = { "depth": entry["depth"], - "function": entry["function"], + "function": function, "timestamp": entry["timestamp"], "inputs": None, "output": None, - "evaluation_result": None + "evaluation_result": None, + "span_type": entry.get("span_type", "span") } - - elif entry["type"] == "exit" and entry["function"] == current_func: - # End of current function + active_functions.append(function) + + elif entry["type"] == "exit" and function in active_functions: + # Complete function entry + current_entry = function_entries[function] current_entry["duration"] = entry["timestamp"] - current_entry["timestamp"] condensed.append(current_entry) - current_func = None - current_entry = None - - elif current_func and entry["function"] == current_func: - # Additional entries for current function - if entry["depth"] > current_entry["depth"]: - current_entry["depth"] = entry["depth"] + active_functions.remove(function) + del function_entries[function] + + elif function in active_functions: + # Update existing function entry with additional data + current_entry = function_entries[function] if entry["type"] == "input" and entry["inputs"]: current_entry["inputs"] = entry["inputs"] @@ -315,9 +320,11 @@ def condense_trace(self, entries: List[dict]) -> List[dict]: if entry["type"] == "evaluation" and entry["evaluation_result"]: current_entry["evaluation_result"] = entry["evaluation_result"] + # Sort by timestamp + condensed.sort(key=lambda x: x["timestamp"]) return condensed - def save(self) -> Tuple[str, dict]: + def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]: """ Save the current trace to the database. Returns a tuple of (trace_id, trace_data) where trace_data is the trace data that was saved. 
@@ -333,6 +340,7 @@ def save(self) -> Tuple[str, dict]: "trace_id": self.trace_id, "api_key": self.tracer.api_key, "name": self.name, + "project_name": self.project_name, "created_at": datetime.fromtimestamp(self.start_time).isoformat(), "duration": total_duration, "token_counts": { @@ -340,7 +348,9 @@ def save(self) -> Tuple[str, dict]: "completion_tokens": 0, # Dummy value "total_tokens": 0, # Dummy value }, # TODO: Add token counts - "entries": condensed_entries + "entries": condensed_entries, + "empty_save": empty_save, + "overwrite": overwrite } # Save trace data by making POST request to API @@ -351,7 +361,11 @@ def save(self) -> Tuple[str, dict]: "Content-Type": "application/json", } ) - response.raise_for_status() + + if response.status_code == HTTPStatus.BAD_REQUEST: + raise ValueError(f"Failed to save trace data: check your trace name for conflicts, or set overwrite=True to overwrite the existing trace: {response.text}") + elif response.status_code != HTTPStatus.OK: + raise ValueError(f"Failed to save trace data: {response.text}") return self.trace_id, trace_data @@ -369,17 +383,17 @@ def __init__(self, api_key: str): if not api_key: raise ValueError("Tracer must be configured with a Judgment API key") - self.api_key = api_key - self.client = JudgmentClient(judgment_api_key=api_key) - self.depth = 0 - self._current_trace: Optional[TraceClient] = None - self.initialized = True + self.api_key: str = api_key + self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key) + self.depth: int = 0 + self._current_trace: Optional[TraceClient] = None + self.initialized: bool = True @contextmanager - def trace(self, name: str = None) -> Generator[TraceClient, None, None]: + def trace(self, name: str, project_name: str = "default_project", overwrite: bool = False) -> Generator[TraceClient, None, None]: """Start a new trace context using a context manager""" trace_id = str(uuid.uuid4()) - trace = TraceClient(self, trace_id, name or "unnamed_trace") + trace = TraceClient(self, trace_id, name, project_name=project_name) prev_trace = self._current_trace self._current_trace = trace @@ -387,7 +401,7 @@ def trace(self, name: str = None) -> Generator[TraceClient, None, None]: with trace.span(name or "unnamed_trace") as span: try: # Save the trace to the database to handle Evaluations' trace_id referential integrity - trace.save() + trace.save(empty_save=True, overwrite=overwrite) yield trace finally: self._current_trace = prev_trace @@ -398,16 +412,17 @@ def get_current_trace(self) -> Optional[TraceClient]: """ return self._current_trace - def observe(self, func=None, *, name=None): + def observe(self, func=None, *, name=None, span_type: SpanType = "span"): """ Decorator to trace function execution with detailed entry/exit information.
Args: func: The function to trace name: Optional custom name for the function + span_type: The type of span to use for this observation (default: "span") """ if func is None: - return lambda f: self.observe(f, name=name) + return lambda f: self.observe(f, name=name, span_type=span_type) if asyncio.iscoroutinefunction(func): @functools.wraps(func) @@ -415,7 +430,10 @@ async def async_wrapper(*args, **kwargs): if self._current_trace: span_name = name or func.__name__ - with self._current_trace.span(span_name) as span: + with self._current_trace.span(span_name, span_type=span_type) as span: + # Set the span type + span.span_type = span_type + # Record inputs span.record_input({ 'args': list(args), @@ -438,7 +456,10 @@ def wrapper(*args, **kwargs): if self._current_trace: span_name = name or func.__name__ - with self._current_trace.span(span_name) as span: + with self._current_trace.span(span_name, span_type=span_type) as span: + # Set the span type + span.span_type = span_type + # Record inputs span.record_input({ 'args': list(args), @@ -471,7 +492,7 @@ def traced_create(*args, **kwargs): if not (tracer and tracer._current_trace): return original_create(*args, **kwargs) - with tracer._current_trace.span(span_name) as span: + with tracer._current_trace.span(span_name, span_type="llm") as span: # Format and record the input parameters input_data = _format_input_data(client, **kwargs) span.record_input(input_data) diff --git a/judgeval/data/result.py b/judgeval/data/result.py index 9b9f4c1d..dc24c670 100644 --- a/judgeval/data/result.py +++ b/judgeval/data/result.py @@ -7,6 +7,7 @@ class ScoringResult: """ A ScoringResult contains the output of one or more scorers applied to a single example. + Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...) Args: success (bool): Whether the evaluation was successful. 
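The next hunk adds two fields to ScoringResult. As a consumer-side sketch (not part of the patch, with `res` standing in for a hypothetical ScoringResult instance returned by an eval run), the practical effect is:

payload = res.to_dict()
payload["example_id"]   # newly serialized: links the result back to the Example it scored
res.eval_run_name       # kept on the object, but not included in to_dict() as of this patch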
@@ -32,6 +33,9 @@ class ScoringResult: retrieval_context: Optional[List[str]] = None trace_id: Optional[str] = None + example_id: Optional[str] = None + eval_run_name: Optional[str] = None + def to_dict(self) -> dict: """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data.""" return { @@ -42,7 +46,8 @@ class ScoringResult: "expected_output": self.expected_output, "context": self.context, "retrieval_context": self.retrieval_context, - "trace_id": self.trace_id + "trace_id": self.trace_id, + "example_id": self.example_id } def __str__(self) -> str: diff --git a/judgeval/data/scorer_data.py b/judgeval/data/scorer_data.py index 787bc6c4..85272f7f 100644 --- a/judgeval/data/scorer_data.py +++ b/judgeval/data/scorer_data.py @@ -76,7 +76,7 @@ def create_scorer_data(scorer: CustomScorer) -> ScorerData: score=scorer.score, threshold=scorer.threshold, reason=scorer.reason, - success=scorer.success_check(), + success=scorer._success_check(), strict_mode=scorer.strict_mode, evaluation_model=scorer.evaluation_model, error=None, diff --git a/judgeval/judgment_client.py b/judgeval/judgment_client.py index fe4636a1..d8190ceb 100644 --- a/judgeval/judgment_client.py +++ b/judgeval/judgment_client.py @@ -129,20 +129,34 @@ def pull_dataset(self, alias: str) -> EvalDataset: return dataset # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend - def pull_eval(self, project_name: str, eval_run_name: str) -> List[ScoringResult]: + def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]: + """Pull evaluation results from the server. + + Args: + project_name (str): Name of the project + eval_run_name (str): Name of the evaluation run + + Returns: + List[Dict[str, Union[str, List[ScoringResult]]]]: A list of dictionaries, each containing: + - id (str): The evaluation run ID + - results (List[ScoringResult]): List of scoring results + """ eval_run_request_body = EvalRunRequestBody(project_name=project_name, eval_name=eval_run_name, judgment_api_key=self.judgment_api_key) - eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL, + eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL, json=eval_run_request_body.model_dump()) if eval_run.status_code != requests.codes.ok: raise ValueError(f"Error fetching eval results: {eval_run.json()}") - eval_results = [] + + eval_run_result = [] for result in eval_run.json(): - result = result.get("result", dict()) - filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__} - eval_results.append(ScoringResult(**filtered_result)) - return eval_results + result_id = result.get("id", "") + result_data = result.get("result", dict()) + filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__} + eval_run_result.append({"id": result_id, + "results": [ScoringResult(**filtered_result)]}) + return eval_run_result def _validate_api_key(self): """ @@ -191,3 +205,37 @@ def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer: return ClassifierScorer(**scorer_config) except Exception as e: raise JudgmentAPIError(f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}") + + def push_classifier_scorer(self, scorer: ClassifierScorer, slug: str = None) -> str: + """ + Pushes a classifier scorer configuration to the Judgment API. + + Args: + slug (str): Slug identifier for the scorer. If it exists, the scorer will be updated.
+ scorer (ClassifierScorer): The classifier scorer to save + + Returns: + str: The slug identifier of the saved scorer + + Raises: + JudgmentAPIError: If there's an error saving the scorer + """ + request_body = { + "name": scorer.name, + "conversation": [m.model_dump() for m in scorer.conversation], + "options": scorer.options, + "judgment_api_key": self.judgment_api_key, + "slug": slug + } + + response = requests.post( + f"{ROOT_API}/save_scorer/", + json=request_body + ) + + if response.status_code == 500: + raise JudgmentAPIError(f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}") + elif response.status_code != 200: + raise JudgmentAPIError(f"Failed to save classifier scorer: {response.json().get('detail', '')}") + + return response.json()["slug"] \ No newline at end of file diff --git a/judgeval/playground.py b/judgeval/playground.py index 19db5809..907ad24e 100644 --- a/judgeval/playground.py +++ b/judgeval/playground.py @@ -566,7 +566,7 @@ def _calculate_score(self) -> float: score = faithfulness_count / number_of_verdicts return 0 if self.strict_mode and score < self.threshold else score - def success_check(self) -> bool: + def _success_check(self) -> bool: if self.error is not None: self.success = False else: diff --git a/judgeval/scorers/custom_scorer.py b/judgeval/scorers/custom_scorer.py index 75816e7d..d21e47ee 100644 --- a/judgeval/scorers/custom_scorer.py +++ b/judgeval/scorers/custom_scorer.py @@ -101,7 +101,7 @@ async def a_score_example(self, example, *args, **kwargs) -> float: raise NotImplementedError("You must implement the `a_score` method in your custom scorer") @abstractmethod - def success_check(self) -> bool: + def _success_check(self) -> bool: """ For unit testing, determines whether the test case passes or fails """ diff --git a/judgeval/scorers/prompt_scorer.py b/judgeval/scorers/prompt_scorer.py index b1829afe..fb996a96 100644 --- a/judgeval/scorers/prompt_scorer.py +++ b/judgeval/scorers/prompt_scorer.py @@ -49,8 +49,8 @@ class PromptScorer(CustomScorer, BaseModel): using_native_model: bool = Field(default=True) # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD - response: Optional[dict] = None - result: Optional[float] = None + _response: Optional[dict] = None + _result: Optional[float] = None def __init__( self, @@ -100,11 +100,11 @@ def score_example( else: result, reason = self.evaluate(example) self.reason = reason - self.result = result + self._result = result self.verbose_logs = create_verbose_logs( self, steps=[ - f"Results: {self.result}\nReason: {self.reason}", + f"Results: {self._result}\nReason: {self.reason}", ], ) return result @@ -120,11 +120,11 @@ async def a_score_example( with scorer_progress_meter(self, display_meter=_show_indicator): result, reason = await self.a_evaluate(example) self.reason = reason - self.result = result + self._result = result self.verbose_logs = create_verbose_logs( self, steps=[ - f"Results: {self.result}\nReason: {self.reason}", + f"Results: {self._result}\nReason: {self.reason}", ], ) return result @@ -138,11 +138,11 @@ def evaluate(self, example: Example) -> Tuple[Any, str]: NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field. 
""" - prompt = self.build_measure_prompt(example) + prompt = self._build_measure_prompt(example) if self.using_native_model: res = self.model.generate(prompt) response = parse_response_json(res, self) - result, reason = self.process_response(response) + result, reason = self._process_response(response) return result, reason else: raise NotImplementedError("Non-native judge models are not supported in synchronous mode yet.") @@ -156,25 +156,25 @@ async def a_evaluate(self, example: Example) -> Tuple[Any, str]: NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field. """ - judge_prompt = self.build_measure_prompt(example) - schema = self.build_schema() - prompt = self.enforce_prompt_format(judge_prompt=judge_prompt, schema=schema) + judge_prompt = self._build_measure_prompt(example) + schema = self._build_schema() + prompt = self._enforce_prompt_format(judge_prompt=judge_prompt, schema=schema) if self.using_native_model: res = await self.model.a_generate(prompt) response = parse_response_json(res, self) - self.response = response + self._response = response - result, reason = self.process_response(response) + result, reason = self._process_response(response) self.score = result self.reason = reason - self.response = response + self._response = response return result, reason else: raise NotImplementedError("Non-native judge models are not supported in async mode yet.") # TODO: can we make this take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args @abstractmethod - def build_measure_prompt(self, example: Example) -> List[dict]: + def _build_measure_prompt(self, example: Example) -> List[dict]: # builds the prompt that is sent to the model inside of the `score_example()` method # returns either a string prompt or a conversation prompt of the form [{"role": "system", "content": "..."}, ...] @@ -197,7 +197,7 @@ def build_measure_prompt(self, example: Example) -> List[dict]: # TODO: does this need to take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args @abstractmethod - def build_schema(self) -> dict: + def _build_schema(self) -> dict: """ This function returns a dictionary that represents the schema of the JSON response that the judge model should return. @@ -208,7 +208,7 @@ def build_schema(self) -> dict: """ pass - def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict): + def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict): """ Formats the final prompt to the judge model. @@ -248,7 +248,7 @@ def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict): raise TypeError(f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead.") @abstractmethod - def process_response(self, response: dict): + def _process_response(self, response: dict): """ Customizable method for processing the response from the judge model. @@ -264,7 +264,7 @@ def process_response(self, response: dict): pass @abstractmethod - def success_check(self, **kwargs) -> bool: + def _success_check(self, **kwargs) -> bool: """ Determines whether or not the PromptScorer should consider the evaluation of a single example successful. 
""" @@ -320,7 +320,16 @@ def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapp verbose_mode=verbose_mode, ) - def build_measure_prompt(self, example: Example) -> List[dict]: + def _build_measure_prompt(self, example: Example) -> List[dict]: + """ + Builds the measure prompt for the classifier scorer. + + Args: + example (Example): The example to build the prompt for + + Returns: + List[dict]: The measure prompt for the classifier scorer + """ replacement_words = { "{{actual_output}}": example.actual_output, "{{expected_output}}": example.expected_output, @@ -341,10 +350,10 @@ def build_measure_prompt(self, example: Example) -> List[dict]: message["content"] = content.replace(key, str(value)) return conversation_copy - def build_schema(self) -> dict: + def _build_schema(self) -> dict: return self.options - def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]: + def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]: """ Enforces the judge model to choose an option from the schema. @@ -369,15 +378,45 @@ def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[ judge_prompt[0]["content"] = system_role return judge_prompt - def process_response(self, response: dict) -> Tuple[float, str]: + def _process_response(self, response: dict) -> Tuple[float, str]: choice = response.get("choice") if choice not in self.options: raise ValueError(f"Invalid choice: {choice}. Expected one of: {self.options.keys()}") reason = response.get("reason", "No reason could be found in model response.") return self.options[choice], reason - def success_check(self, **kwargs) -> bool: + def _success_check(self, **kwargs) -> bool: return self.score >= self.threshold + + def update_name(self, name: str): + """ + Updates the name of the scorer. + """ + self.name = name + + def update_threshold(self, threshold: float): + """ + Updates the threshold of the scorer. + """ + self.threshold = threshold + + def update_conversation(self, conversation: List[dict]): + """ + Updates the conversation with the new conversation. + + Sample conversation: + [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}] + """ + self.conversation = conversation + + def update_options(self, options: Mapping[str, float]): + """ + Updates the options with the new options. 
+ + Sample options: + {"yes": 1, "no": 0} + """ + self.options = options def __str__(self): return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})" diff --git a/tests/common/test_tracer.py b/tests/common/test_tracer.py index cb6abd61..edc58197 100644 --- a/tests/common/test_tracer.py +++ b/tests/common/test_tracer.py @@ -149,17 +149,21 @@ def test_record_input_output(trace_client): def test_condense_trace(trace_client): """Test trace condensing functionality""" + # Store the base depth from the enter event + base_depth = 0 entries = [ - {"type": "enter", "function": "test_func", "depth": 0, "timestamp": 1.0}, - {"type": "input", "function": "test_func", "depth": 1, "timestamp": 1.1, "inputs": {"x": 1}}, - {"type": "output", "function": "test_func", "depth": 1, "timestamp": 1.2, "output": "result"}, - {"type": "exit", "function": "test_func", "depth": 0, "timestamp": 2.0}, + {"type": "enter", "function": "test_func", "depth": base_depth, "timestamp": 1.0}, + {"type": "input", "function": "test_func", "depth": base_depth + 1, "timestamp": 1.1, "inputs": {"x": 1}}, + {"type": "output", "function": "test_func", "depth": base_depth + 1, "timestamp": 1.2, "output": "result"}, + {"type": "exit", "function": "test_func", "depth": base_depth, "timestamp": 2.0}, ] condensed = trace_client.condense_trace(entries) + print(f"{condensed=}") + # Test that the condensed entry's depth matches the enter event's depth assert len(condensed) == 1 assert condensed[0]["function"] == "test_func" - assert condensed[0]["depth"] == 1 + assert condensed[0]["depth"] == entries[0]["depth"] # Should match the enter event's depth assert condensed[0]["inputs"] == {"x": 1} assert condensed[0]["output"] == "result" assert condensed[0]["duration"] == 1.0 @@ -167,50 +171,35 @@ def test_condense_trace(trace_client): @patch('requests.post') def test_save_trace(mock_post, trace_client): """Test saving trace data""" - mock_post.return_value.raise_for_status = Mock() + # Configure mock response properly + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = '{"message": "success"}' + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response with trace_client.span("test_span"): trace_client.record_input({"arg": 1}) trace_client.record_output("result") trace_id, data = trace_client.save() - assert mock_post.called assert data["trace_id"] == trace_client.trace_id - assert data["name"] == "test_trace" - assert len(data["entries"]) > 0 - assert isinstance(data["created_at"], str) - assert isinstance(data["duration"], float) - -def test_observe_decorator(tracer): - """Test the @tracer.observe decorator""" - @tracer.observe - def test_function(x, y): - return x + y - - with tracer.trace("test_trace"): - result = test_function(1, 2) - - assert result == 3 - -def test_observe_decorator_with_error(tracer): - """Test decorator error handling""" - @tracer.observe - def failing_function(): - raise ValueError("Test error") - - with tracer.trace("test_trace"): - with pytest.raises(ValueError): - failing_function() @patch('requests.post') def test_wrap_openai(mock_post, tracer): """Test wrapping OpenAI client""" + # Configure mock response properly + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = '{"message": "success"}' + mock_post.return_value = mock_response + client = OpenAI() - mock_response = MagicMock() - mock_response.choices = 
[MagicMock(message=MagicMock(content="test response"))] - mock_response.usage = MagicMock(prompt_tokens=10, completion_tokens=20, total_tokens=30) - client.chat.completions.create = MagicMock(return_value=mock_response) + mock_completion = MagicMock() + mock_completion.choices = [MagicMock(message=MagicMock(content="test response"))] + mock_completion.usage = MagicMock(prompt_tokens=10, completion_tokens=20, total_tokens=30) + client.chat.completions.create = MagicMock(return_value=mock_completion) wrapped_client = wrap(client) @@ -220,16 +209,22 @@ def test_wrap_openai(mock_post, tracer): messages=[{"role": "user", "content": "test"}] ) - assert response == mock_response + assert response == mock_completion @patch('requests.post') def test_wrap_anthropic(mock_post, tracer): """Test wrapping Anthropic client""" + # Configure mock response properly + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = '{"message": "success"}' + mock_post.return_value = mock_response + client = Anthropic() - mock_response = MagicMock() - mock_response.content = [MagicMock(text="test response")] - mock_response.usage = MagicMock(input_tokens=10, output_tokens=20) - client.messages.create = MagicMock(return_value=mock_response) + mock_completion = MagicMock() + mock_completion.content = [MagicMock(text="test response")] + mock_completion.usage = MagicMock(input_tokens=10, output_tokens=20) + client.messages.create = MagicMock(return_value=mock_completion) wrapped_client = wrap(client) @@ -239,7 +234,7 @@ def test_wrap_anthropic(mock_post, tracer): messages=[{"role": "user", "content": "test"}] ) - assert response == mock_response + assert response == mock_completion def test_wrap_unsupported_client(tracer): """Test wrapping unsupported client type""" @@ -266,3 +261,24 @@ def test_tracer_invalid_api_key(mocker): with pytest.raises(JudgmentAPIError, match="Issue with passed in Judgment API key: API key is invalid"): Tracer(api_key="invalid_key") + +def test_observe_decorator(tracer): + """Test the @tracer.observe decorator""" + @tracer.observe + def test_function(x, y): + return x + y + + with tracer.trace("test_trace"): + result = test_function(1, 2) + + assert result == 3 + +def test_observe_decorator_with_error(tracer): + """Test decorator error handling""" + @tracer.observe + def failing_function(): + raise ValueError("Test error") + + with tracer.trace("test_trace"): + with pytest.raises(ValueError): + failing_function() diff --git a/tests/data/test_scorer_data.py b/tests/data/test_scorer_data.py index 1f1e7829..a9ea1dc9 100644 --- a/tests/data/test_scorer_data.py +++ b/tests/data/test_scorer_data.py @@ -44,7 +44,7 @@ def score_example(self, example, *args, **kwargs): async def a_score_example(self, example, *args, **kwargs): pass - def success_check(self) -> bool: + def _success_check(self) -> bool: return self.score >= self.threshold if self.score is not None else False diff --git a/tests/scorers/test_custom_scorer.py b/tests/scorers/test_custom_scorer.py index c01b12a9..6cf4e7ef 100644 --- a/tests/scorers/test_custom_scorer.py +++ b/tests/scorers/test_custom_scorer.py @@ -29,7 +29,7 @@ def score_example(self, example, *args, **kwargs) -> float: async def a_score_example(self, example, *args, **kwargs) -> float: return 0.9 - def success_check(self) -> bool: + def _success_check(self) -> bool: return self.score >= self.threshold if self.score is not None else False @pytest.fixture @@ -118,15 +118,15 @@ def test_success_check_implementation(self, basic_scorer): """Test 
success_check with various scores""" # Test with score above threshold basic_scorer.score = 0.8 - assert basic_scorer.success_check() is True + assert basic_scorer._success_check() is True # Test with score below threshold basic_scorer.score = 0.6 - assert basic_scorer.success_check() is False + assert basic_scorer._success_check() is False # Test with no score basic_scorer.score = None - assert basic_scorer.success_check() is False + assert basic_scorer._success_check() is False def test_str_representation(self, basic_scorer): """Test string representation of scorer""" @@ -149,4 +149,4 @@ class IncompleteScorer(CustomScorer): asyncio.run(scorer.a_score_example({})) with pytest.raises(NotImplementedError): - scorer.success_check() + scorer._success_check() diff --git a/tests/scorers/test_prompt_scorer.py b/tests/scorers/test_prompt_scorer.py index e5e7e9ed..7c50e195 100644 --- a/tests/scorers/test_prompt_scorer.py +++ b/tests/scorers/test_prompt_scorer.py @@ -35,20 +35,20 @@ def __init__(self, mock_model, *args, **kwargs): super().__init__(*args, **kwargs) self.model = mock_model - def build_measure_prompt(self, example: Example) -> List[dict]: + def _build_measure_prompt(self, example: Example) -> List[dict]: return [ {"role": "system", "content": "Test system prompt"}, {"role": "user", "content": f"Response: {example.actual_output}"} ] - def build_schema(self) -> dict: + def _build_schema(self) -> dict: return {"score": float, "reason": str} - def process_response(self, response: dict): + def _process_response(self, response: dict): return response["score"], response["reason"] - def success_check(self, **kwargs) -> bool: - return self.result >= self.threshold + def _success_check(self, **kwargs) -> bool: + return self._result >= self.threshold # Tests for PromptScorer class TestPromptScorer: @@ -68,7 +68,7 @@ def test_enforce_prompt_format(self, mock_model): prompt = [{"role": "system", "content": "Base prompt"}] schema = {"score": float, "reason": str} - formatted = scorer.enforce_prompt_format(prompt, schema) + formatted = scorer._enforce_prompt_format(prompt, schema) assert "JSON format" in formatted[0]["content"] assert '"score": (float)' in formatted[0]["content"] assert '"reason": (str)' in formatted[0]["content"] @@ -76,7 +76,7 @@ def test_enforce_prompt_format(self, mock_model): def test_enforce_prompt_format_invalid_input(self, mock_model): scorer = SampleScorer(name="test_scorer", mock_model=mock_model) with pytest.raises(TypeError): - scorer.enforce_prompt_format("invalid", {}) + scorer._enforce_prompt_format("invalid", {}) @pytest.mark.asyncio async def test_a_score_example(self, example, mock_model): @@ -124,7 +124,7 @@ def test_build_measure_prompt(self, example, classifier_conversation, classifier options=classifier_options ) - prompt = scorer.build_measure_prompt(example) + prompt = scorer._build_measure_prompt(example) assert "This is a test response" in prompt[0]["content"] def test_process_response(self, classifier_conversation, classifier_options): @@ -136,7 +136,7 @@ def test_process_response(self, classifier_conversation, classifier_options): ) response = {"choice": "positive", "reason": "Test reason"} - score, reason = scorer.process_response(response) + score, reason = scorer._process_response(response) assert score == 1.0 assert reason == "Test reason" @@ -150,7 +150,7 @@ def test_process_response_invalid_choice(self, classifier_conversation, classifi response = {"choice": "invalid", "reason": "Test reason"} with pytest.raises(ValueError): - 
scorer.process_response(response) + scorer._process_response(response) def test_success_check(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( @@ -161,7 +161,7 @@ def test_success_check(self, classifier_conversation, classifier_options): ) scorer.score = 1.0 - assert scorer.success_check() is True + assert scorer._success_check() is True scorer.score = 0.0 - assert scorer.success_check() is False + assert scorer._success_check() is False diff --git a/tests/scorers/test_score.py b/tests/scorers/test_score.py index 08354fd9..500412e2 100644 --- a/tests/scorers/test_score.py +++ b/tests/scorers/test_score.py @@ -20,7 +20,7 @@ def score_example(self, example, *args, **kwargs): async def a_score_example(self, example, *args, **kwargs): pass - def success_check(self): + def _success_check(self): return True @@ -798,7 +798,7 @@ def mock_scorer(): scorer.evaluation_model = "test-model" scorer.score = 0.9 scorer.reason = "Test reason" - scorer.success_check.return_value = True + scorer._success_check.return_value = True scorer.evaluation_cost = 0.1 scorer.verbose_logs = "Test logs" scorer.additional_metadata = {"key": "value"} diff --git a/tests/scorers/test_scorer_utils.py b/tests/scorers/test_scorer_utils.py index c10ac0a6..d355cff0 100644 --- a/tests/scorers/test_scorer_utils.py +++ b/tests/scorers/test_scorer_utils.py @@ -33,7 +33,7 @@ def score_example(self, example: Example, *args, **kwargs) -> float: async def a_score_example(self, example: Example, *args, **kwargs) -> float: return 1.0 - def success_check(self) -> bool: + def _success_check(self) -> bool: return True
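Viewed as a whole, the renames in this patch turn the scorer hooks into a private template-method API: subclasses implement _build_measure_prompt, _build_schema, _process_response, and _success_check, while PromptScorer drives the evaluation. Below is a minimal subclass sketch in the spirit of the test fixtures above; it is not part of the patch, and the import paths and constructor keywords are assumptions based on how those fixtures are used.

from typing import List, Tuple

from judgeval.data import Example                        # import path assumed
from judgeval.scorers.prompt_scorer import PromptScorer

class PositivityScorer(PromptScorer):
    """Toy scorer exercising the renamed private hooks."""

    def _build_measure_prompt(self, example: Example) -> List[dict]:
        # Conversation-style judge prompt, as in e2etests/test_prompt_scoring.py
        return [
            {"role": "system", "content": "Rate how positive the response is on a 1-5 scale."},
            {"role": "user", "content": f"Response: {example.actual_output}"},
        ]

    def _build_schema(self) -> dict:
        # Shape of the JSON the judge model is asked to return
        return {"score": int, "reason": str}

    def _process_response(self, response: dict) -> Tuple[float, str]:
        return response["score"], response["reason"]

    def _success_check(self, **kwargs) -> bool:
        return self.score is not None and self.score >= self.threshold

# Usage, assuming PromptScorer accepts these keywords as the fixtures suggest,
# and `client` is a JudgmentClient:
#   scorer = PositivityScorer(name="positivity", threshold=3)
#   client.run_evaluation(examples=[...], scorers=[scorer], model="QWEN",
#                         project_name="ToneScorerTest", eval_run_name="ToneScorerTest",
#                         log_results=True)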