diff --git a/Pipfile b/Pipfile index 7a1a9f02..f5fd917a 100644 --- a/Pipfile +++ b/Pipfile @@ -16,6 +16,7 @@ pandas = "*" openai = "*" together = "*" anthropic = "*" +patronus = "*" [dev-packages] pytest = "*" diff --git a/demo/test_competitors.py b/demo/test_competitors.py new file mode 100644 index 00000000..423906ce --- /dev/null +++ b/demo/test_competitors.py @@ -0,0 +1,96 @@ +from dotenv import load_dotenv +from patronus import Client +import os +import asyncio +import time +from openai import OpenAI +from anthropic import Anthropic + +load_dotenv() + +PATRONUS_API_KEY = os.getenv("PATRONUS_API_KEY") + +client = Client(api_key=PATRONUS_API_KEY) + +# Initialize clients +openai_client = OpenAI() +anthropic_client = Anthropic() + +async def make_upper(input: str) -> str: + output = input.upper() + result = client.evaluate( + evaluator="answer-relevance", + criteria="patronus:answer-relevance", + evaluated_model_input=input, + evaluated_model_output=output, + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + return output + +def llm_call(input): + time.sleep(1.3) + return "We have a 30 day full refund policy on shoes." + +async def answer_user_question(input): + output = llm_call(input) + result = client.evaluate( + evaluator="answer-relevance", + criteria="patronus:answer-relevance", + evaluated_model_input=input, + evaluated_model_output=output, + evaluated_model_retrieved_context=["All customers are eligible for a 30 day full refund at no extra cost."], + expected_output="We offer a 30-day full refund at no extra cost.", + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + return output + +async def make_poem(input: str) -> str: + try: + # Using Anthropic API + anthropic_response = anthropic_client.messages.create( + model="claude-3-sonnet-20240229", + messages=[{"role": "user", "content": input}], + max_tokens=30 + ) + anthropic_result = anthropic_response.content[0].text + + result = client.evaluate( + evaluator="answer-relevance", + criteria="patronus:answer-relevance", + evaluated_model_input=input, + evaluated_model_output=anthropic_result, + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + + # Using OpenAI API + openai_response = openai_client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": "Make a short sentence with the input."}, + {"role": "user", "content": input} + ] + ) + openai_result = openai_response.choices[0].message.content + + return f"{anthropic_result} {openai_result}".lower() + + except Exception as e: + print(f"Error generating poem: {e}") + return "" + +async def test_evaluation_mixed(input): + upper = await make_upper(input) + result = await make_poem(upper) + await answer_user_question("What if these shoes don't fit?") + return result + +if __name__ == "__main__": + test_input = "Write a poem about Nissan R32 GTR" + asyncio.run(test_evaluation_mixed(test_input)) + diff --git a/docs/notebooks/prompt_scorer.ipynb b/docs/notebooks/prompt_scorer.ipynb index efe0323c..fb3f0223 100644 --- a/docs/notebooks/prompt_scorer.ipynb +++ b/docs/notebooks/prompt_scorer.ipynb @@ -157,7 +157,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/e2etests/judgment_client_test.py b/e2etests/judgment_client_test.py index 4eb7484b..b5872f55 100644 --- a/e2etests/judgment_client_test.py +++ b/e2etests/judgment_client_test.py @@ -3,11 +3,14 @@ """ import os +from pydantic import BaseModel + from 
judgeval.judgment_client import JudgmentClient from judgeval.data import Example from judgeval.scorers import ( FaithfulnessScorer, HallucinationScorer, + JSONCorrectnessScorer ) from judgeval.judges import TogetherJudge from judgeval.playground import CustomFaithfulnessMetric @@ -16,6 +19,8 @@ import random import string +from judgeval.scorers.prompt_scorer import ClassifierScorer + load_dotenv() def get_client(): @@ -35,10 +40,49 @@ def test_dataset(client: JudgmentClient): print(dataset) def test_run_eval(client: JudgmentClient): + # Single step in our workflow, an outreach Sales Agent + + example1 = Example( + input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.", + actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex", + retrieval_context=["TechCorp launched AI analytics platform in 2024", "Sarah Chen is CEO, ex-Google executive", "Current client base: 50+ enterprise customers"], + ) + + example2 = Example( + input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.", + actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex", + expected_output="A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans", + context=["Business Development"], + retrieval_context=["GreenEnergy Solutions won 2023 sustainability award", "New solar technology 30% more efficient", "Planning European market expansion"], + ) + + scorer = FaithfulnessScorer(threshold=0.5) + scorer2 = HallucinationScorer(threshold=0.5) + c_scorer = CustomFaithfulnessMetric(threshold=0.6) + + PROJECT_NAME = "OutreachWorkflow" + EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt" + + client.run_evaluation( + examples=[example1, example2], + scorers=[scorer, scorer2], + model="QWEN", + metadata={"batch": "test"}, + project_name=PROJECT_NAME, + eval_run_name=EVAL_RUN_NAME, + log_results=True, + override=True, + ) + + results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME) + print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results) + + +def test_json_scorer(client: JudgmentClient): example1 = Example( input="What if these shoes don't fit?", - actual_output="We offer a 30-day full refund at no extra cost.", + actual_output='{"tool": "authentication"}', retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."], trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6" ) @@ -55,16 +99,16 @@ def test_run_eval(client: JudgmentClient): additional_metadata={"difficulty": "medium"} ) - scorer = FaithfulnessScorer(threshold=0.5) - scorer2 = HallucinationScorer(threshold=0.5) - c_scorer = 
CustomFaithfulnessMetric(threshold=0.6) + class SampleSchema(BaseModel): + tool: str + scorer = JSONCorrectnessScorer(threshold=0.5, json_schema=SampleSchema) PROJECT_NAME = "test_project_JOSEPH" EVAL_RUN_NAME = "yomadude" - _ = client.run_evaluation( + res = client.run_evaluation( examples=[example1, example2], - scorers=[scorer, c_scorer], + scorers=[scorer], model="QWEN", metadata={"batch": "test"}, project_name=PROJECT_NAME, @@ -73,8 +117,8 @@ override=True, ) - results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME) - print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results) + print(res) + def test_override_eval(client: JudgmentClient): example1 = Example( @@ -146,8 +190,6 @@ if "already exists" not in str(e): raise print(f"Successfully caught expected error: {e}") - - def test_evaluate_dataset(client: JudgmentClient): @@ -194,8 +236,10 @@ def test_classifier_scorer(client: JudgmentClient): examples=[example1], scorers=[classifier_scorer, faithfulness_scorer], model="QWEN", + log_results=True, + eval_run_name="ToneScorerTest", + project_name="ToneScorerTest", ) - print(res) if __name__ == "__main__": # Test client functionality @@ -213,6 +257,11 @@ def test_classifier_scorer(client: JudgmentClient): test_run_eval(ui_client) print("Evaluation run successful") print("*" * 40) + + print("Testing JSON scorer") + test_json_scorer(ui_client) + print("JSON scorer test successful") + print("*" * 40) print("Testing evaluation run override") test_override_eval(client) diff --git a/e2etests/test_prompt_scoring.py b/e2etests/test_prompt_scoring.py index 51f8c9c3..ac535d76 100644 --- a/e2etests/test_prompt_scoring.py +++ b/e2etests/test_prompt_scoring.py @@ -36,7 +36,7 @@ def __init__( ) self.score = 0.0 - def build_measure_prompt(self, example: Example): + def _build_measure_prompt(self, example: Example): SYSTEM_ROLE = ( 'You are a great judge of emotional intelligence. You understand the feelings ' 'and intentions of others. You will be tasked with judging whether the following ' @@ -51,16 +51,16 @@ def build_measure_prompt(self, example: Example): ] return conversation - def build_schema(self): + def _build_schema(self): return { "score": int, "reason": str } - def process_response(self, response): + def _process_response(self, response): return response["score"], response["reason"] - def success_check(self): + def _success_check(self): POSITIVITY_THRESHOLD = 3 # we want all model responses to be somewhat positive in tone return self.score <= POSITIVITY_THRESHOLD diff --git a/e2etests/test_tracer.py b/e2etests/test_tracer.py index 2f97280c..d2262377 100644 --- a/e2etests/test_tracer.py +++ b/e2etests/test_tracer.py @@ -14,11 +14,11 @@ from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer # Initialize the tracer and clients -judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY")) +judgment = Tracer(api_key=os.getenv("UI_JUDGMENT_API_KEY")) openai_client = wrap(OpenAI()) anthropic_client = wrap(Anthropic()) -@judgment.observe +@judgment.observe(span_type="tool") async def make_upper(input: str) -> str: """Convert input to uppercase and evaluate using judgment API.
@@ -28,6 +28,7 @@ async def make_upper(input: str) -> str: The uppercase version of the input string """ output = input.upper() + await judgment.get_current_trace().async_evaluate( scorers=[FaithfulnessScorer(threshold=0.5)], input="What if these shoes don't fit?", @@ -38,9 +39,10 @@ async def make_upper(input: str) -> str: model="gpt-4o-mini", log_results=True ) + return output -@judgment.observe +@judgment.observe(span_type="tool") async def make_lower(input): output = input.lower() @@ -59,11 +61,12 @@ async def make_lower(input): ) return output -@judgment.observe +@judgment.observe(span_type="llm") def llm_call(input): + time.sleep(1.3) return "We have a 30 day full refund policy on shoes." -@judgment.observe +@judgment.observe(span_type="tool") async def answer_user_question(input): output = llm_call(input) await judgment.get_current_trace().async_evaluate( @@ -77,7 +80,7 @@ async def answer_user_question(input): ) return output -@judgment.observe +@judgment.observe(span_type="tool") async def make_poem(input: str) -> str: """Generate a poem using both Anthropic and OpenAI APIs. @@ -95,6 +98,15 @@ async def make_poem(input: str) -> str: ) anthropic_result = anthropic_response.content[0].text + await judgment.get_current_trace().async_evaluate( + input=input, + actual_output=anthropic_result, + score_type=APIScorer.ANSWER_RELEVANCY, + threshold=0.5, + model="gpt-4o-mini", + log_results=True + ) + # Using OpenAI API openai_response = openai_client.chat.completions.create( model="gpt-4o-mini", @@ -112,7 +124,8 @@ async def make_poem(input: str) -> str: return "" async def test_evaluation_mixed(input): - with judgment.trace("test_evaluation") as trace: + PROJECT_NAME = "NewPoemBot" + with judgment.trace("Use-claude", project_name=PROJECT_NAME, overwrite=True) as trace: upper = await make_upper(input) result = await make_poem(upper) await answer_user_question("What if these shoes don't fit?") diff --git a/judgeval/common/telemetry.py b/judgeval/common/telemetry.py deleted file mode 100644 index 22fd05db..00000000 --- a/judgeval/common/telemetry.py +++ /dev/null @@ -1,123 +0,0 @@ -from contextlib import contextmanager -import logging -import os -import socket -import sys -import uuid -import sentry_sdk -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( - OTLPSpanExporter, -) - - -def get_unique_id(): - unique_id = os.getenv("judgeval_UNIQUE_ID") - if unique_id is None: - unique_id = str(uuid.uuid4()) - os.environ["judgeval_UNIQUE_ID"] = unique_id - return unique_id - - -def telemetry_opt_out(): - return os.getenv("judgeval_TELEMETRY_OPT_OUT") == "YES" - - -def blocked_by_firewall(): - try: - socket.create_connection(("www.google.com", 80)) - return False - except OSError: - return True - - -if not telemetry_opt_out(): - sentry_sdk.init( - dsn="https://5ef587d58109ee45d6544f3657efdd1f@o4506098477236224.ingest.sentry.io/4506098479136768", - profiles_sample_rate=1.0, - traces_sample_rate=1.0, # For performance monitoring - send_default_pii=False, # Don't send personally identifiable information - attach_stacktrace=False, # Don't attach stack traces to messages - default_integrations=False, # Disable Sentry's default integrations - ) - - # Set up the Tracer Provider - trace.set_tracer_provider(TracerProvider()) - tracer_provider = trace.get_tracer_provider() - - # New Relic License Key and OTLP Endpoint - NEW_RELIC_LICENSE_KEY = 
"1711c684db8a30361a7edb0d0398772cFFFFNRAL" - NEW_RELIC_OTLP_ENDPOINT = "https://otlp.nr-data.net:4317" - otlp_exporter = OTLPSpanExporter( - endpoint=NEW_RELIC_OTLP_ENDPOINT, - headers={"api-key": NEW_RELIC_LICENSE_KEY}, - ) - - # Add the OTLP exporter to the span processor - span_processor = BatchSpanProcessor(otlp_exporter) - tracer_provider.add_span_processor(span_processor) - - logging.getLogger("opentelemetry.exporter.otlp").setLevel(logging.CRITICAL) - - # Create a tracer for your application - tracer = trace.get_tracer(__name__) - - -if ( - os.getenv("ERROR_REPORTING") == "YES" - and not blocked_by_firewall() - and not os.getenv("TELEMETRY_OPT_OUT") -): - - def handle_exception(exc_type, exc_value, exc_traceback): - print({"exc_type": exc_type, "exc_value": exc_value}) - sentry_sdk.capture_exception(exc_value) - sys.__excepthook__(exc_type, exc_value, exc_traceback) - - sys.excepthook = handle_exception - - -@contextmanager -def capture_evaluation_run(type: str): - if not telemetry_opt_out(): - with tracer.start_as_current_span(f"Evaluation run: {type}") as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield - - -@contextmanager -def capture_metric_type(metric_name: str, _track: bool = True): - if not telemetry_opt_out() and _track: - with tracer.start_as_current_span(metric_name) as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield - - -@contextmanager -def capture_synthesizer_run(max_generations: int = None, method: str = None): - if not telemetry_opt_out() and max_generations is not None: - with tracer.start_as_current_span( - f"Invoked synthesizer ({max_generations}) | Method: {method}" - ) as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield - - -@contextmanager -def capture_red_teamer_run(task: str): - if not telemetry_opt_out(): - with tracer.start_as_current_span( - f"Invoked red teamer: ({task})" - ) as span: - span.set_attribute("user.unique_id", get_unique_id()) - yield span - else: - yield diff --git a/judgeval/common/tracer.py b/judgeval/common/tracer.py index bc6bf071..34b135cd 100644 --- a/judgeval/common/tracer.py +++ b/judgeval/common/tracer.py @@ -28,6 +28,7 @@ import json import warnings from pydantic import BaseModel +from http import HTTPStatus from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL from judgeval.judgment_client import JudgmentClient @@ -38,7 +39,7 @@ # Define type aliases for better code readability and maintainability ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types - +SpanType = Literal['span', 'tool', 'llm', 'evaluation'] @dataclass class TraceEntry: """Represents a single trace entry with its visual representation. 
@@ -58,7 +59,8 @@ class TraceEntry: duration: Optional[float] = None # Time taken (for exit/evaluation entries) output: Any = None # Function output value # Use field() for mutable defaults to avoid shared state issues - inputs: dict = field(default_factory=dict) + inputs: dict = field(default_factory=dict) + span_type: SpanType = "span" evaluation_result: Optional[List[ScoringResult]] = field(default=None) def print_entry(self): @@ -93,7 +95,8 @@ def to_dict(self) -> dict: "duration": self.duration, "output": output, "inputs": self.inputs or None, # Convert empty dict to None - "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None + "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None, + "span_type": self.span_type } def _serialize_output(self) -> Any: @@ -112,17 +115,19 @@ def _serialize_output(self) -> Any: class TraceClient: """Client for managing a single trace context""" - def __init__(self, tracer, trace_id: str, name: str): + def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project"): self.tracer = tracer self.trace_id = trace_id self.name = name + self.project_name = project_name self.client: JudgmentClient = tracer.client self.entries: List[TraceEntry] = [] self.start_time = time.time() - self._current_span = None + self.span_type = None + self._current_span: Optional[TraceEntry] = None @contextmanager - def span(self, name: str): + def span(self, name: str, span_type: SpanType = "span"): """Context manager for creating a trace span""" start_time = time.time() @@ -132,7 +137,8 @@ def span(self, name: str): function=name, depth=self.tracer.depth, message=name, - timestamp=start_time + timestamp=start_time, + span_type=span_type )) self.tracer.depth += 1 @@ -152,7 +158,8 @@ def span(self, name: str): depth=self.tracer.depth, message=f"← {name}", timestamp=time.time(), - duration=duration + duration=duration, + span_type=span_type )) self._current_span = prev_span @@ -199,6 +206,7 @@ def record_evaluation(self, results: List[ScoringResult], start_time: float): """Record evaluation results for the current span""" if self._current_span: duration = time.time() - start_time # Calculate duration from start_time + self.add_entry(TraceEntry( type="evaluation", function=self._current_span, @@ -206,7 +214,8 @@ def record_evaluation(self, results: List[ScoringResult], start_time: float): message=f"Evaluation results for {self._current_span}", timestamp=time.time(), evaluation_result=results, - duration=duration + duration=duration, + span_type="evaluation" )) def record_input(self, inputs: dict): @@ -218,7 +227,8 @@ def record_input(self, inputs: dict): depth=self.tracer.depth, message=f"Inputs to {self._current_span}", timestamp=time.time(), - inputs=inputs + inputs=inputs, + span_type=self.span_type )) async def _update_coroutine_output(self, entry: TraceEntry, coroutine: Any): @@ -240,7 +250,8 @@ def record_output(self, output: Any): depth=self.tracer.depth, message=f"Output from {self._current_span}", timestamp=time.time(), - output="" if inspect.iscoroutine(output) else output + output="" if inspect.iscoroutine(output) else output, + span_type=self.span_type ) self.add_entry(entry) @@ -266,45 +277,39 @@ def get_duration(self) -> float: def condense_trace(self, entries: List[dict]) -> List[dict]: """ - Condenses trace entries into a single entry for each function. 
- - Groups entries by function call and combines them into a single entry with: - - depth: deepest depth for this function call - - duration: time from first to last timestamp - - function: function name - - inputs: non-None inputs - - output: non-None outputs - - evaluation_result: evaluation results - - timestamp: first timestamp of the function call + Condenses trace entries into a single entry for each function call. """ condensed = [] - current_func = None - current_entry = None + active_functions = [] # Stack to track nested function calls + function_entries = {} # Store entries for each function for entry in entries: + function = entry["function"] + if entry["type"] == "enter": - # Start of new function call - current_func = entry["function"] - current_entry = { + # Initialize new function entry + function_entries[function] = { "depth": entry["depth"], - "function": entry["function"], + "function": function, "timestamp": entry["timestamp"], "inputs": None, "output": None, - "evaluation_result": None + "evaluation_result": None, + "span_type": entry.get("span_type", "span") } - - elif entry["type"] == "exit" and entry["function"] == current_func: - # End of current function + active_functions.append(function) + + elif entry["type"] == "exit" and function in active_functions: + # Complete function entry + current_entry = function_entries[function] current_entry["duration"] = entry["timestamp"] - current_entry["timestamp"] condensed.append(current_entry) - current_func = None - current_entry = None - - elif current_func and entry["function"] == current_func: - # Additional entries for current function - if entry["depth"] > current_entry["depth"]: - current_entry["depth"] = entry["depth"] + active_functions.remove(function) + del function_entries[function] + + elif function in active_functions: + # Update existing function entry with additional data + current_entry = function_entries[function] if entry["type"] == "input" and entry["inputs"]: current_entry["inputs"] = entry["inputs"] @@ -315,9 +320,11 @@ def condense_trace(self, entries: List[dict]) -> List[dict]: if entry["type"] == "evaluation" and entry["evaluation_result"]: current_entry["evaluation_result"] = entry["evaluation_result"] + # Sort by timestamp + condensed.sort(key=lambda x: x["timestamp"]) return condensed - def save(self) -> Tuple[str, dict]: + def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]: """ Save the current trace to the database. Returns a tuple of (trace_id, trace_data) where trace_data is the trace data that was saved. 
@@ -333,6 +340,7 @@ def save(self) -> Tuple[str, dict]: "trace_id": self.trace_id, "api_key": self.tracer.api_key, "name": self.name, + "project_name": self.project_name, "created_at": datetime.fromtimestamp(self.start_time).isoformat(), "duration": total_duration, "token_counts": { @@ -340,7 +348,9 @@ "completion_tokens": 0, # Dummy value "total_tokens": 0, # Dummy value }, # TODO: Add token counts - "entries": condensed_entries + "entries": condensed_entries, + "empty_save": empty_save, + "overwrite": overwrite } # Save trace data by making POST request to API @@ -351,7 +361,11 @@ "Content-Type": "application/json", } ) - response.raise_for_status() + + if response.status_code == HTTPStatus.BAD_REQUEST: + raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}") + elif response.status_code != HTTPStatus.OK: + raise ValueError(f"Failed to save trace data: {response.text}") return self.trace_id, trace_data @@ -369,17 +383,17 @@ def __init__(self, api_key: str): if not api_key: raise ValueError("Tracer must be configured with a Judgment API key") - self.api_key = api_key - self.client = JudgmentClient(judgment_api_key=api_key) - self.depth = 0 - self._current_trace: Optional[TraceClient] = None - self.initialized = True + self.api_key: str = api_key + self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key) + self.depth: int = 0 + self._current_trace: Optional[TraceClient] = None + self.initialized: bool = True @contextmanager - def trace(self, name: str = None) -> Generator[TraceClient, None, None]: + def trace(self, name: str, project_name: str = "default_project", overwrite: bool = False) -> Generator[TraceClient, None, None]: """Start a new trace context using a context manager""" trace_id = str(uuid.uuid4()) - trace = TraceClient(self, trace_id, name or "unnamed_trace") + trace = TraceClient(self, trace_id, name, project_name=project_name) prev_trace = self._current_trace self._current_trace = trace @@ -387,7 +401,7 @@ with trace.span(name or "unnamed_trace") as span: try: # Save the trace to the database to handle Evaluations' trace_id referential integrity - trace.save() + trace.save(empty_save=True, overwrite=overwrite) yield trace finally: self._current_trace = prev_trace @@ -398,16 +412,17 @@ def get_current_trace(self) -> Optional[TraceClient]: """ return self._current_trace - def observe(self, func=None, *, name=None): + def observe(self, func=None, *, name=None, span_type: SpanType = "span"): """ Decorator to trace function execution with detailed entry/exit information.
Args: func: The function to trace name: Optional custom name for the function + span_type: The type of span to use for this observation (default: "span") """ if func is None: - return lambda f: self.observe(f, name=name) + return lambda f: self.observe(f, name=name, span_type=span_type) if asyncio.iscoroutinefunction(func): @functools.wraps(func) @@ -415,7 +430,10 @@ async def async_wrapper(*args, **kwargs): if self._current_trace: span_name = name or func.__name__ - with self._current_trace.span(span_name) as span: + with self._current_trace.span(span_name, span_type=span_type) as span: + # Set the span type + span.span_type = span_type + # Record inputs span.record_input({ 'args': list(args), @@ -438,7 +456,10 @@ def wrapper(*args, **kwargs): if self._current_trace: span_name = name or func.__name__ - with self._current_trace.span(span_name) as span: + with self._current_trace.span(span_name, span_type=span_type) as span: + # Set the span type + span.span_type = span_type + # Record inputs span.record_input({ 'args': list(args), @@ -471,7 +492,7 @@ def traced_create(*args, **kwargs): if not (tracer and tracer._current_trace): return original_create(*args, **kwargs) - with tracer._current_trace.span(span_name) as span: + with tracer._current_trace.span(span_name, span_type="llm") as span: # Format and record the input parameters input_data = _format_input_data(client, **kwargs) span.record_input(input_data) diff --git a/judgeval/data/result.py b/judgeval/data/result.py index 9b9f4c1d..dc24c670 100644 --- a/judgeval/data/result.py +++ b/judgeval/data/result.py @@ -7,6 +7,7 @@ class ScoringResult: """ A ScoringResult contains the output of one or more scorers applied to a single example. + Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...) Args: success (bool): Whether the evaluation was successful. 
@@ -32,6 +33,9 @@ class ScoringResult: retrieval_context: Optional[List[str]] = None trace_id: Optional[str] = None + example_id: Optional[str] = None + eval_run_name: Optional[str] = None + def to_dict(self) -> dict: """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data.""" return { @@ -42,7 +46,8 @@ def to_dict(self) -> dict: "expected_output": self.expected_output, "context": self.context, "retrieval_context": self.retrieval_context, - "trace_id": self.trace_id + "trace_id": self.trace_id, + "example_id": self.example_id } def __str__(self) -> str: diff --git a/judgeval/data/scorer_data.py b/judgeval/data/scorer_data.py index 787bc6c4..85272f7f 100644 --- a/judgeval/data/scorer_data.py +++ b/judgeval/data/scorer_data.py @@ -76,7 +76,7 @@ def create_scorer_data(scorer: CustomScorer) -> ScorerData: score=scorer.score, threshold=scorer.threshold, reason=scorer.reason, - success=scorer.success_check(), + success=scorer._success_check(), strict_mode=scorer.strict_mode, evaluation_model=scorer.evaluation_model, error=None, diff --git a/judgeval/evaluation_run.py b/judgeval/evaluation_run.py index a731c581..a7fad1de 100644 --- a/judgeval/evaluation_run.py +++ b/judgeval/evaluation_run.py @@ -6,6 +6,8 @@ from judgeval.scorers import CustomScorer, JudgmentScorer from judgeval.constants import ACCEPTABLE_MODELS from judgeval.common.logger import debug, error + + class EvaluationRun(BaseModel): """ Stores example and evaluation scorers together for running an eval task @@ -33,6 +35,16 @@ class EvaluationRun(BaseModel): # API Key will be "" until user calls client.run_eval(), then API Key will be set judgment_api_key: Optional[str] = "" + def model_dump(self, **kwargs): + data = super().model_dump(**kwargs) + + data["scorers"] = [ + scorer.to_dict() \ + if hasattr(scorer, "to_dict") else {"score_type": scorer.score_type, "threshold": scorer.threshold} + for scorer in self.scorers + ] + return data + @field_validator('log_results', mode='before') def validate_log_results(cls, v): if not isinstance(v, bool): diff --git a/judgeval/judgment_client.py b/judgeval/judgment_client.py index fe4636a1..d8190ceb 100644 --- a/judgeval/judgment_client.py +++ b/judgeval/judgment_client.py @@ -129,20 +129,34 @@ def pull_dataset(self, alias: str) -> EvalDataset: return dataset # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend - def pull_eval(self, project_name: str, eval_run_name: str) -> List[ScoringResult]: + def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]: + """Pull evaluation results from the server. 
+ + Args: + project_name (str): Name of the project + eval_run_name (str): Name of the evaluation run + + Returns: + List[Dict[str, Union[str, List[ScoringResult]]]]: List of dictionaries, each containing: + - id (str): The evaluation run ID + - results (List[ScoringResult]): List of scoring results + """ eval_run_request_body = EvalRunRequestBody(project_name=project_name, eval_name=eval_run_name, judgment_api_key=self.judgment_api_key) - eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL, + eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL, json=eval_run_request_body.model_dump()) if eval_run.status_code != requests.codes.ok: raise ValueError(f"Error fetching eval results: {eval_run.json()}") - eval_results = [] + + eval_run_result = [] for result in eval_run.json(): - result = result.get("result", dict()) - filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__} - eval_results.append(ScoringResult(**filtered_result)) - return eval_results + result_id = result.get("id", "") + result_data = result.get("result", dict()) + filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__} + entry = {"id": result_id, "results": [ScoringResult(**filtered_result)]} + eval_run_result.append(entry) + return eval_run_result def _validate_api_key(self): """ @@ -191,3 +205,37 @@ def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer: return ClassifierScorer(**scorer_config) except Exception as e: raise JudgmentAPIError(f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}") + + def push_classifier_scorer(self, scorer: ClassifierScorer, slug: str = None) -> str: + """ + Pushes a classifier scorer configuration to the Judgment API. + + Args: + slug (str): Slug identifier for the scorer. If it exists, the scorer will be updated. + scorer (ClassifierScorer): The classifier scorer to save + + Returns: + str: The slug identifier of the saved scorer + + Raises: + JudgmentAPIError: If there's an error saving the scorer + """ + request_body = { + "name": scorer.name, + "conversation": [m.model_dump() for m in scorer.conversation], + "options": scorer.options, + "judgment_api_key": self.judgment_api_key, + "slug": slug + } + + response = requests.post( + f"{ROOT_API}/save_scorer/", + json=request_body + ) + + if response.status_code == 500: + raise JudgmentAPIError(f"The server is temporarily unavailable. Please try your request again in a few moments.
Error details: {response.json().get('detail', '')}") + elif response.status_code != 200: + raise JudgmentAPIError(f"Failed to save classifier scorer: {response.json().get('detail', '')}") + + return response.json()["slug"] \ No newline at end of file diff --git a/judgeval/playground.py b/judgeval/playground.py index c5d065c6..907ad24e 100644 --- a/judgeval/playground.py +++ b/judgeval/playground.py @@ -15,7 +15,6 @@ from judgeval.judges.utils import create_judge from judgeval.scorers.custom_scorer import CustomScorer from judgeval.scorers.score import * -from judgeval.common.telemetry import capture_metric_type """ Testing implementation of CustomFaithfulness @@ -195,22 +194,21 @@ def metric_progress_indicator( total: int = 9999, transient: bool = True, ): - with capture_metric_type(metric.__name__): - console = Console(file=sys.stderr) # Direct output to standard error - if _show_indicator: - with Progress( - SpinnerColumn(style="rgb(106,0,255)"), - TextColumn("[progress.description]{task.description}"), - console=console, # Use the custom console - transient=transient, - ) as progress: - progress.add_task( - description=scorer_console_msg(metric, async_mode), - total=total, - ) - yield - else: + console = Console(file=sys.stderr) # Direct output to standard error + if _show_indicator: + with Progress( + SpinnerColumn(style="rgb(106,0,255)"), + TextColumn("[progress.description]{task.description}"), + console=console, # Use the custom console + transient=transient, + ) as progress: + progress.add_task( + description=scorer_console_msg(metric, async_mode), + total=total, + ) yield + else: + yield def prettify_list(lst: List[Any]): @@ -568,7 +566,7 @@ def _calculate_score(self) -> float: score = faithfulness_count / number_of_verdicts return 0 if self.strict_mode and score < self.threshold else score - def success_check(self) -> bool: + def _success_check(self) -> bool: if self.error is not None: self.success = False else: diff --git a/judgeval/run_evaluation.py b/judgeval/run_evaluation.py index c32676cf..42b4e67a 100644 --- a/judgeval/run_evaluation.py +++ b/judgeval/run_evaluation.py @@ -47,7 +47,8 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]: try: # submit API request to execute evals - response = requests.post(JUDGMENT_EVAL_API_URL, json=evaluation_run.model_dump(warnings=False)) + payload = evaluation_run.model_dump(warnings=False) + response = requests.post(JUDGMENT_EVAL_API_URL, json=payload) response_data = response.json() except Exception as e: error(f"Error: {e}") diff --git a/judgeval/scorers/custom_scorer.py b/judgeval/scorers/custom_scorer.py index 75816e7d..d21e47ee 100644 --- a/judgeval/scorers/custom_scorer.py +++ b/judgeval/scorers/custom_scorer.py @@ -101,7 +101,7 @@ async def a_score_example(self, example, *args, **kwargs) -> float: raise NotImplementedError("You must implement the `a_score` method in your custom scorer") @abstractmethod - def success_check(self) -> bool: + def _success_check(self) -> bool: """ For unit testing, determines whether the test case passes or fails """ diff --git a/judgeval/scorers/judgeval_scorers/json_correctness.py b/judgeval/scorers/judgeval_scorers/json_correctness.py index 0ea2c6dc..98585731 100644 --- a/judgeval/scorers/judgeval_scorers/json_correctness.py +++ b/judgeval/scorers/judgeval_scorers/json_correctness.py @@ -20,6 +20,13 @@ def __init__(self, threshold: float, json_schema: BaseModel): super().__init__(threshold=threshold, score_type=APIScorer.JSON_CORRECTNESS) object.__setattr__(self, 'json_schema', 
json_schema) + def to_dict(self): + return { + "score_type": self.score_type, + "threshold": self.threshold, + "kwargs": {"json_schema": self.json_schema.model_json_schema()} + } + @property def __name__(self): return "JSON Correctness" diff --git a/judgeval/scorers/prompt_scorer.py b/judgeval/scorers/prompt_scorer.py index b1829afe..fb996a96 100644 --- a/judgeval/scorers/prompt_scorer.py +++ b/judgeval/scorers/prompt_scorer.py @@ -49,8 +49,8 @@ class PromptScorer(CustomScorer, BaseModel): using_native_model: bool = Field(default=True) # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD - response: Optional[dict] = None - result: Optional[float] = None + _response: Optional[dict] = None + _result: Optional[float] = None def __init__( self, @@ -100,11 +100,11 @@ def score_example( else: result, reason = self.evaluate(example) self.reason = reason - self.result = result + self._result = result self.verbose_logs = create_verbose_logs( self, steps=[ - f"Results: {self.result}\nReason: {self.reason}", + f"Results: {self._result}\nReason: {self.reason}", ], ) return result @@ -120,11 +120,11 @@ async def a_score_example( with scorer_progress_meter(self, display_meter=_show_indicator): result, reason = await self.a_evaluate(example) self.reason = reason - self.result = result + self._result = result self.verbose_logs = create_verbose_logs( self, steps=[ - f"Results: {self.result}\nReason: {self.reason}", + f"Results: {self._result}\nReason: {self.reason}", ], ) return result @@ -138,11 +138,11 @@ def evaluate(self, example: Example) -> Tuple[Any, str]: NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field. """ - prompt = self.build_measure_prompt(example) + prompt = self._build_measure_prompt(example) if self.using_native_model: res = self.model.generate(prompt) response = parse_response_json(res, self) - result, reason = self.process_response(response) + result, reason = self._process_response(response) return result, reason else: raise NotImplementedError("Non-native judge models are not supported in synchronous mode yet.") @@ -156,25 +156,25 @@ async def a_evaluate(self, example: Example) -> Tuple[Any, str]: NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field. """ - judge_prompt = self.build_measure_prompt(example) - schema = self.build_schema() - prompt = self.enforce_prompt_format(judge_prompt=judge_prompt, schema=schema) + judge_prompt = self._build_measure_prompt(example) + schema = self._build_schema() + prompt = self._enforce_prompt_format(judge_prompt=judge_prompt, schema=schema) if self.using_native_model: res = await self.model.a_generate(prompt) response = parse_response_json(res, self) - self.response = response + self._response = response - result, reason = self.process_response(response) + result, reason = self._process_response(response) self.score = result self.reason = reason - self.response = response + self._response = response return result, reason else: raise NotImplementedError("Non-native judge models are not supported in async mode yet.") # TODO: can we make this take *args and **kwargs? 
How does that work with a_evaluate() since we'd have to pass the same args @abstractmethod - def build_measure_prompt(self, example: Example) -> List[dict]: + def _build_measure_prompt(self, example: Example) -> List[dict]: # builds the prompt that is sent to the model inside of the `score_example()` method # returns either a string prompt or a conversation prompt of the form [{"role": "system", "content": "..."}, ...] @@ -197,7 +197,7 @@ def build_measure_prompt(self, example: Example) -> List[dict]: # TODO: does this need to take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args @abstractmethod - def build_schema(self) -> dict: + def _build_schema(self) -> dict: """ This function returns a dictionary that represents the schema of the JSON response that the judge model should return. @@ -208,7 +208,7 @@ def build_schema(self) -> dict: """ pass - def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict): + def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict): """ Formats the final prompt to the judge model. @@ -248,7 +248,7 @@ def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict): raise TypeError(f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead.") @abstractmethod - def process_response(self, response: dict): + def _process_response(self, response: dict): """ Customizable method for processing the response from the judge model. @@ -264,7 +264,7 @@ def process_response(self, response: dict): pass @abstractmethod - def success_check(self, **kwargs) -> bool: + def _success_check(self, **kwargs) -> bool: """ Determines whether or not the PromptScorer should consider the evaluation of a single example successful. """ @@ -320,7 +320,16 @@ def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapp verbose_mode=verbose_mode, ) - def build_measure_prompt(self, example: Example) -> List[dict]: + def _build_measure_prompt(self, example: Example) -> List[dict]: + """ + Builds the measure prompt for the classifier scorer. + + Args: + example (Example): The example to build the prompt for + + Returns: + List[dict]: The measure prompt for the classifier scorer + """ replacement_words = { "{{actual_output}}": example.actual_output, "{{expected_output}}": example.expected_output, @@ -341,10 +350,10 @@ def build_measure_prompt(self, example: Example) -> List[dict]: message["content"] = content.replace(key, str(value)) return conversation_copy - def build_schema(self) -> dict: + def _build_schema(self) -> dict: return self.options - def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]: + def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]: """ Enforces the judge model to choose an option from the schema. @@ -369,15 +378,45 @@ def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[ judge_prompt[0]["content"] = system_role return judge_prompt - def process_response(self, response: dict) -> Tuple[float, str]: + def _process_response(self, response: dict) -> Tuple[float, str]: choice = response.get("choice") if choice not in self.options: raise ValueError(f"Invalid choice: {choice}. 
Expected one of: {self.options.keys()}") reason = response.get("reason", "No reason could be found in model response.") return self.options[choice], reason - def success_check(self, **kwargs) -> bool: + def _success_check(self, **kwargs) -> bool: return self.score >= self.threshold + + def update_name(self, name: str): + """ + Updates the name of the scorer. + """ + self.name = name + + def update_threshold(self, threshold: float): + """ + Updates the threshold of the scorer. + """ + self.threshold = threshold + + def update_conversation(self, conversation: List[dict]): + """ + Updates the conversation with the new conversation. + + Sample conversation: + [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}] + """ + self.conversation = conversation + + def update_options(self, options: Mapping[str, float]): + """ + Updates the options with the new options. + + Sample options: + {"yes": 1, "no": 0} + """ + self.options = options def __str__(self): return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})" diff --git a/judgeval/scorers/score.py b/judgeval/scorers/score.py index b16352e8..9878bc80 100644 --- a/judgeval/scorers/score.py +++ b/judgeval/scorers/score.py @@ -18,7 +18,6 @@ ) from judgeval.scorers import CustomScorer from judgeval.scorers.utils import clone_scorers, scorer_console_msg -from judgeval.common.telemetry import capture_evaluation_run from judgeval.common.exceptions import MissingTestCaseParamsError from judgeval.common.logger import example_logging_context, debug, error, warning, info from judgeval.judges import judgevalJudge @@ -312,36 +311,9 @@ async def execute_with_semaphore(func: Callable, *args, **kwargs): debug(f"Scorer threshold: {scorer.threshold}") if hasattr(scorer, 'model'): debug(f"Scorer model: {type(scorer.model).__name__}") - with capture_evaluation_run("Example"): - if isinstance(ex, Example): - if len(scorers) == 0: - pbar.update(1) - continue - - cloned_scorers: List[CustomScorer] = clone_scorers( - scorers - ) - task = execute_with_semaphore( - func=a_eval_examples_helper, - scorers=cloned_scorers, - example=ex, - scoring_results=scoring_results, - score_index=i, - ignore_errors=ignore_errors, - skip_on_missing_params=skip_on_missing_params, - show_indicator=show_indicator, - _use_bar_indicator=_use_bar_indicator, - pbar=pbar, - ) - tasks.append(asyncio.create_task(task)) - - await asyncio.sleep(throttle_value) - await asyncio.gather(*tasks) - else: - for i, ex in enumerate(examples): - with capture_evaluation_run("Example"): if isinstance(ex, Example): if len(scorers) == 0: + pbar.update(1) continue cloned_scorers: List[CustomScorer] = clone_scorers( @@ -355,12 +327,37 @@ async def execute_with_semaphore(func: Callable, *args, **kwargs): score_index=i, ignore_errors=ignore_errors, skip_on_missing_params=skip_on_missing_params, - _use_bar_indicator=_use_bar_indicator, show_indicator=show_indicator, + _use_bar_indicator=_use_bar_indicator, + pbar=pbar, ) - tasks.append(asyncio.create_task((task))) + tasks.append(asyncio.create_task(task)) await asyncio.sleep(throttle_value) + await asyncio.gather(*tasks) + else: + for i, ex in enumerate(examples): + if isinstance(ex, Example): + if len(scorers) == 0: + continue + + cloned_scorers: List[CustomScorer] = clone_scorers( + scorers + ) + task = execute_with_semaphore( + func=a_eval_examples_helper, + scorers=cloned_scorers, + example=ex, + 
scoring_results=scoring_results, + score_index=i, + ignore_errors=ignore_errors, + skip_on_missing_params=skip_on_missing_params, + _use_bar_indicator=_use_bar_indicator, + show_indicator=show_indicator, + ) + tasks.append(asyncio.create_task((task))) + + await asyncio.sleep(throttle_value) await asyncio.gather(*tasks) return scoring_results diff --git a/tests/common/test_tracer.py b/tests/common/test_tracer.py index cb6abd61..edc58197 100644 --- a/tests/common/test_tracer.py +++ b/tests/common/test_tracer.py @@ -149,17 +149,21 @@ def test_record_input_output(trace_client): def test_condense_trace(trace_client): """Test trace condensing functionality""" + # Store the base depth from the enter event + base_depth = 0 entries = [ - {"type": "enter", "function": "test_func", "depth": 0, "timestamp": 1.0}, - {"type": "input", "function": "test_func", "depth": 1, "timestamp": 1.1, "inputs": {"x": 1}}, - {"type": "output", "function": "test_func", "depth": 1, "timestamp": 1.2, "output": "result"}, - {"type": "exit", "function": "test_func", "depth": 0, "timestamp": 2.0}, + {"type": "enter", "function": "test_func", "depth": base_depth, "timestamp": 1.0}, + {"type": "input", "function": "test_func", "depth": base_depth + 1, "timestamp": 1.1, "inputs": {"x": 1}}, + {"type": "output", "function": "test_func", "depth": base_depth + 1, "timestamp": 1.2, "output": "result"}, + {"type": "exit", "function": "test_func", "depth": base_depth, "timestamp": 2.0}, ] condensed = trace_client.condense_trace(entries) + print(f"{condensed=}") + # Test that the condensed entry's depth matches the enter event's depth assert len(condensed) == 1 assert condensed[0]["function"] == "test_func" - assert condensed[0]["depth"] == 1 + assert condensed[0]["depth"] == entries[0]["depth"] # Should match the input event's depth assert condensed[0]["inputs"] == {"x": 1} assert condensed[0]["output"] == "result" assert condensed[0]["duration"] == 1.0 @@ -167,50 +171,35 @@ def test_condense_trace(trace_client): @patch('requests.post') def test_save_trace(mock_post, trace_client): """Test saving trace data""" - mock_post.return_value.raise_for_status = Mock() + # Configure mock response properly + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = '{"message": "success"}' + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response with trace_client.span("test_span"): trace_client.record_input({"arg": 1}) trace_client.record_output("result") trace_id, data = trace_client.save() - assert mock_post.called assert data["trace_id"] == trace_client.trace_id - assert data["name"] == "test_trace" - assert len(data["entries"]) > 0 - assert isinstance(data["created_at"], str) - assert isinstance(data["duration"], float) - -def test_observe_decorator(tracer): - """Test the @tracer.observe decorator""" - @tracer.observe - def test_function(x, y): - return x + y - - with tracer.trace("test_trace"): - result = test_function(1, 2) - - assert result == 3 - -def test_observe_decorator_with_error(tracer): - """Test decorator error handling""" - @tracer.observe - def failing_function(): - raise ValueError("Test error") - - with tracer.trace("test_trace"): - with pytest.raises(ValueError): - failing_function() @patch('requests.post') def test_wrap_openai(mock_post, tracer): """Test wrapping OpenAI client""" + # Configure mock response properly + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = '{"message": "success"}' + mock_post.return_value = mock_response 
+ client = OpenAI() - mock_response = MagicMock() - mock_response.choices = [MagicMock(message=MagicMock(content="test response"))] - mock_response.usage = MagicMock(prompt_tokens=10, completion_tokens=20, total_tokens=30) - client.chat.completions.create = MagicMock(return_value=mock_response) + mock_completion = MagicMock() + mock_completion.choices = [MagicMock(message=MagicMock(content="test response"))] + mock_completion.usage = MagicMock(prompt_tokens=10, completion_tokens=20, total_tokens=30) + client.chat.completions.create = MagicMock(return_value=mock_completion) wrapped_client = wrap(client) @@ -220,16 +209,22 @@ def test_wrap_openai(mock_post, tracer): messages=[{"role": "user", "content": "test"}] ) - assert response == mock_response + assert response == mock_completion @patch('requests.post') def test_wrap_anthropic(mock_post, tracer): """Test wrapping Anthropic client""" + # Configure mock response properly + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = '{"message": "success"}' + mock_post.return_value = mock_response + client = Anthropic() - mock_response = MagicMock() - mock_response.content = [MagicMock(text="test response")] - mock_response.usage = MagicMock(input_tokens=10, output_tokens=20) - client.messages.create = MagicMock(return_value=mock_response) + mock_completion = MagicMock() + mock_completion.content = [MagicMock(text="test response")] + mock_completion.usage = MagicMock(input_tokens=10, output_tokens=20) + client.messages.create = MagicMock(return_value=mock_completion) wrapped_client = wrap(client) @@ -239,7 +234,7 @@ def test_wrap_anthropic(mock_post, tracer): messages=[{"role": "user", "content": "test"}] ) - assert response == mock_response + assert response == mock_completion def test_wrap_unsupported_client(tracer): """Test wrapping unsupported client type""" @@ -266,3 +261,24 @@ def test_tracer_invalid_api_key(mocker): with pytest.raises(JudgmentAPIError, match="Issue with passed in Judgment API key: API key is invalid"): Tracer(api_key="invalid_key") + +def test_observe_decorator(tracer): + """Test the @tracer.observe decorator""" + @tracer.observe + def test_function(x, y): + return x + y + + with tracer.trace("test_trace"): + result = test_function(1, 2) + + assert result == 3 + +def test_observe_decorator_with_error(tracer): + """Test decorator error handling""" + @tracer.observe + def failing_function(): + raise ValueError("Test error") + + with tracer.trace("test_trace"): + with pytest.raises(ValueError): + failing_function() diff --git a/tests/data/test_scorer_data.py b/tests/data/test_scorer_data.py index 1f1e7829..a9ea1dc9 100644 --- a/tests/data/test_scorer_data.py +++ b/tests/data/test_scorer_data.py @@ -44,7 +44,7 @@ def score_example(self, example, *args, **kwargs): async def a_score_example(self, example, *args, **kwargs): pass - def success_check(self) -> bool: + def _success_check(self) -> bool: return self.score >= self.threshold if self.score is not None else False diff --git a/tests/scorers/test_custom_scorer.py b/tests/scorers/test_custom_scorer.py index c01b12a9..6cf4e7ef 100644 --- a/tests/scorers/test_custom_scorer.py +++ b/tests/scorers/test_custom_scorer.py @@ -29,7 +29,7 @@ def score_example(self, example, *args, **kwargs) -> float: async def a_score_example(self, example, *args, **kwargs) -> float: return 0.9 - def success_check(self) -> bool: + def _success_check(self) -> bool: return self.score >= self.threshold if self.score is not None else False @pytest.fixture @@ -118,15 +118,15 @@ 
def test_success_check_implementation(self, basic_scorer): """Test success_check with various scores""" # Test with score above threshold basic_scorer.score = 0.8 - assert basic_scorer.success_check() is True + assert basic_scorer._success_check() is True # Test with score below threshold basic_scorer.score = 0.6 - assert basic_scorer.success_check() is False + assert basic_scorer._success_check() is False # Test with no score basic_scorer.score = None - assert basic_scorer.success_check() is False + assert basic_scorer._success_check() is False def test_str_representation(self, basic_scorer): """Test string representation of scorer""" @@ -149,4 +149,4 @@ class IncompleteScorer(CustomScorer): asyncio.run(scorer.a_score_example({})) with pytest.raises(NotImplementedError): - scorer.success_check() + scorer._success_check() diff --git a/tests/scorers/test_prompt_scorer.py b/tests/scorers/test_prompt_scorer.py index e5e7e9ed..7c50e195 100644 --- a/tests/scorers/test_prompt_scorer.py +++ b/tests/scorers/test_prompt_scorer.py @@ -35,20 +35,20 @@ def __init__(self, mock_model, *args, **kwargs): super().__init__(*args, **kwargs) self.model = mock_model - def build_measure_prompt(self, example: Example) -> List[dict]: + def _build_measure_prompt(self, example: Example) -> List[dict]: return [ {"role": "system", "content": "Test system prompt"}, {"role": "user", "content": f"Response: {example.actual_output}"} ] - def build_schema(self) -> dict: + def _build_schema(self) -> dict: return {"score": float, "reason": str} - def process_response(self, response: dict): + def _process_response(self, response: dict): return response["score"], response["reason"] - def success_check(self, **kwargs) -> bool: - return self.result >= self.threshold + def _success_check(self, **kwargs) -> bool: + return self._result >= self.threshold # Tests for PromptScorer class TestPromptScorer: @@ -68,7 +68,7 @@ def test_enforce_prompt_format(self, mock_model): prompt = [{"role": "system", "content": "Base prompt"}] schema = {"score": float, "reason": str} - formatted = scorer.enforce_prompt_format(prompt, schema) + formatted = scorer._enforce_prompt_format(prompt, schema) assert "JSON format" in formatted[0]["content"] assert '"score": (float)' in formatted[0]["content"] assert '"reason": (str)' in formatted[0]["content"] @@ -76,7 +76,7 @@ def test_enforce_prompt_format(self, mock_model): def test_enforce_prompt_format_invalid_input(self, mock_model): scorer = SampleScorer(name="test_scorer", mock_model=mock_model) with pytest.raises(TypeError): - scorer.enforce_prompt_format("invalid", {}) + scorer._enforce_prompt_format("invalid", {}) @pytest.mark.asyncio async def test_a_score_example(self, example, mock_model): @@ -124,7 +124,7 @@ def test_build_measure_prompt(self, example, classifier_conversation, classifier options=classifier_options ) - prompt = scorer.build_measure_prompt(example) + prompt = scorer._build_measure_prompt(example) assert "This is a test response" in prompt[0]["content"] def test_process_response(self, classifier_conversation, classifier_options): @@ -136,7 +136,7 @@ def test_process_response(self, classifier_conversation, classifier_options): ) response = {"choice": "positive", "reason": "Test reason"} - score, reason = scorer.process_response(response) + score, reason = scorer._process_response(response) assert score == 1.0 assert reason == "Test reason" @@ -150,7 +150,7 @@ def test_process_response_invalid_choice(self, classifier_conversation, classifi response = {"choice": "invalid", "reason": 
"Test reason"} with pytest.raises(ValueError): - scorer.process_response(response) + scorer._process_response(response) def test_success_check(self, classifier_conversation, classifier_options): scorer = ClassifierScorer( @@ -161,7 +161,7 @@ def test_success_check(self, classifier_conversation, classifier_options): ) scorer.score = 1.0 - assert scorer.success_check() is True + assert scorer._success_check() is True scorer.score = 0.0 - assert scorer.success_check() is False + assert scorer._success_check() is False diff --git a/tests/scorers/test_score.py b/tests/scorers/test_score.py index 08354fd9..500412e2 100644 --- a/tests/scorers/test_score.py +++ b/tests/scorers/test_score.py @@ -20,7 +20,7 @@ def score_example(self, example, *args, **kwargs): async def a_score_example(self, example, *args, **kwargs): pass - def success_check(self): + def _success_check(self): return True @@ -798,7 +798,7 @@ def mock_scorer(): scorer.evaluation_model = "test-model" scorer.score = 0.9 scorer.reason = "Test reason" - scorer.success_check.return_value = True + scorer._success_check.return_value = True scorer.evaluation_cost = 0.1 scorer.verbose_logs = "Test logs" scorer.additional_metadata = {"key": "value"} diff --git a/tests/scorers/test_scorer_utils.py b/tests/scorers/test_scorer_utils.py index c10ac0a6..d355cff0 100644 --- a/tests/scorers/test_scorer_utils.py +++ b/tests/scorers/test_scorer_utils.py @@ -33,7 +33,7 @@ def score_example(self, example: Example, *args, **kwargs) -> float: async def a_score_example(self, example: Example, *args, **kwargs) -> float: return 1.0 - def success_check(self) -> bool: + def _success_check(self) -> bool: return True