
Commit 7db30d1
Merge branch 'staging' into az-trace-datasets
2 parents 669895b + 30addd8

File tree: 11 files changed (+120, -56 lines)

.github/workflows/blocked-pr.yaml

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+name: Check Blocked PR
+
+on:
+  pull_request:
+    types:
+      - opened
+      - labeled
+      - unlabeled
+
+jobs:
+  fail-for-blocked:
+    if: contains(github.event.pull_request.labels.*.name, 'Blocked')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Fail if PR is blocked
+        run: |
+          echo "This PR is currently blocked. Please unblock it before merging."
+          exit 1
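Behavior note: the fail-for-blocked job only materializes when the PR carries the 'Blocked' label (the `if:` guard over the labels array), and it then fails unconditionally via `exit 1`. The `labeled` and `unlabeled` triggers re-run the check as labels change, so the failure clears once the label is removed. To actually prevent merging, the check would still need to be marked required in branch protection; that is a repository-settings assumption, not part of this diff.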

README.md

Lines changed: 24 additions & 15 deletions
@@ -9,11 +9,11 @@

 <br>

-## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
+## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://docs.judgmentlabs.ai/introduction) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)

 [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
 [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
-[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/FMxHkYTtFE)
+[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)

 </div>

@@ -28,19 +28,28 @@ We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and a
 Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

 ## 📋 Table of Contents
-* [✨ Features](#-features)
-* [🔍 Tracing](#-tracing)
-* [🧪 Evals](#-evals)
-* [📡 Monitoring](#-monitoring)
-* [📊 Datasets](#-datasets)
-* [💡 Insights](#-insights)
-* [🛠️ Installation](#️-installation)
-* [🏁 Get Started](#-get-started)
-* [🏢 Self-Hosting](#-self-hosting)
-* [📚 Cookbooks](#-cookbooks)
-* [💻 Development with Cursor](#-development-with-cursor)
-* [⭐ Star Us on GitHub](#-star-us-on-github)
-* [❤️ Contributors](#️-contributors)
+- [🌐 Landing Page • 📚 Docs • 🚀 Demos](#-landing-page----docs---demos)
+- [Judgeval: open-source testing, monitoring, and optimization for AI agents](#judgeval-open-source-testing-monitoring-and-optimization-for-ai-agents)
+- [📋 Table of Contents](#-table-of-contents)
+- [✨ Features](#-features)
+- [🛠️ Installation](#️-installation)
+- [🏁 Get Started](#-get-started)
+- [🛰️ Tracing](#️-tracing)
+- [📝 Offline Evaluations](#-offline-evaluations)
+- [📡 Online Evaluations](#-online-evaluations)
+- [🏢 Self-Hosting](#-self-hosting)
+- [Key Features](#key-features)
+- [Getting Started](#getting-started)
+- [📚 Cookbooks](#-cookbooks)
+- [Sample Agents](#sample-agents)
+- [💰 LangGraph Financial QA Agent](#-langgraph-financial-qa-agent)
+- [✈️ OpenAI Travel Agent](#️-openai-travel-agent)
+- [Custom Evaluators](#custom-evaluators)
+- [🔍 PII Detection](#-pii-detection)
+- [📧 Cold Email Generation](#-cold-email-generation)
+- [💻 Development with Cursor](#-development-with-cursor)
+- [⭐ Star Us on GitHub](#-star-us-on-github)
+- [❤️ Contributors](#️-contributors)

 <!-- Created by https://github.com/ekalinin/github-markdown-toc -->

src/demo/eval_test.py

Lines changed: 6 additions & 6 deletions
@@ -1,6 +1,6 @@
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data.example import Example
-from judgeval.scorers import AnswerRelevancyScorer
+from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer
 from judgeval.common.tracer import Tracer

 judgment = JudgmentClient()
@@ -9,7 +9,7 @@
 qa_pairs = [
     ("What is the capital of France?", "Paris"),
     ("What is the largest planet in our solar system?", "Jupiter"),
-    # ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
+    ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
     # ("What is the chemical symbol for gold?", "Au"),
     # ("What is the square root of 144?", "12"),
     # ("Who painted the Mona Lisa?", "Leonardo da Vinci"),
@@ -61,10 +61,10 @@

 # Create a list of Example objects
 examples = [Example(input=question, actual_output=answer) for question, answer in qa_pairs]
-for example in examples:
-    print(example.model_dump())
+
+
 judgment.run_evaluation(
     examples=examples,
-    scorers=[AnswerRelevancyScorer(threshold=0.5)],
-    append=True
+    scorers=[AnswerRelevancyScorer(threshold=0.5), FaithfulnessScorer(threshold=0.5)],
+    override=True
 )
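For readers skimming the hunks, the demo script after this change boils down to the following sketch (assembled only from the lines shown above; it assumes a Judgment API key is already configured for JudgmentClient, which the diff does not show):

from judgeval.judgment_client import JudgmentClient
from judgeval.data.example import Example
from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer

judgment = JudgmentClient()

qa_pairs = [
    ("What is the capital of France?", "Paris"),
    ("What is the largest planet in our solar system?", "Jupiter"),
    ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
]

# Each (question, answer) pair becomes an Example with input/actual_output fields.
examples = [Example(input=q, actual_output=a) for q, a in qa_pairs]

judgment.run_evaluation(
    examples=examples,
    scorers=[AnswerRelevancyScorer(threshold=0.5), FaithfulnessScorer(threshold=0.5)],
    # Switched from append=True: override presumably replaces a same-named
    # prior run (run_trace_eval below checks for duplicate run names).
    override=True,
)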

src/e2etests/test_all_scorers.py

Lines changed: 3 additions & 2 deletions
@@ -26,7 +26,7 @@
 )

 from judgeval.data import Example
-
+from judgeval.data.example import ExampleParams

 def test_ac_scorer(client: JudgmentClient):

@@ -682,7 +682,8 @@ def _success_check(self, **kwargs) -> bool:
             threshold=0.5, # Expect positive sentiment (3 or higher on 1-5 scale)
             include_reason=True,
             strict_mode=False,
-            verbose_mode=True
+            verbose_mode=True,
+            required_params=[ExampleParams.INPUT, ExampleParams.ACTUAL_OUTPUT]
         )

     # Run evaluation

src/e2etests/test_eval_operations.py

Lines changed: 3 additions & 4 deletions
@@ -55,7 +55,7 @@ def run_eval_helper(self, client: JudgmentClient, project_name: str, eval_run_nam
         )

         scorer = FaithfulnessScorer(threshold=0.5)
-        scorer2 = HallucinationScorer(threshold=0.5)
+        scorer2 = AnswerRelevancyScorer(threshold=0.5)

         client.run_evaluation(
             examples=[example1, example2],
@@ -164,15 +164,14 @@ async def test_assert_test(self, client: JudgmentClient):
             actual_output="No, the room is too small.",
         )

-        scorer = FaithfulnessScorer(threshold=0.5)
-        scorer1 = AnswerRelevancyScorer(threshold=0.5)
+        scorer = AnswerRelevancyScorer(threshold=0.5)

         with pytest.raises(AssertionError):
             await client.assert_test(
                 eval_run_name="test_eval",
                 project_name="test_project",
                 examples=[example, example1, example2],
-                scorers=[scorer, scorer1],
+                scorers=[scorer],
                 model="Qwen/Qwen2.5-72B-Instruct-Turbo",
                 override=True
             )

src/judgeval/common/tracer.py

Lines changed: 34 additions & 21 deletions
@@ -5,7 +5,6 @@
 import asyncio
 import functools
 import inspect
-import json
 import os
 import site
 import sysconfig
@@ -16,6 +15,7 @@
 import warnings
 import contextvars
 import sys
+import json
 from contextlib import contextmanager, asynccontextmanager, AbstractAsyncContextManager, AbstractContextManager # Import context manager bases
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -29,20 +29,16 @@
     Literal,
     Optional,
     Tuple,
-    Type,
-    TypeVar,
     Union,
     AsyncGenerator,
     TypeAlias,
-    Set
 )
 from rich import print as rprint
-import types # <--- Add this import
+import types

 # Third-party imports
 import requests
 from litellm import cost_per_token as _original_cost_per_token
-from pydantic import BaseModel
 from rich import print as rprint
 from openai import OpenAI, AsyncOpenAI
 from together import Together, AsyncTogether
@@ -64,8 +60,7 @@
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.rules import Rule
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.result import ScoringResult
-from judgeval.common.utils import validate_api_key
+from judgeval.common.utils import ExcInfo, validate_api_key
 from judgeval.common.exceptions import JudgmentAPIError

 # Standard library imports needed for the new class
@@ -562,7 +557,7 @@ def record_usage(self, usage: TraceUsage):
         # Removed else block - original didn't have one
         return None # Return None if no span_id found

-    def record_error(self, error: Any):
+    def record_error(self, error: Dict[str, Any]):
         current_span_id = current_span_var.get()
         if current_span_id:
             span = self.span_id_to_span[current_span_id]
@@ -621,7 +616,7 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
     def delete(self):
         return self.trace_manager_client.delete_trace(self.trace_id)

-def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: Tuple[Optional[type], Optional[BaseException], Optional[types.TracebackType]]):
+def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: ExcInfo):
     if not current_trace:
         return

@@ -631,6 +626,27 @@ def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_inf
         "message": str(exc_value) if exc_value else "No exception message",
         "traceback": traceback.format_tb(exc_traceback_obj) if exc_traceback_obj else []
     }
+
+    # This is where we specially handle exceptions that we might want to collect additional data for.
+    # When we do this, always try checking the module from sys.modules instead of importing. This will
+    # let us support a wider range of exceptions without needing to import them for all clients.
+
+    # Most clients (requests, httpx, urllib) support the standard format of exposing error.request.url and error.response.status_code
+    # The alternative is to hand select libraries we want from sys.modules and check for them:
+    # As an example: requests_module = sys.modules.get("requests", None) // then do things with requests_module
+
+    # General HTTP-like errors
+    try:
+        url = getattr(getattr(exc_value, "request", None), "url", None)
+        status_code = getattr(getattr(exc_value, "response", None), "status_code", None)
+        if status_code:
+            formatted_exception["http"] = {
+                "url": url if url else "Unknown URL",
+                "status_code": status_code if status_code else None,
+            }
+    except Exception as e:
+        pass
+
     current_trace.record_error(formatted_exception)
 class _DeepTracer:
     _instance: Optional["_DeepTracer"] = None
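The new block relies on duck typing rather than imports: any exception that exposes request.url and response.status_code (the attribute shape shared by requests, httpx, and urllib-style clients) gets an "http" section in the recorded error, without the tracer ever importing those libraries. A minimal self-contained sketch of that extraction, using a stand-in exception class rather than a real HTTP client:

# Stand-in classes mimicking the attribute shape of requests/httpx errors.
class _FakeRequest:
    url = "https://api.example.com/v1/chat"

class _FakeResponse:
    status_code = 503

class FakeHTTPError(Exception):
    request = _FakeRequest()
    response = _FakeResponse()

exc_value = FakeHTTPError("service unavailable")

# Same nested getattr chain as the tracer: a missing attribute yields None
# instead of raising, so non-HTTP exceptions pass through untouched.
url = getattr(getattr(exc_value, "request", None), "url", None)
status_code = getattr(getattr(exc_value, "response", None), "status_code", None)

assert (url, status_code) == ("https://api.example.com/v1/chat", 503)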
@@ -1476,13 +1492,6 @@ def _format_and_record_output(span, response, is_streaming, is_async, is_respons
         span.record_usage(usage)
         return response

-    def _handle_error(span, e, is_async):
-        """Handle and record errors"""
-        call_type = "async" if is_async else "sync"
-        print(f"Error during wrapped {call_type} API call ({span_name}): {e}")
-        span.record_output({"error": str(e)})
-        raise
-
     # --- Traced Async Functions ---
     async def traced_create_async(*args, **kwargs):
         current_trace = current_trace_var.get()
@@ -1496,7 +1505,8 @@ async def traced_create_async(*args, **kwargs):
             response_or_iterator = await original_create(*args, **kwargs)
             return _format_and_record_output(span, response_or_iterator, is_streaming, True, False)
         except Exception as e:
-            return _handle_error(span, e, True)
+            _capture_exception_for_trace(span, sys.exc_info())
+            raise e

     # Async responses for OpenAI clients
     async def traced_response_create_async(*args, **kwargs):
@@ -1511,7 +1521,8 @@ async def traced_response_create_async(*args, **kwargs):
             response_or_iterator = await original_responses_create(*args, **kwargs)
             return _format_and_record_output(span, response_or_iterator, is_streaming, True, True)
         except Exception as e:
-            return _handle_error(span, e, True)
+            _capture_exception_for_trace(span, sys.exc_info())
+            raise e

     # Function replacing .stream() for async clients
     def traced_stream_async(*args, **kwargs):
@@ -1542,7 +1553,8 @@ def traced_create_sync(*args, **kwargs):
             response_or_iterator = original_create(*args, **kwargs)
             return _format_and_record_output(span, response_or_iterator, is_streaming, False, False)
         except Exception as e:
-            return _handle_error(span, e, False)
+            _capture_exception_for_trace(span, sys.exc_info())
+            raise e

     def traced_response_create_sync(*args, **kwargs):
         current_trace = current_trace_var.get()
@@ -1556,7 +1568,8 @@ def traced_response_create_sync(*args, **kwargs):
             response_or_iterator = original_responses_create(*args, **kwargs)
             return _format_and_record_output(span, response_or_iterator, is_streaming, False, True)
         except Exception as e:
-            return _handle_error(span, e, False)
+            _capture_exception_for_trace(span, sys.exc_info())
+            raise e

     # Function replacing sync .stream()
     def traced_stream_sync(*args, **kwargs):

src/judgeval/common/utils.py

Lines changed: 5 additions & 1 deletion
@@ -12,9 +12,10 @@
 import asyncio
 import concurrent.futures
 import os
+from types import TracebackType
 import requests
 import pprint
-from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+from typing import Any, Dict, List, Literal, Mapping, Optional, TypeAlias, Union

 # Third-party imports
 import litellm
@@ -782,3 +783,6 @@ async def aget_completion_multiple_models(models: List[str], messages: List[List
         ]
     ]
 ))
+
+ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
+OptExcInfo: TypeAlias = ExcInfo | tuple[None, None, None]
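The aliases mirror the shape of sys.exc_info(): inside an except block the tuple is fully populated (ExcInfo), while outside one it degrades to (None, None, None), which is the case OptExcInfo adds. A minimal sketch of the intended usage:

import sys

from judgeval.common.utils import ExcInfo

def describe(exc_info: ExcInfo) -> str:
    # Unpack the (type, value, traceback) triple captured by sys.exc_info().
    exc_type, exc_value, _tb = exc_info
    return f"{exc_type.__name__}: {exc_value}"

try:
    raise ValueError("boom")
except ValueError:
    # Inside an except block, sys.exc_info() is guaranteed to be populated.
    print(describe(sys.exc_info()))  # -> "ValueError: boom"

Worth noting: both TypeAlias and the `|` union syntax in these definitions evaluate at runtime, so this pins the module to Python 3.10+.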

src/judgeval/run_evaluation.py

Lines changed: 16 additions & 2 deletions
@@ -1,6 +1,7 @@
 import asyncio
 import requests
 import time
+import json
 import sys
 import itertools
 import threading
@@ -362,14 +363,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
+    prompt_user = False
     for scorer in scorers:
         for example in examples:
             missing_params = []
             for param in scorer.required_params:
                 if getattr(example, param.value) is None:
-                    missing_params.append(f"'{param.value}'")
+                    missing_params.append(f"{param.value}")
             if missing_params:
-                print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+                rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+                rprint(f"Missing parameters: {', '.join(missing_params)}")
+                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+                rprint("-"*40)
+                prompt_user = True
+
+    if prompt_user:
+        user_input = input("Do you want to continue? (y/n)")
+        if user_input.lower() != "y":
+            sys.exit(0)
+        else:
+            rprint("[green]Continuing...[/green]")

 def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -894,6 +907,7 @@ async def _async_evaluation_workflow():
                 f"Processing evaluation '{evaluation_run.eval_name}': "
             )
         else:
+            check_examples(evaluation_run.examples, evaluation_run.scorers)
             if judgment_scorers:
                 # Execute evaluation using Judgment API
                 info("Starting API evaluation")
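One consequence of the new prompt: check_examples now blocks on input() when required parameters are missing, which can hang non-interactive runs such as CI. A sketch of a pre-flight helper mirroring the same checks, so callers can validate examples before invoking an evaluation (missing_for is a hypothetical name, not part of this diff):

def missing_for(examples, scorers):
    # Mirror check_examples' loop, but collect the missing parameter names
    # keyed by scorer type instead of printing and prompting. The `or []`
    # guard is a defensive addition for scorers without required_params.
    problems = {}
    for scorer in scorers:
        for example in examples:
            missing = [
                param.value
                for param in (scorer.required_params or [])
                if getattr(example, param.value) is None
            ]
            if missing:
                problems.setdefault(scorer.score_type.value, []).append(missing)
    return problems

An empty result means the interactive prompt in check_examples will not fire.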

src/judgeval/scorers/judgeval_scorer.py

Lines changed: 4 additions & 1 deletion
@@ -12,7 +12,7 @@
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.constants import UNBOUNDED_SCORERS
-
+from judgeval.data.example import ExampleParams
 class JudgevalScorer:
     """
     Base class for scorers in `judgeval`.
@@ -39,6 +39,7 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None # The cost of running the scorer
     verbose_logs: Optional[str] = None # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+    required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
     error: Optional[str] = None
     success: Optional[bool] = None

@@ -51,6 +52,7 @@ def __init__(
         reason: Optional[str] = None,
         success: Optional[bool] = None,
         evaluation_model: Optional[str] = None,
+        required_params: Optional[List[ExampleParams]] = None,
         strict_mode: bool = False,
         async_mode: bool = True,
         verbose_mode: bool = True,
@@ -87,6 +89,7 @@ def __init__(
         self.evaluation_cost = evaluation_cost
         self.verbose_logs = verbose_logs
         self.additional_metadata = additional_metadata
+        self.required_params = required_params

     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
