
Commit 7db30d1
Merge branch 'staging' into az-trace-datasets
2 parents 669895b + 30addd8

File tree: 11 files changed (+120, -56 lines)

.github/workflows/blocked-pr.yaml

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+name: Check Blocked PR
+
+on:
+  pull_request:
+    types:
+      - opened
+      - labeled
+      - unlabeled
+
+jobs:
+  fail-for-blocked:
+    if: contains(github.event.pull_request.labels.*.name, 'Blocked')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Fail if PR is blocked
+        run: |
+          echo "This PR is currently blocked. Please unblock it before merging."
+          exit 1
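Behavior note: the fail-for-blocked job only materializes when the PR carries the 'Blocked' label (the `if:` guard over the labels array), and it then fails unconditionally via `exit 1`. The `labeled` and `unlabeled` triggers re-run the check as labels change, so the failure clears once the label is removed. To actually prevent merging, the check would still need to be marked required in branch protection; that is a repository-settings assumption, not part of this diff.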

README.md

Lines changed: 24 additions & 15 deletions
@@ -9,11 +9,11 @@

 <br>

-## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
+## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://docs.judgmentlabs.ai/introduction) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)

 [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
 [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
-[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/FMxHkYTtFE)
+[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)

 </div>

@@ -28,19 +28,28 @@ We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and a
 Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

 ## 📋 Table of Contents
-* [✨ Features](#-features)
-* [🔍 Tracing](#-tracing)
-* [🧪 Evals](#-evals)
-* [📡 Monitoring](#-monitoring)
-* [📊 Datasets](#-datasets)
-* [💡 Insights](#-insights)
-* [🛠️ Installation](#️-installation)
-* [🏁 Get Started](#-get-started)
-* [🏢 Self-Hosting](#-self-hosting)
-* [📚 Cookbooks](#-cookbooks)
-* [💻 Development with Cursor](#-development-with-cursor)
-* [⭐ Star Us on GitHub](#-star-us-on-github)
-* [❤️ Contributors](#️-contributors)
+- [🌐 Landing Page • 📚 Docs • 🚀 Demos](#-landing-page----docs---demos)
+- [Judgeval: open-source testing, monitoring, and optimization for AI agents](#judgeval-open-source-testing-monitoring-and-optimization-for-ai-agents)
+- [📋 Table of Contents](#-table-of-contents)
+- [✨ Features](#-features)
+- [🛠️ Installation](#️-installation)
+- [🏁 Get Started](#-get-started)
+- [🛰️ Tracing](#️-tracing)
+- [📝 Offline Evaluations](#-offline-evaluations)
+- [📡 Online Evaluations](#-online-evaluations)
+- [🏢 Self-Hosting](#-self-hosting)
+- [Key Features](#key-features)
+- [Getting Started](#getting-started)
+- [📚 Cookbooks](#-cookbooks)
+- [Sample Agents](#sample-agents)
+- [💰 LangGraph Financial QA Agent](#-langgraph-financial-qa-agent)
+- [✈️ OpenAI Travel Agent](#️-openai-travel-agent)
+- [Custom Evaluators](#custom-evaluators)
+- [🔍 PII Detection](#-pii-detection)
+- [📧 Cold Email Generation](#-cold-email-generation)
+- [💻 Development with Cursor](#-development-with-cursor)
+- [⭐ Star Us on GitHub](#-star-us-on-github)
+- [❤️ Contributors](#️-contributors)

 <!-- Created by https://github.com/ekalinin/github-markdown-toc -->

src/demo/eval_test.py

Lines changed: 6 additions & 6 deletions
@@ -1,6 +1,6 @@
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data.example import Example
-from judgeval.scorers import AnswerRelevancyScorer
+from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer
 from judgeval.common.tracer import Tracer

 judgment = JudgmentClient()
@@ -9,7 +9,7 @@
 qa_pairs = [
     ("What is the capital of France?", "Paris"),
     ("What is the largest planet in our solar system?", "Jupiter"),
-    # ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
+    ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
     # ("What is the chemical symbol for gold?", "Au"),
     # ("What is the square root of 144?", "12"),
     # ("Who painted the Mona Lisa?", "Leonardo da Vinci"),
@@ -61,10 +61,10 @@

 # Create a list of Example objects
 examples = [Example(input=question, actual_output=answer) for question, answer in qa_pairs]
-for example in examples:
-    print(example.model_dump())
+
+
 judgment.run_evaluation(
     examples=examples,
-    scorers=[AnswerRelevancyScorer(threshold=0.5)],
-    append=True
+    scorers=[AnswerRelevancyScorer(threshold=0.5), FaithfulnessScorer(threshold=0.5)],
+    override=True
 )
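For readers skimming the hunks, the demo script after this change boils down to the following sketch (assembled only from the lines shown above; it assumes a Judgment API key is already configured for JudgmentClient, which the diff does not show):

from judgeval.judgment_client import JudgmentClient
from judgeval.data.example import Example
from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer

judgment = JudgmentClient()

qa_pairs = [
    ("What is the capital of France?", "Paris"),
    ("What is the largest planet in our solar system?", "Jupiter"),
    ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
]

# Each (question, answer) pair becomes an Example with input/actual_output fields.
examples = [Example(input=q, actual_output=a) for q, a in qa_pairs]

judgment.run_evaluation(
    examples=examples,
    scorers=[AnswerRelevancyScorer(threshold=0.5), FaithfulnessScorer(threshold=0.5)],
    # Switched from append=True: override presumably replaces a same-named
    # prior run (run_trace_eval below checks for duplicate run names).
    override=True,
)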

src/e2etests/test_all_scorers.py

Lines changed: 3 additions & 2 deletions
@@ -26,7 +26,7 @@
 )

 from judgeval.data import Example
-
+from judgeval.data.example import ExampleParams

 def test_ac_scorer(client: JudgmentClient):

@@ -682,7 +682,8 @@ def _success_check(self, **kwargs) -> bool:
             threshold=0.5, # Expect positive sentiment (3 or higher on 1-5 scale)
             include_reason=True,
             strict_mode=False,
-            verbose_mode=True
+            verbose_mode=True,
+            required_params=[ExampleParams.INPUT, ExampleParams.ACTUAL_OUTPUT]
         )

     # Run evaluation

src/e2etests/test_eval_operations.py

Lines changed: 3 additions & 4 deletions
@@ -55,7 +55,7 @@ def run_eval_helper(self, client: JudgmentClient, project_name: str, eval_run_nam
         )

         scorer = FaithfulnessScorer(threshold=0.5)
-        scorer2 = HallucinationScorer(threshold=0.5)
+        scorer2 = AnswerRelevancyScorer(threshold=0.5)

         client.run_evaluation(
             examples=[example1, example2],
@@ -164,15 +164,14 @@ async def test_assert_test(self, client: JudgmentClient):
             actual_output="No, the room is too small.",
         )

-        scorer = FaithfulnessScorer(threshold=0.5)
-        scorer1 = AnswerRelevancyScorer(threshold=0.5)
+        scorer = AnswerRelevancyScorer(threshold=0.5)

         with pytest.raises(AssertionError):
             await client.assert_test(
                 eval_run_name="test_eval",
                 project_name="test_project",
                 examples=[example, example1, example2],
-                scorers=[scorer, scorer1],
+                scorers=[scorer],
                 model="Qwen/Qwen2.5-72B-Instruct-Turbo",
                 override=True
             )

src/judgeval/common/tracer.py

Lines changed: 34 additions & 21 deletions
@@ -5,7 +5,6 @@
 import asyncio
 import functools
 import inspect
-import json
 import os
 import site
 import sysconfig
@@ -16,6 +15,7 @@
 import warnings
 import contextvars
 import sys
+import json
 from contextlib import contextmanager, asynccontextmanager, AbstractAsyncContextManager, AbstractContextManager # Import context manager bases
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -29,20 +29,16 @@
     Literal,
     Optional,
     Tuple,
-    Type,
-    TypeVar,
     Union,
     AsyncGenerator,
     TypeAlias,
-    Set
 )
 from rich import print as rprint
-import types # <--- Add this import
+import types

 # Third-party imports
 import requests
 from litellm import cost_per_token as _original_cost_per_token
-from pydantic import BaseModel
 from rich import print as rprint
 from openai import OpenAI, AsyncOpenAI
 from together import Together, AsyncTogether
@@ -64,8 +60,7 @@
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.rules import Rule
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.result import ScoringResult
-from judgeval.common.utils import validate_api_key
+from judgeval.common.utils import ExcInfo, validate_api_key
 from judgeval.common.exceptions import JudgmentAPIError

 # Standard library imports needed for the new class
@@ -562,7 +557,7 @@ def record_usage(self, usage: TraceUsage):
         # Removed else block - original didn't have one
         return None # Return None if no span_id found

-    def record_error(self, error: Any):
+    def record_error(self, error: Dict[str, Any]):
         current_span_id = current_span_var.get()
         if current_span_id:
             span = self.span_id_to_span[current_span_id]
@@ -621,7 +616,7 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
     def delete(self):
         return self.trace_manager_client.delete_trace(self.trace_id)

-def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: Tuple[Optional[type], Optional[BaseException], Optional[types.TracebackType]]):
+def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: ExcInfo):
     if not current_trace:
         return

@@ -631,6 +626,27 @@ def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_inf
         "message": str(exc_value) if exc_value else "No exception message",
         "traceback": traceback.format_tb(exc_traceback_obj) if exc_traceback_obj else []
     }
+
+    # This is where we specially handle exceptions that we might want to collect additional data for.
+    # When we do this, always try checking the module from sys.modules instead of importing. This will
+    # let us support a wider range of exceptions without needing to import them for all clients.
+
+    # Most clients (requests, httpx, urllib) support the standard format of exposing error.request.url and error.response.status_code
+    # The alternative is to hand select libraries we want from sys.modules and check for them:
+    # As an example: requests_module = sys.modules.get("requests", None) // then do things with requests_module
+
+    # General HTTP-like errors
+    try:
+        url = getattr(getattr(exc_value, "request", None), "url", None)
+        status_code = getattr(getattr(exc_value, "response", None), "status_code", None)
+        if status_code:
+            formatted_exception["http"] = {
+                "url": url if url else "Unknown URL",
+                "status_code": status_code if status_code else None,
+            }
+    except Exception as e:
+        pass
+
     current_trace.record_error(formatted_exception)
 class _DeepTracer:
     _instance: Optional["_DeepTracer"] = None
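The new block relies on duck typing rather than imports: any exception that exposes request.url and response.status_code (the attribute shape shared by requests, httpx, and urllib-style clients) gets an "http" section in the recorded error, without the tracer ever importing those libraries. A minimal self-contained sketch of that extraction, using a stand-in exception class rather than a real HTTP client:

# Stand-in classes mimicking the attribute shape of requests/httpx errors.
class _FakeRequest:
    url = "https://api.example.com/v1/chat"

class _FakeResponse:
    status_code = 503

class FakeHTTPError(Exception):
    request = _FakeRequest()
    response = _FakeResponse()

exc_value = FakeHTTPError("service unavailable")

# Same nested getattr chain as the tracer: a missing attribute yields None
# instead of raising, so non-HTTP exceptions pass through untouched.
url = getattr(getattr(exc_value, "request", None), "url", None)
status_code = getattr(getattr(exc_value, "response", None), "status_code", None)

assert (url, status_code) == ("https://api.example.com/v1/chat", 503)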
@@ -1476,13 +1492,6 @@ def _format_and_record_output(span, response, is_streaming, is_async, is_respons
         span.record_usage(usage)
         return response

-    def _handle_error(span, e, is_async):
-        """Handle and record errors"""
-        call_type = "async" if is_async else "sync"
-        print(f"Error during wrapped {call_type} API call ({span_name}): {e}")
-        span.record_output({"error": str(e)})
-        raise
-
     # --- Traced Async Functions ---
     async def traced_create_async(*args, **kwargs):
         current_trace = current_trace_var.get()
@@ -1496,7 +1505,8 @@ async def traced_create_async(*args, **kwargs):
             response_or_iterator = await original_create(*args, **kwargs)
             return _format_and_record_output(span, response_or_iterator, is_streaming, True, False)
         except Exception as e:
-            return _handle_error(span, e, True)
+            _capture_exception_for_trace(span, sys.exc_info())
+            raise e

     # Async responses for OpenAI clients
     async def traced_response_create_async(*args, **kwargs):
@@ -1511,7 +1521,8 @@ async def traced_response_create_async(*args, **kwargs):
             response_or_iterator = await original_responses_create(*args, **kwargs)
             return _format_and_record_output(span, response_or_iterator, is_streaming, True, True)
         except Exception as e:
-            return _handle_error(span, e, True)
+            _capture_exception_for_trace(span, sys.exc_info())
+            raise e

     # Function replacing .stream() for async clients
     def traced_stream_async(*args, **kwargs):
@@ -1542,7 +1553,8 @@ def traced_create_sync(*args, **kwargs):
             response_or_iterator = original_create(*args, **kwargs)
             return _format_and_record_output(span, response_or_iterator, is_streaming, False, False)
         except Exception as e:
-            return _handle_error(span, e, False)
+            _capture_exception_for_trace(span, sys.exc_info())
+            raise e

     def traced_response_create_sync(*args, **kwargs):
         current_trace = current_trace_var.get()
@@ -1556,7 +1568,8 @@ def traced_response_create_sync(*args, **kwargs):
             response_or_iterator = original_responses_create(*args, **kwargs)
             return _format_and_record_output(span, response_or_iterator, is_streaming, False, True)
         except Exception as e:
-            return _handle_error(span, e, False)
+            _capture_exception_for_trace(span, sys.exc_info())
+            raise e

     # Function replacing sync .stream()
     def traced_stream_sync(*args, **kwargs):

src/judgeval/common/utils.py

Lines changed: 5 additions & 1 deletion
@@ -12,9 +12,10 @@
 import asyncio
 import concurrent.futures
 import os
+from types import TracebackType
 import requests
 import pprint
-from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+from typing import Any, Dict, List, Literal, Mapping, Optional, TypeAlias, Union

 # Third-party imports
 import litellm
@@ -782,3 +783,6 @@ async def aget_completion_multiple_models(models: List[str], messages: List[List
         ]
     ]
 ))
+
+ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
+OptExcInfo: TypeAlias = ExcInfo | tuple[None, None, None]
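The aliases mirror the shape of sys.exc_info(): inside an except block the tuple is fully populated (ExcInfo), while outside one it degrades to (None, None, None), which is the case OptExcInfo adds. A minimal sketch of the intended usage:

import sys

from judgeval.common.utils import ExcInfo

def describe(exc_info: ExcInfo) -> str:
    # Unpack the (type, value, traceback) triple captured by sys.exc_info().
    exc_type, exc_value, _tb = exc_info
    return f"{exc_type.__name__}: {exc_value}"

try:
    raise ValueError("boom")
except ValueError:
    # Inside an except block, sys.exc_info() is guaranteed to be populated.
    print(describe(sys.exc_info()))  # -> "ValueError: boom"

Worth noting: both TypeAlias and the `|` union syntax in these definitions evaluate at runtime, so this pins the module to Python 3.10+.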

src/judgeval/run_evaluation.py

Lines changed: 16 additions & 2 deletions
@@ -1,6 +1,7 @@
 import asyncio
 import requests
 import time
+import json
 import sys
 import itertools
 import threading
@@ -362,14 +363,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
+    prompt_user = False
     for scorer in scorers:
         for example in examples:
             missing_params = []
             for param in scorer.required_params:
                 if getattr(example, param.value) is None:
-                    missing_params.append(f"'{param.value}'")
+                    missing_params.append(f"{param.value}")
             if missing_params:
-                print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+                rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+                rprint(f"Missing parameters: {', '.join(missing_params)}")
+                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+                rprint("-"*40)
+                prompt_user = True
+
+    if prompt_user:
+        user_input = input("Do you want to continue? (y/n)")
+        if user_input.lower() != "y":
+            sys.exit(0)
+        else:
+            rprint("[green]Continuing...[/green]")

 def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -894,6 +907,7 @@ async def _async_evaluation_workflow():
                 f"Processing evaluation '{evaluation_run.eval_name}': "
             )
         else:
+            check_examples(evaluation_run.examples, evaluation_run.scorers)
             if judgment_scorers:
                 # Execute evaluation using Judgment API
                 info("Starting API evaluation")
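One consequence of the new prompt: check_examples now blocks on input() when required parameters are missing, which can hang non-interactive runs such as CI. A sketch of a pre-flight helper mirroring the same checks, so callers can validate examples before invoking an evaluation (missing_for is a hypothetical name, not part of this diff):

def missing_for(examples, scorers):
    # Mirror check_examples' loop, but collect the missing parameter names
    # keyed by scorer type instead of printing and prompting. The `or []`
    # guard is a defensive addition for scorers without required_params.
    problems = {}
    for scorer in scorers:
        for example in examples:
            missing = [
                param.value
                for param in (scorer.required_params or [])
                if getattr(example, param.value) is None
            ]
            if missing:
                problems.setdefault(scorer.score_type.value, []).append(missing)
    return problems

An empty result means the interactive prompt in check_examples will not fire.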

src/judgeval/scorers/judgeval_scorer.py

Lines changed: 4 additions & 1 deletion
@@ -12,7 +12,7 @@
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.constants import UNBOUNDED_SCORERS
-
+from judgeval.data.example import ExampleParams
 class JudgevalScorer:
     """
     Base class for scorers in `judgeval`.
@@ -39,6 +39,7 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None # The cost of running the scorer
     verbose_logs: Optional[str] = None # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+    required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
     error: Optional[str] = None
     success: Optional[bool] = None

@@ -51,6 +52,7 @@ def __init__(
         reason: Optional[str] = None,
         success: Optional[bool] = None,
         evaluation_model: Optional[str] = None,
+        required_params: Optional[List[ExampleParams]] = None,
         strict_mode: bool = False,
         async_mode: bool = True,
         verbose_mode: bool = True,
@@ -87,6 +89,7 @@ def __init__(
         self.evaluation_cost = evaluation_cost
         self.verbose_logs = verbose_logs
         self.additional_metadata = additional_metadata
+        self.required_params = required_params

     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
