
Commit c973649

Merge branch 'main' into ahh/new-error-format

2 parents 781644f + e72e440

10 files changed: +2438, -254 lines

Pipfile

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
-litellm = "==1.38.12"
+litellm = "==1.61.15"
 python-dotenv = "==1.0.1"
 requests = "*"
 pandas = "*"
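
As an aside (not part of this commit), after re-locking you can confirm that the environment actually picked up the new pin with a quick standard-library check; a minimal sketch:

from importlib.metadata import version

# Should report the version pinned in the updated Pipfile.
assert version("litellm") == "1.61.15"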

Pipfile.lock

Lines changed: 345 additions & 200 deletions
Some generated files are not rendered by default.

README.md

Lines changed: 1195 additions & 6 deletions
Large diffs are not rendered by default.

src/demo/async_evaluation_example.py

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""
+Examples demonstrating how to use async evaluation in multiple ways.
+"""
+
+import asyncio
+import os
+import time
+from typing import List
+
+from judgeval.data import Example, ScoringResult
+from judgeval.judgment_client import JudgmentClient
+
+# Get Judgment API key from environment (replace with your actual API key)
+JUDGMENT_API_KEY = os.environ.get("JUDGMENT_API_KEY", "your_api_key_here")
+ORGANIZATION_ID = os.environ.get("ORGANIZATION_ID", "your_organization_id_here")
+
+# Initialize the JudgmentClient
+judgment_client = JudgmentClient(judgment_api_key=JUDGMENT_API_KEY, organization_id=ORGANIZATION_ID)
+
+
+async def example_direct_await():
+    """
+    Example of directly awaiting the Task returned by run_evaluation with async_execution=True.
+    This is the simplest approach and blocks until evaluation is complete.
+    """
+    print("\n=== Example: Direct Await ===")
+
+    # Create example list
+    examples = [
+        Example(
+            input="What is the capital of France?",
+            actual_output="The capital of France is Paris.",
+            expected_output="Paris"
+        ),
+        Example(
+            input="What is the capital of Italy?",
+            actual_output="Rome is the capital of Italy.",
+            expected_output="Rome"
+        )
+    ]
+
+    # Set up scorers
+    from judgeval.scorers import AnswerCorrectnessScorer
+    scorers = [AnswerCorrectnessScorer(threshold=0.9)]
+
+    # Start evaluation asynchronously and get a Task object
+    print("Starting evaluation...")
+    task = judgment_client.run_evaluation(
+        examples=examples,
+        scorers=scorers,
+        model="gpt-4o-mini",
+        project_name="async-examples",
+        eval_run_name="async-example-direct",
+        override=True,
+        async_execution=True
+    )
+
+    # Directly await the task - this will block until the evaluation is done
+    print("Awaiting results...")
+    results = await task
+
+    print(f"Evaluation completed! Received {len(results)} results")
+
+    # Process the results
+    print(results)
+
+
+async def example_with_other_work():
+    """
+    Example of running other work while evaluation is in progress.
+    Shows how to check task status and get results when ready.
+    """
+    print("\n=== Example: Do Other Work While Evaluating ===")
+
+    # Create example list
+    examples = [
+        Example(
+            input="What is the tallest mountain in the world?",
+            actual_output="Mount Everest is the tallest mountain in the world.",
+            expected_output="Mount Everest"
+        ),
+        Example(
+            input="What is the largest ocean?",
+            actual_output="The Pacific Ocean is the largest ocean on Earth.",
+            expected_output="Pacific Ocean"
+        )
+    ]
+
+    # Set up scorers
+    from judgeval.scorers import AnswerCorrectnessScorer
+    scorers = [AnswerCorrectnessScorer(threshold=0.9)]
+
+    # Start evaluation asynchronously and get a Task object
+    print("Starting evaluation...")
+    task = judgment_client.run_evaluation(
+        examples=examples,
+        scorers=scorers,
+        model="gpt-4o-mini",
+        project_name="async-examples",
+        eval_run_name="async-example-other-work",
+        override=True,
+        async_execution=True
+    )
+
+    # Do other work while evaluation is running
+    print("Doing other work while evaluation runs in the background...")
+
+    # Simulate other work with a few iterations
+    for i in range(1, 4):
+        print(f" Doing work iteration {i}...")
+        await asyncio.sleep(2)  # Simulate work with a delay
+
+        # Check if the evaluation is done
+        if task.done():
+            print(" Evaluation completed during other work!")
+            break
+        else:
+            print(" Evaluation still running...")
+
+    # Get the results when ready
+    try:
+        if not task.done():
+            print("Waiting for evaluation to complete...")
+
+        results = await task  # Will return immediately if already done
+
+        print(results)
+
+    except Exception as e:
+        print(f"Error in evaluation: {str(e)}")
+        if task.exception():
+            print(f"Task exception: {task.exception()}")
+
+
+async def main():
+    """Run the examples."""
+    # Run the first example: direct await
+    await example_direct_await()
+
+    # Run the second example: do other work while evaluating
+    await example_with_other_work()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
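
A natural extension of the demo above, not included in this commit: because run_evaluation(..., async_execution=True) returns an awaitable task, several evaluation runs can proceed concurrently. A minimal sketch under that assumption, reusing the judgment_client, examples, and scorers defined in the file (the eval_run_name values here are hypothetical):

async def run_evaluations_concurrently(examples, scorers):
    # Start both evaluations; neither await blocks the other from starting.
    task_a = judgment_client.run_evaluation(
        examples=examples,
        scorers=scorers,
        model="gpt-4o-mini",
        project_name="async-examples",
        eval_run_name="async-example-gather-a",  # hypothetical run name
        override=True,
        async_execution=True
    )
    task_b = judgment_client.run_evaluation(
        examples=examples,
        scorers=scorers,
        model="gpt-4o-mini",
        project_name="async-examples",
        eval_run_name="async-example-gather-b",  # hypothetical run name
        override=True,
        async_execution=True
    )
    # asyncio.gather awaits both tasks and returns their results in order.
    return await asyncio.gather(task_a, task_b)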

src/demo/sequence_test.py

Lines changed: 0 additions & 12 deletions
@@ -146,24 +146,12 @@ def generate_itinerary(destination, start_date, end_date):
         }
     ]
 )
-example2 = Example(
-    input={"destination": "Tokyo", "start_date": "2025-06-01", "end_date": "2025-06-02"},
-    expected_tools=[
-        {"tool_name": "search_tavily", "parameters": {"query": "Best tourist attractions in Tokyo"}},
-        {"tool_name": "search_tavily", "parameters": {"query": "Best hotels in Tokyo"}},
-        {"tool_name": "search_tavily", "parameters": {"query": "Flights to Tokyo from major cities"}},
-        {"tool_name": "search_tavily", "parameters": {"query": "Weather forecast for Tokyo from 2025-06-01 to 2025-06-03"}}
-    ]
-)
 
 judgment.assert_test(
-    project_name="travel_agent_demo",
     examples=[example],
    scorers=[ToolOrderScorer()],
-    model="gpt-4.1-mini",
     function=generate_itinerary,
     tracer=tracer,
-    override=True
 )
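After these deletions, the assert_test call reads as follows (reconstructed from the kept context lines; indentation is assumed):

judgment.assert_test(
    examples=[example],
    scorers=[ToolOrderScorer()],
    function=generate_itinerary,
    tracer=tracer,
)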

0 commit comments