Commit bd7327b

Merge pull request #32 from JudgmentLabs/joseph/span-level-evals
Add Span Level Evals (multi-step evaluation)
2 parents: ca23e6d + a2b7e3c
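
As exercised by the updated tests below, a span-level eval attaches a scorer to an individual traced span (any `@judgment.observe`-decorated function) while the trace is still open, rather than only to a finished run. The following is a minimal sketch of that pattern; the function name `answer_question` and the trace name `"span_level_demo"` are illustrative and not taken from the diff, while the `async_evaluate` arguments mirror those used in the updated test:

import os
import asyncio

from judgeval.common.tracer import Tracer
from judgeval.constants import APIScorer

judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))

@judgment.observe
async def answer_question(question: str) -> str:
    # Stand-in for a real LLM call (illustrative only)
    answer_text = "We offer a 30-day full refund at no extra cost."
    # Span-level eval: score this span's output while the trace is still open
    await judgment.get_current_trace().async_evaluate(
        input=question,
        actual_output=answer_text,
        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
        score_type=APIScorer.FAITHFULNESS,
        threshold=0.5,
        model="gpt-4o-mini",
        log_results=True
    )
    return answer_text

async def main():
    with judgment.trace("span_level_demo") as trace:
        await answer_question("What if these shoes don't fit?")
        trace.save()

if __name__ == "__main__":
    asyncio.run(main())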

File tree

5 files changed (+438, -154 lines)

e2etests/judgment_client_test.py

Lines changed: 0 additions & 1 deletion
@@ -29,7 +29,6 @@ def test_dataset(client: JudgmentClient):
     # PULL
     dataset = client.pull_dataset(alias="test_dataset_5")
     print(dataset)
-
 
 def test_run_eval(client: JudgmentClient):

e2etests/test_tracer.py

Lines changed: 108 additions & 32 deletions
@@ -1,56 +1,132 @@
+# Standard library imports
+import os
+import time
+import asyncio
+
+# Third-party imports
 from openai import OpenAI
 from together import Together
 from anthropic import Anthropic
-from judgeval.common.tracer import Tracer, wrap
 
-import time
+# Local imports
+from judgeval.common.tracer import Tracer, wrap
+from judgeval.constants import APIScorer
 
 # Initialize the tracer and clients
 judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
 openai_client = wrap(OpenAI())
 anthropic_client = wrap(Anthropic())
 
 @judgment.observe
-def make_upper(input):
-    return input.upper()
-
-@judgment.observe
-def make_lower(input):
-    return input.lower()
+async def make_upper(input: str) -> str:
+    """Convert input to uppercase and evaluate using judgment API.
+
+    Args:
+        input: The input string to convert
+    Returns:
+        The uppercase version of the input string
+    """
+    output = input.upper()
+    await judgment.get_current_trace().async_evaluate(
+        input="What if these shoes don't fit?",
+        actual_output="We offer a 30-day full refund at no extra cost.",
+        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+        expected_output="We offer a 30-day full refund at no extra cost.",
+        expected_tools=["refund"],
+        score_type=APIScorer.FAITHFULNESS,
+        threshold=0.5,
+        model="gpt-4o-mini",
+        log_results=True
+    )
+    return output
 
 @judgment.observe
-def make_poem(input):
+async def make_lower(input):
+    output = input.lower()
 
-    # Using Anthropic API
-    anthropic_response = anthropic_client.messages.create(
-        model="claude-3-sonnet-20240229",
-        messages=[{
-            "role": "user",
-            "content": input
-        }],
-        max_tokens=30
+    await judgment.get_current_trace().async_evaluate(
+        input="How do I reset my password?",
+        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
+        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
+        context=["User Account"],
+        retrieval_context=["Password reset instructions"],
+        tools_called=["authentication"],
+        expected_tools=["authentication"],
+        additional_metadata={"difficulty": "medium"},
+        score_type=APIScorer.ANSWER_RELEVANCY,
+        threshold=0.5,
+        model="gpt-4o-mini",
+        log_results=True
     )
-    anthropic_result = anthropic_response.content[0].text
-
-    # Using OpenAI API
-    openai_response = openai_client.chat.completions.create(
+    return output
+
+@judgment.observe
+def llm_call(input):
+    return "We have a 30 day full refund policy on shoes."
+
+@judgment.observe
+async def answer_user_question(input):
+    output = llm_call(input)
+    await judgment.get_current_trace().async_evaluate(
+        input=input,
+        actual_output=output,
+        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+        expected_output="We offer a 30-day full refund at no extra cost.",
+        score_type=APIScorer.ANSWER_RELEVANCY,
+        threshold=0.5,
         model="gpt-4o-mini",
-        messages=[
-            {"role": "system", "content": "Make a short sentence with the input."},
-            {"role": "user", "content": input}
-        ]
+        log_results=True
     )
-    openai_result = openai_response.choices[0].message.content
-    print(openai_result)
+    return output
+
+@judgment.observe
+async def make_poem(input: str) -> str:
+    """Generate a poem using both Anthropic and OpenAI APIs.
 
-    return make_lower(anthropic_result + openai_result)
+    Args:
+        input: The prompt for poem generation
+    Returns:
+        Combined and lowercase version of both API responses
+    """
+    try:
+        # Using Anthropic API
+        anthropic_response = anthropic_client.messages.create(
+            model="claude-3-sonnet-20240229",
+            messages=[{"role": "user", "content": input}],
+            max_tokens=30
+        )
+        anthropic_result = anthropic_response.content[0].text
+
+        # Using OpenAI API
+        openai_response = openai_client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "Make a short sentence with the input."},
+                {"role": "user", "content": input}
+            ]
+        )
+        openai_result = openai_response.choices[0].message.content
+
+        return await make_lower(f"{anthropic_result} {openai_result}")
+
+    except Exception as e:
+        print(f"Error generating poem: {e}")
+        return ""
 
-def test_evaluation_mixed(input):
+async def test_evaluation_mixed(input):
     with judgment.trace("test_evaluation") as trace:
-        result = make_poem(make_upper(input))
+        upper = await make_upper(input)
+        result = await make_poem(upper)
+        await answer_user_question("What if these shoes don't fit?")
 
-        trace.print()
         trace.save()
+
+    trace.print()
+
     return result
 
-result3 = test_evaluation_mixed("hello the world is flat")
+if __name__ == "__main__":
+    # Use a more meaningful test input
+    test_input = "Write a poem about Nissan R32 GTR"
+    asyncio.run(test_evaluation_mixed(test_input))
+
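
Note: with `test_evaluation_mixed` now async and the module guarded by `if __name__ == "__main__":`, the test can be run directly (e.g. `python e2etests/test_tracer.py`); this presumably requires `JUDGMENT_API_KEY` plus the OpenAI and Anthropic credentials used by the wrapped clients to be set in the environment.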

0 commit comments
