
Commit 4587969

Merge pull request #34 from JudgmentLabs/joseph/span-level-evals
Additional features for span-level (multi-step) evaluations
2 parents a2c541c + 22ebf64 commit 4587969

19 files changed: +437 additions, -200 deletions

Pipfile

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ pandas = "*"
 openai = "*"
 together = "*"
 anthropic = "*"
+patronus = "*"

 [dev-packages]
 pytest = "*"

demo/test_competitors.py

Lines changed: 96 additions & 0 deletions

@@ -0,0 +1,96 @@
+from dotenv import load_dotenv
+from patronus import Client
+import os
+import asyncio
+import time
+from openai import OpenAI
+from anthropic import Anthropic
+
+load_dotenv()
+
+PATRONUS_API_KEY = os.getenv("PATRONUS_API_KEY")
+
+client = Client(api_key=PATRONUS_API_KEY)
+
+# Initialize clients
+openai_client = OpenAI()
+anthropic_client = Anthropic()
+
+async def make_upper(input: str) -> str:
+    output = input.upper()
+    result = client.evaluate(
+        evaluator="answer-relevance",
+        criteria="patronus:answer-relevance",
+        evaluated_model_input=input,
+        evaluated_model_output=output,
+        threshold=0.5,
+        model="gpt-4o-mini",
+        log_results=True
+    )
+    return output
+
+def llm_call(input):
+    time.sleep(1.3)
+    return "We have a 30 day full refund policy on shoes."
+
+async def answer_user_question(input):
+    output = llm_call(input)
+    result = client.evaluate(
+        evaluator="answer-relevance",
+        criteria="patronus:answer-relevance",
+        evaluated_model_input=input,
+        evaluated_model_output=output,
+        evaluated_model_retrieved_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+        expected_output="We offer a 30-day full refund at no extra cost.",
+        threshold=0.5,
+        model="gpt-4o-mini",
+        log_results=True
+    )
+    return output
+
+async def make_poem(input: str) -> str:
+    try:
+        # Using Anthropic API
+        anthropic_response = anthropic_client.messages.create(
+            model="claude-3-sonnet-20240229",
+            messages=[{"role": "user", "content": input}],
+            max_tokens=30
+        )
+        anthropic_result = anthropic_response.content[0].text
+
+        result = client.evaluate(
+            evaluator="answer-relevance",
+            criteria="patronus:answer-relevance",
+            evaluated_model_input=input,
+            evaluated_model_output=anthropic_result,
+            threshold=0.5,
+            model="gpt-4o-mini",
+            log_results=True
+        )
+
+        # Using OpenAI API
+        openai_response = openai_client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "Make a short sentence with the input."},
+                {"role": "user", "content": input}
+            ]
+        )
+        openai_result = openai_response.choices[0].message.content
+
+        return f"{anthropic_result} {openai_result}".lower()
+
+    except Exception as e:
+        print(f"Error generating poem: {e}")
+        return ""
+
+async def test_evaluation_mixed(input):
+    upper = await make_upper(input)
+    result = await make_poem(upper)
+    await answer_user_question("What if these shoes don't fit?")
+    return result
+
+if __name__ == "__main__":
+    test_input = "Write a poem about Nissan R32 GTR"
+    asyncio.run(test_evaluation_mixed(test_input))
+
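
This demo mirrors the multi-step workflow exercised in e2etests/test_tracer.py (make_upper, llm_call, answer_user_question, make_poem), but routes each evaluation through the Patronus client for comparison. Assuming the Pipfile above has been synced with pipenv and the keys are available (PATRONUS_API_KEY via .env, plus OPENAI_API_KEY and ANTHROPIC_API_KEY read by the respective clients), it can be run with "pipenv install" followed by "pipenv run python demo/test_competitors.py".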

docs/notebooks/prompt_scorer.ipynb

Lines changed: 1 addition & 1 deletion

@@ -157,7 +157,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,

e2etests/judgment_client_test.py

Lines changed: 34 additions & 36 deletions

@@ -16,6 +16,8 @@
 import random
 import string

+from judgeval.scorers.prompt_scorer import ClassifierScorer
+
 load_dotenv()

 def get_client():
@@ -35,36 +37,32 @@ def test_dataset(client: JudgmentClient):
     print(dataset)

 def test_run_eval(client: JudgmentClient):
+    # Single step in our workflow, an outreach Sales Agent

     example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
+        input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.",
+        actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex",
+        retrieval_context=["TechCorp launched AI analytics platform in 2024", "Sarah Chen is CEO, ex-Google executive", "Current client base: 50+ enterprise customers"],
     )

     example2 = Example(
-        input="How do I reset my password?",
-        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        name="Password Reset",
-        context=["User Account"],
-        retrieval_context=["Password reset instructions"],
-        tools_called=["authentication"],
-        expected_tools=["authentication"],
-        additional_metadata={"difficulty": "medium"}
+        input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.",
+        actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex",
+        expected_output="A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans",
+        context=["Business Development"],
+        retrieval_context=["GreenEnergy Solutions won 2023 sustainability award", "New solar technology 30% more efficient", "Planning European market expansion"],
     )

     scorer = FaithfulnessScorer(threshold=0.5)
     scorer2 = HallucinationScorer(threshold=0.5)
     c_scorer = CustomFaithfulnessMetric(threshold=0.6)

-    PROJECT_NAME = "test_project_JOSEPH"
-    EVAL_RUN_NAME = "yomadude"
+    PROJECT_NAME = "OutreachWorkflow"
+    EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt"

-    _ = client.run_evaluation(
+    client.run_evaluation(
         examples=[example1, example2],
-        scorers=[scorer, c_scorer],
+        scorers=[scorer, scorer2],
         model="QWEN",
         metadata={"batch": "test"},
         project_name=PROJECT_NAME,
@@ -146,8 +144,6 @@ def test_override_eval(client: JudgmentClient):
         if "already exists" not in str(e):
             raise
         print(f"Successfully caught expected error: {e}")
-
-

 def test_evaluate_dataset(client: JudgmentClient):

@@ -194,8 +190,10 @@ def test_classifier_scorer(client: JudgmentClient):
         examples=[example1],
         scorers=[classifier_scorer, faithfulness_scorer],
         model="QWEN",
+        log_results=True,
+        eval_run_name="ToneScorerTest",
+        project_name="ToneScorerTest",
     )
-    print(res)

 if __name__ == "__main__":
     # Test client functionality
@@ -204,29 +202,29 @@ def test_classifier_scorer(client: JudgmentClient):
     print("Client initialized successfully")
     print("*" * 40)

-    print("Testing dataset creation, pushing, and pulling")
-    test_dataset(ui_client)
-    print("Dataset creation, pushing, and pulling successful")
-    print("*" * 40)
+    # print("Testing dataset creation, pushing, and pulling")
+    # test_dataset(ui_client)
+    # print("Dataset creation, pushing, and pulling successful")
+    # print("*" * 40)

     print("Testing evaluation run")
     test_run_eval(ui_client)
     print("Evaluation run successful")
     print("*" * 40)

-    print("Testing evaluation run override")
-    test_override_eval(client)
-    print("Evaluation run override successful")
-    print("*" * 40)
+    # print("Testing evaluation run override")
+    # test_override_eval(client)
+    # print("Evaluation run override successful")
+    # print("*" * 40)

-    print("Testing dataset evaluation")
-    test_evaluate_dataset(ui_client)
-    print("Dataset evaluation successful")
-    print("*" * 40)
+    # print("Testing dataset evaluation")
+    # test_evaluate_dataset(ui_client)
+    # print("Dataset evaluation successful")
+    # print("*" * 40)

-    print("Testing classifier scorer")
-    test_classifier_scorer(ui_client)
-    print("Classifier scorer test successful")
-    print("*" * 40)
+    # print("Testing classifier scorer")
+    # test_classifier_scorer(ui_client)
+    # print("Classifier scorer test successful")
+    # print("*" * 40)

     print("All tests passed successfully")

e2etests/test_prompt_scoring.py

Lines changed: 4 additions & 4 deletions

@@ -36,7 +36,7 @@ def __init__(
         )
         self.score = 0.0

-    def build_measure_prompt(self, example: Example):
+    def _build_measure_prompt(self, example: Example):
         SYSTEM_ROLE = (
             'You are a great judge of emotional intelligence. You understand the feelings '
             'and intentions of others. You will be tasked with judging whether the following '
@@ -51,16 +51,16 @@ def build_measure_prompt(self, example: Example):
         ]
         return conversation

-    def build_schema(self):
+    def _build_schema(self):
         return {
             "score": int,
             "reason": str
         }

-    def process_response(self, response):
+    def _process_response(self, response):
         return response["score"], response["reason"]

-    def success_check(self):
+    def _success_check(self):
         POSITIVITY_THRESHOLD = 3 # we want all model responses to be somewhat positive in tone
         return self.score <= POSITIVITY_THRESHOLD
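
The underscore prefixes mark these scorer hooks as internal overrides rather than public API. To show how the four renamed hooks fit together, here is a self-contained toy; the base class is a stand-in written for this illustration, not judgeval's actual scorer base class:

    # Stand-in base class for illustration only; judgeval's real scorer base differs.
    class DemoPromptScorer:
        def __init__(self):
            self.score = 0.0

        def run(self, example: dict, judge_response: dict):
            # A real scorer would send _build_measure_prompt(...) to a judge model
            # and validate the reply against _build_schema(); here the "judge"
            # response is passed in as a plain dict.
            _ = self._build_measure_prompt(example)
            self.score, reason = self._process_response(judge_response)
            return self._success_check(), reason

    class PositivityScorer(DemoPromptScorer):
        def _build_measure_prompt(self, example: dict):
            return [
                {"role": "system", "content": "Judge the emotional positivity of the response."},
                {"role": "user", "content": example["actual_output"]},
            ]

        def _build_schema(self):
            return {"score": int, "reason": str}

        def _process_response(self, response: dict):
            return response["score"], response["reason"]

        def _success_check(self):
            POSITIVITY_THRESHOLD = 3  # mirror the threshold used in the test above
            return self.score <= POSITIVITY_THRESHOLD

    if __name__ == "__main__":
        ok, reason = PositivityScorer().run(
            {"actual_output": "Happy to help, let me fix that for you!"},
            {"score": 2, "reason": "warm, helpful tone"},
        )
        print(ok, reason)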

e2etests/test_tracer.py

Lines changed: 20 additions & 7 deletions

@@ -14,11 +14,11 @@
 from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

 # Initialize the tracer and clients
-judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
+judgment = Tracer(api_key=os.getenv("UI_JUDGMENT_API_KEY"))
 openai_client = wrap(OpenAI())
 anthropic_client = wrap(Anthropic())

-@judgment.observe
+@judgment.observe(span_type="tool")
 async def make_upper(input: str) -> str:
     """Convert input to uppercase and evaluate using judgment API.

@@ -28,6 +28,7 @@ async def make_upper(input: str) -> str:
         The uppercase version of the input string
     """
     output = input.upper()
+
     await judgment.get_current_trace().async_evaluate(
         scorers=[FaithfulnessScorer(threshold=0.5)],
         input="What if these shoes don't fit?",
@@ -38,9 +39,10 @@ async def make_upper(input: str) -> str:
         model="gpt-4o-mini",
         log_results=True
     )
+
     return output

-@judgment.observe
+@judgment.observe(span_type="tool")
 async def make_lower(input):
     output = input.lower()

@@ -59,11 +61,12 @@ async def make_lower(input):
     )
     return output

-@judgment.observe
+@judgment.observe(span_type="llm")
 def llm_call(input):
+    time.sleep(1.3)
     return "We have a 30 day full refund policy on shoes."

-@judgment.observe
+@judgment.observe(span_type="tool")
 async def answer_user_question(input):
     output = llm_call(input)
     await judgment.get_current_trace().async_evaluate(
@@ -77,7 +80,7 @@ async def answer_user_question(input):
     )
     return output

-@judgment.observe
+@judgment.observe(span_type="tool")
 async def make_poem(input: str) -> str:
     """Generate a poem using both Anthropic and OpenAI APIs.

@@ -95,6 +98,15 @@ async def make_poem(input: str) -> str:
         )
         anthropic_result = anthropic_response.content[0].text

+        await judgment.get_current_trace().async_evaluate(
+            input=input,
+            actual_output=anthropic_result,
+            score_type=APIScorer.ANSWER_RELEVANCY,
+            threshold=0.5,
+            model="gpt-4o-mini",
+            log_results=True
+        )
+
         # Using OpenAI API
         openai_response = openai_client.chat.completions.create(
             model="gpt-4o-mini",
@@ -112,7 +124,8 @@ async def make_poem(input: str) -> str:
         return ""

 async def test_evaluation_mixed(input):
-    with judgment.trace("test_evaluation") as trace:
+    PROJECT_NAME = "NewPoemBot"
+    with judgment.trace("Use-claude", project_name=PROJECT_NAME, overwrite=True) as trace:
         upper = await make_upper(input)
         result = await make_poem(upper)
         await answer_user_question("What if these shoes don't fit?")
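
This file is the clearest picture of the span-level (multi-step) evaluation flow named in the PR title: workflow steps are decorated as typed spans, model clients are wrapped with wrap() so their calls land in the trace, and evaluations attach to the current trace from inside a step. A minimal sketch of the pattern, assuming the judgeval API exactly as used in this diff (the Tracer import path below is an assumption; the file's import lines are not part of these hunks):

    import os
    import asyncio

    from judgeval.tracer import Tracer            # assumed import path, not shown in the diff
    from judgeval.scorers import AnswerRelevancyScorer

    judgment = Tracer(api_key=os.getenv("UI_JUDGMENT_API_KEY"))

    @judgment.observe(span_type="tool")           # each decorated step becomes a typed span
    async def answer_user_question(question: str) -> str:
        answer = "We have a 30 day full refund policy on shoes."
        # Evaluate this step against the current trace instead of a standalone run.
        await judgment.get_current_trace().async_evaluate(
            scorers=[AnswerRelevancyScorer(threshold=0.5)],
            input=question,
            actual_output=answer,
            model="gpt-4o-mini",
            log_results=True,
        )
        return answer

    async def main():
        # Group the spans under a named trace tied to a project.
        with judgment.trace("Use-claude", project_name="NewPoemBot", overwrite=True):
            await answer_user_question("What if these shoes don't fit?")

    if __name__ == "__main__":
        asyncio.run(main())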
