
Commit 5e00fa0

Add demo folder. Add Patronus tracing workflow for comparison in demos.
1 parent c3f3cca commit 5e00fa0

File tree

2 files changed (+139, -83 lines)


demo/test_competitors.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+from dotenv import load_dotenv
+from patronus import Client
+import os
+import asyncio
+import time
+from openai import OpenAI
+from anthropic import Anthropic
+
+load_dotenv()
+
+PATRONUS_API_KEY = os.getenv("PATRONUS_API_KEY")
+
+client = Client(api_key=PATRONUS_API_KEY)
+
+# Initialize clients
+openai_client = OpenAI()
+anthropic_client = Anthropic()
+
+async def make_upper(input: str) -> str:
+    output = input.upper()
+    result = client.evaluate(
+        evaluator="answer-relevance",
+        criteria="patronus:answer-relevance",
+        evaluated_model_input=input,
+        evaluated_model_output=output,
+        threshold=0.5,
+        model="gpt-4o-mini",
+        log_results=True
+    )
+    return output
+
+def llm_call(input):
+    time.sleep(1.3)
+    return "We have a 30 day full refund policy on shoes."
+
+async def answer_user_question(input):
+    output = llm_call(input)
+    result = client.evaluate(
+        evaluator="answer-relevance",
+        criteria="patronus:answer-relevance",
+        evaluated_model_input=input,
+        evaluated_model_output=output,
+        evaluated_model_retrieved_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+        expected_output="We offer a 30-day full refund at no extra cost.",
+        threshold=0.5,
+        model="gpt-4o-mini",
+        log_results=True
+    )
+    return output
+
+async def make_poem(input: str) -> str:
+    try:
+        # Using Anthropic API
+        anthropic_response = anthropic_client.messages.create(
+            model="claude-3-sonnet-20240229",
+            messages=[{"role": "user", "content": input}],
+            max_tokens=30
+        )
+        anthropic_result = anthropic_response.content[0].text
+
+        result = client.evaluate(
+            evaluator="answer-relevance",
+            criteria="patronus:answer-relevance",
+            evaluated_model_input=input,
+            evaluated_model_output=anthropic_result,
+            threshold=0.5,
+            model="gpt-4o-mini",
+            log_results=True
+        )
+
+        # Using OpenAI API
+        openai_response = openai_client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "Make a short sentence with the input."},
+                {"role": "user", "content": input}
+            ]
+        )
+        openai_result = openai_response.choices[0].message.content
+
+        return f"{anthropic_result} {openai_result}".lower()
+
+    except Exception as e:
+        print(f"Error generating poem: {e}")
+        return ""
+
+async def test_evaluation_mixed(input):
+    upper = await make_upper(input)
+    result = await make_poem(upper)
+    await answer_user_question("What if these shoes don't fit?")
+    return result
+
+if __name__ == "__main__":
+    test_input = "Write a poem about Nissan R32 GTR"
+    asyncio.run(test_evaluation_mixed(test_input))
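The demo pulls its credentials from a local .env file via load_dotenv(): PATRONUS_API_KEY is read explicitly, while the OpenAI() and Anthropic() constructors fall back to OPENAI_API_KEY and ANTHROPIC_API_KEY from the environment. A minimal sketch of driving the script from another module, assuming the repository root is importable and using placeholder keys rather than real credentials:

# Minimal driver sketch for demo/test_competitors.py (assumptions noted inline).
import asyncio
import os

# Placeholders only -- real keys would normally live in the .env file read by load_dotenv().
os.environ.setdefault("PATRONUS_API_KEY", "<patronus-key>")
os.environ.setdefault("OPENAI_API_KEY", "<openai-key>")
os.environ.setdefault("ANTHROPIC_API_KEY", "<anthropic-key>")

# Assumes the repo root is on PYTHONPATH; otherwise run `python demo/test_competitors.py` directly.
from demo.test_competitors import test_evaluation_mixed

asyncio.run(test_evaluation_mixed("Write a poem about Nissan R32 GTR"))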

e2etests/judgment_client_test.py

Lines changed: 43 additions & 83 deletions
@@ -35,36 +35,31 @@ def test_dataset(client: JudgmentClient):
     print(dataset)
 
 def test_run_eval(client: JudgmentClient):
+    # Single step in our workflow, an outreach Sales Agent
 
     example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
+        input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.",
+        actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex",
+        retrieval_context=["TechCorp launched AI analytics platform in 2024", "Sarah Chen is CEO, ex-Google executive", "Current client base: 50+ enterprise customers"],
     )
 
     example2 = Example(
-        input="How do I reset my password?",
-        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        name="Password Reset",
-        context=["User Account"],
-        retrieval_context=["Password reset instructions"],
-        tools_called=["authentication"],
-        expected_tools=["authentication"],
-        additional_metadata={"difficulty": "medium"}
+        input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.",
+        actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex",
+        expected_output="A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans",
+        context=["Business Development"],
+        retrieval_context=["GreenEnergy Solutions won 2023 sustainability award", "New solar technology 30% more efficient", "Planning European market expansion"],
     )
 
     scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
-    scorer2 = JudgmentScorer(threshold=0.5, score_type=APIScorer.HALLUCINATION)
-    c_scorer = CustomFaithfulnessMetric(threshold=0.6)
+    scorer2 = JudgmentScorer(threshold=0.5, score_type=APIScorer.ANSWER_RELEVANCY)
 
-    PROJECT_NAME = "test_project_JOSEPH"
-    EVAL_RUN_NAME = "yomadude"
+    PROJECT_NAME = "OutreachWorkflow"
+    EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt"
 
-    _ = client.run_evaluation(
+    client.run_evaluation(
         examples=[example1, example2],
-        scorers=[scorer, c_scorer],
+        scorers=[scorer, scorer2],
         model="QWEN",
         metadata={"batch": "test"},
         project_name=PROJECT_NAME,
@@ -73,10 +68,7 @@ def test_run_eval(client: JudgmentClient):
         override=True,
     )
 
-    results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
-    # print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)
-
-def test_override_eval(client: JudgmentClient):
+def test_override_eval(client: JudgmentClient):
     example1 = Example(
         input="What if these shoes don't fit?",
         actual_output="We offer a 30-day full refund at no extra cost.",
@@ -146,8 +138,6 @@ def test_override_eval(client: JudgmentClient):
         if "already exists" not in str(e):
             raise
         print(f"Successfully caught expected error: {e}")
-
-
 
 def test_evaluate_dataset(client: JudgmentClient):
 
@@ -181,47 +171,23 @@ def test_evaluate_dataset(client: JudgmentClient):
     print(res)
 
 def test_classifier_scorer(client: JudgmentClient):
-    # Modifying a classifier scorer
-    # TODO: Some of the field names are not consistent between regular scorers and classifier scorers
-    # Make some methods private
-    classifier_scorer = client.fetch_classifier_scorer("tonescorer-72gl")
-    print(f"{classifier_scorer=}")
+    classifier_scorer = client.fetch_classifier_scorer("tonescorer-pt0z")
+    faithfulness_scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
 
-    # TODO: Does ClassifierScorer actually use build_measure_prompt, enforce_prompt_format, etc.
-    # TODO: Ik PromptScorer uses it, but I don't think we need to redefine it in ClassifierScorer
-
-    # Creating a classifier scorer from SDK
-    classifier_scorer_custom = ClassifierScorer(
-        name="Test Classifier Scorer",
-        threshold=0.5,
-        conversation=[],
-        options={}
+    example1 = Example(
+        input="What if these shoes don't fit?",
+        actual_output="We offer a 30-day full refund at no extra cost, you would have known that if you read the website stupid!",
+        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
     )
 
-    classifier_scorer_custom.update_conversation(conversation=[{"role": "user", "content": "What is the capital of France?"}])
-    classifier_scorer_custom.update_options(options={"yes": 1, "no": 0})
-
-    slug = client.push_classifier_scorer(scorer=classifier_scorer_custom)
-
-    classifier_scorer_custom = client.fetch_classifier_scorer(slug=slug)
-    print(f"{classifier_scorer_custom=}")
-
-    # faithfulness_scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
-
-    # example1 = Example(
-    #     input="What if these shoes don't fit?",
-    #     actual_output="We offer a 30-day full refund at no extra cost, you would have known that if you read the website stupid!",
-    #     retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-    # )
-
-    # res = client.run_evaluation(
-    #     examples=[example1],
-    #     scorers=[classifier_scorer, faithfulness_scorer],
-    #     model="QWEN",
-    # )
-    # print(res)
-
-    # Pushing a classifier scorer (from SDK)
+    res = client.run_evaluation(
+        examples=[example1],
+        scorers=[classifier_scorer, faithfulness_scorer],
+        model="QWEN",
+        log_results=True,
+        eval_run_name="ToneScorerTest",
+        project_name="ToneScorerTest",
+    )
 
 if __name__ == "__main__":
     # Test client functionality
@@ -235,30 +201,24 @@ def test_classifier_scorer(client: JudgmentClient):
     # print("Dataset creation, pushing, and pulling successful")
     # print("*" * 40)
 
-    # print("Testing evaluation run")
-    # test_run_eval(ui_client)
-    # print("Evaluation run successful")
-    # print("*" * 40)
-
-    print("Testing evaluation run override")
-    test_override_eval(client)
-    print("Evaluation run override successful")
+    print("Testing evaluation run")
+    test_run_eval(ui_client)
+    print("Evaluation run successful")
     print("*" * 40)
 
-    print("Testing evaluation run override")
-    test_override_eval(client)
-    print("Evaluation run override successful")
-    print("*" * 40)
+    # print("Testing evaluation run override")
+    # test_override_eval(client)
+    # print("Evaluation run override successful")
+    # print("*" * 40)
 
-    print("Testing dataset evaluation")
-    test_evaluate_dataset(ui_client)
-    print("Dataset evaluation successful")
-    print("*" * 40)
+    # print("Testing dataset evaluation")
+    # test_evaluate_dataset(ui_client)
+    # print("Dataset evaluation successful")
     # print("*" * 40)
 
-    print("Testing classifier scorer")
-    test_classifier_scorer(ui_client)
-    print("Classifier scorer test successful")
-    print("*" * 40)
+    # print("Testing classifier scorer")
+    # test_classifier_scorer(ui_client)
+    # print("Classifier scorer test successful")
+    # print("*" * 40)
 
     print("All tests passed successfully")