Span-level evals additional features #34

Merged
37 commits merged on Jan 21, 2025

Commits (37)
f51cbf3
Small changes.
JCamyre Jan 8, 2025
1bd3197
Add a span_type field to traces, to specify between LLM calls, evalua…
JCamyre Jan 8, 2025
cf2adbb
Pass span_type's into @judgment.observe()'s.
JCamyre Jan 8, 2025
1d3f9ff
Fix span_types not being passed in for all observe() cases.
JCamyre Jan 8, 2025
dcff0f9
Fix depth count issues with spans.
JCamyre Jan 8, 2025
ccc9171
Add span_type to the TraceEntry 'to dictionary' function so that span…
JCamyre Jan 9, 2025
fa9bec7
Remove debugging print statements.
JCamyre Jan 9, 2025
c57a0c7
Update prompt_scorer notebook docs to proper python version.
JCamyre Jan 10, 2025
e3d272a
Add e2e test for editing, updating, and pushing a classifier scorer.
JCamyre Jan 10, 2025
7fe7337
Private functions for e2etests/test_prompt_scoring.py.
JCamyre Jan 10, 2025
0bb2f91
Fix unit tests which were accessing old private method names.
JCamyre Jan 10, 2025
5ac335d
Privatize methods and use new method name.
JCamyre Jan 10, 2025
7d19cd1
Update unit test and unit test mock object to use private method names.
JCamyre Jan 10, 2025
c48d072
More privatization.
JCamyre Jan 10, 2025
1c33164
Add update functions for ClassifierScorer.
JCamyre Jan 10, 2025
941ecbb
Add Judgment Client method to push classifier scorers from SDK side.
JCamyre Jan 10, 2025
9a9f28e
Add sleep to make llm_call function more realistic. Pass in project n…
JCamyre Jan 10, 2025
ea895ef
Add project name field to traces.
JCamyre Jan 10, 2025
78a5857
Remove judgment client test changes.
JCamyre Jan 13, 2025
22f8f17
Add automatic eval run name generation. Don't allow empty Trace name.…
JCamyre Jan 13, 2025
fe07188
Change trace and project name. Specify overwrite kwarg.
JCamyre Jan 16, 2025
d756a4f
Add and pass arguments for logic relating to saving and overwriting t…
JCamyre Jan 16, 2025
8d214d0
Add error handling from save trace API call.
JCamyre Jan 16, 2025
5be3b0d
Remove logic related to actual_eval_run_name. Add logic for receiving…
JCamyre Jan 16, 2025
f526528
Add comments for pull_eval. Properly handle receiving updated fetch e…
JCamyre Jan 16, 2025
cc66f54
Add new fields to ScoringResult, needed for linking between trace and…
JCamyre Jan 16, 2025
4aa109c
Merge branch 'joseph/improve-trace-pages' into joseph/span-level-evals
JCamyre Jan 17, 2025
c9043ff
Merge branch 'joseph/simplify-classifier-scorers' into joseph/span-le…
JCamyre Jan 17, 2025
c3f3cca
Merge branch 'joseph/eval-run-name-uniqueness' into joseph/span-level…
JCamyre Jan 17, 2025
5e00fa0
Add demo folder. Add Patronus tracing workflow for comparison in demos.
JCamyre Jan 19, 2025
684b8ce
Add Patronus library, needed for Patronus demo.
JCamyre Jan 19, 2025
8829ab9
Make tracer test evals make more contextual sense.
JCamyre Jan 19, 2025
74b62a0
Remove print statement.
JCamyre Jan 19, 2025
951858a
Merge branch 'main' into joseph/span-level-evals
JCamyre Jan 19, 2025
1d20648
Merge branch 'main' of https://github.com/JudgmentLabs/judgeval into …
JCamyre Jan 19, 2025
b01c104
Fix failing UT's.
JCamyre Jan 20, 2025
22ebf64
Fix test_condense_trace UT.
JCamyre Jan 20, 2025
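
Taken together, these commits add a span_type field to trace entries and thread it through @judgment.observe, so LLM calls, tools, and evaluations can be told apart on the trace page. A condensed sketch of the resulting pattern, based on the e2etests/test_tracer.py diff further down; the Tracer/wrap import path and the AnswerRelevancyScorer threshold are assumptions, since neither appears in this diff:

import os
import time
from openai import OpenAI
from judgeval.common.tracer import Tracer, wrap      # import path assumed; not shown in this PR
from judgeval.scorers import AnswerRelevancyScorer   # import taken from the test_tracer.py diff

judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
openai_client = wrap(OpenAI())  # wrapped client, as in the test file below

@judgment.observe(span_type="llm")    # new span_type kwarg: mark this span as an LLM call
def draft_reply(question: str) -> str:
    time.sleep(1.0)                   # stand-in for a real model call, mirroring llm_call() below
    return "We have a 30 day full refund policy on shoes."

@judgment.observe(span_type="tool")   # non-LLM steps carry their own span type
async def answer_question(question: str) -> str:
    reply = draft_reply(question)
    # Span-level evaluation attached to the current trace (pattern from test_tracer.py below);
    # it must run inside a judgment.trace(...) block -- see the sketch at the end of the diff.
    await judgment.get_current_trace().async_evaluate(
        scorers=[AnswerRelevancyScorer(threshold=0.5)],
        input=question,
        actual_output=reply,
        model="gpt-4o-mini",
        log_results=True,
    )
    return reply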
1 change: 1 addition & 0 deletions Pipfile
@@ -16,6 +16,7 @@ pandas = "*"
openai = "*"
together = "*"
anthropic = "*"
patronus = "*"

[dev-packages]
pytest = "*"
96 changes: 96 additions & 0 deletions demo/test_competitors.py
@@ -0,0 +1,96 @@
from dotenv import load_dotenv
from patronus import Client
import os
import asyncio
import time
from openai import OpenAI
from anthropic import Anthropic

load_dotenv()

PATRONUS_API_KEY = os.getenv("PATRONUS_API_KEY")

client = Client(api_key=PATRONUS_API_KEY)

# Initialize clients
openai_client = OpenAI()
anthropic_client = Anthropic()

async def make_upper(input: str) -> str:
output = input.upper()
result = client.evaluate(
evaluator="answer-relevance",
criteria="patronus:answer-relevance",
evaluated_model_input=input,
evaluated_model_output=output,
threshold=0.5,
model="gpt-4o-mini",
log_results=True
)
return output

def llm_call(input):
time.sleep(1.3)
return "We have a 30 day full refund policy on shoes."

async def answer_user_question(input):
output = llm_call(input)
result = client.evaluate(
evaluator="answer-relevance",
criteria="patronus:answer-relevance",
evaluated_model_input=input,
evaluated_model_output=output,
evaluated_model_retrieved_context=["All customers are eligible for a 30 day full refund at no extra cost."],
expected_output="We offer a 30-day full refund at no extra cost.",
threshold=0.5,
model="gpt-4o-mini",
log_results=True
)
return output

async def make_poem(input: str) -> str:
try:
# Using Anthropic API
anthropic_response = anthropic_client.messages.create(
model="claude-3-sonnet-20240229",
messages=[{"role": "user", "content": input}],
max_tokens=30
)
anthropic_result = anthropic_response.content[0].text

result = client.evaluate(
evaluator="answer-relevance",
criteria="patronus:answer-relevance",
evaluated_model_input=input,
evaluated_model_output=anthropic_result,
threshold=0.5,
model="gpt-4o-mini",
log_results=True
)

# Using OpenAI API
openai_response = openai_client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "Make a short sentence with the input."},
{"role": "user", "content": input}
]
)
openai_result = openai_response.choices[0].message.content

return f"{anthropic_result} {openai_result}".lower()

except Exception as e:
print(f"Error generating poem: {e}")
return ""

async def test_evaluation_mixed(input):
upper = await make_upper(input)
result = await make_poem(upper)
await answer_user_question("What if these shoes don't fit?")
return result

if __name__ == "__main__":
test_input = "Write a poem about Nissan R32 GTR"
asyncio.run(test_evaluation_mixed(test_input))

2 changes: 1 addition & 1 deletion docs/notebooks/prompt_scorer.ipynb
@@ -157,7 +157,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
"version": "3.11.4"
}
},
"nbformat": 4,
70 changes: 34 additions & 36 deletions e2etests/judgment_client_test.py
@@ -16,6 +16,8 @@
import random
import string

from judgeval.scorers.prompt_scorer import ClassifierScorer

load_dotenv()

def get_client():
@@ -35,36 +37,32 @@ def test_dataset(client: JudgmentClient):
print(dataset)

def test_run_eval(client: JudgmentClient):
# Single step in our workflow, an outreach Sales Agent

example1 = Example(
input="What if these shoes don't fit?",
actual_output="We offer a 30-day full refund at no extra cost.",
retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.",
actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex",
retrieval_context=["TechCorp launched AI analytics platform in 2024", "Sarah Chen is CEO, ex-Google executive", "Current client base: 50+ enterprise customers"],
)

example2 = Example(
input="How do I reset my password?",
actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
name="Password Reset",
context=["User Account"],
retrieval_context=["Password reset instructions"],
tools_called=["authentication"],
expected_tools=["authentication"],
additional_metadata={"difficulty": "medium"}
input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.",
actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex",
expected_output="A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans",
context=["Business Development"],
retrieval_context=["GreenEnergy Solutions won 2023 sustainability award", "New solar technology 30% more efficient", "Planning European market expansion"],
)

scorer = FaithfulnessScorer(threshold=0.5)
scorer2 = HallucinationScorer(threshold=0.5)
c_scorer = CustomFaithfulnessMetric(threshold=0.6)

PROJECT_NAME = "test_project_JOSEPH"
EVAL_RUN_NAME = "yomadude"
PROJECT_NAME = "OutreachWorkflow"
EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt"

_ = client.run_evaluation(
client.run_evaluation(
examples=[example1, example2],
scorers=[scorer, c_scorer],
scorers=[scorer, scorer2],
model="QWEN",
metadata={"batch": "test"},
project_name=PROJECT_NAME,
@@ -146,8 +144,6 @@ def test_override_eval(client: JudgmentClient):
if "already exists" not in str(e):
raise
print(f"Successfully caught expected error: {e}")



def test_evaluate_dataset(client: JudgmentClient):

@@ -194,8 +190,10 @@ def test_classifier_scorer(client: JudgmentClient):
examples=[example1],
scorers=[classifier_scorer, faithfulness_scorer],
model="QWEN",
log_results=True,
eval_run_name="ToneScorerTest",
project_name="ToneScorerTest",
)
print(res)

if __name__ == "__main__":
# Test client functionality
@@ -204,29 +202,29 @@ def test_classifier_scorer(client: JudgmentClient):
print("Client initialized successfully")
print("*" * 40)

print("Testing dataset creation, pushing, and pulling")
test_dataset(ui_client)
print("Dataset creation, pushing, and pulling successful")
print("*" * 40)
# print("Testing dataset creation, pushing, and pulling")
# test_dataset(ui_client)
# print("Dataset creation, pushing, and pulling successful")
# print("*" * 40)

print("Testing evaluation run")
test_run_eval(ui_client)
print("Evaluation run successful")
print("*" * 40)

print("Testing evaluation run override")
test_override_eval(client)
print("Evaluation run override successful")
print("*" * 40)
# print("Testing evaluation run override")
# test_override_eval(client)
# print("Evaluation run override successful")
# print("*" * 40)

print("Testing dataset evaluation")
test_evaluate_dataset(ui_client)
print("Dataset evaluation successful")
print("*" * 40)
# print("Testing dataset evaluation")
# test_evaluate_dataset(ui_client)
# print("Dataset evaluation successful")
# print("*" * 40)

print("Testing classifier scorer")
test_classifier_scorer(ui_client)
print("Classifier scorer test successful")
print("*" * 40)
# print("Testing classifier scorer")
# test_classifier_scorer(ui_client)
# print("Classifier scorer test successful")
# print("*" * 40)

print("All tests passed successfully")
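
The reworked test_run_eval above doubles as a reference for the run-name changes in this PR (auto-generated eval run names, overwrite handling). A trimmed-down sketch of the same call; the JudgmentClient and Example import paths and the no-argument constructor are assumptions, while the run_evaluation keyword arguments are the ones visible in the diff:

from judgeval import JudgmentClient            # import path assumed
from judgeval.data import Example              # import path assumed
from judgeval.scorers import FaithfulnessScorer

client = JudgmentClient()  # assumed to pick up JUDGMENT_API_KEY from the environment

example = Example(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
)

client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.5)],
    model="QWEN",
    metadata={"batch": "test"},
    log_results=True,
    project_name="OutreachWorkflow",                       # names taken from the diff above
    eval_run_name="ColdEmailGenerator-Improve-BasePrompt",
)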
8 changes: 4 additions & 4 deletions e2etests/test_prompt_scoring.py
@@ -36,7 +36,7 @@ def __init__(
)
self.score = 0.0

def build_measure_prompt(self, example: Example):
def _build_measure_prompt(self, example: Example):
SYSTEM_ROLE = (
'You are a great judge of emotional intelligence. You understand the feelings '
'and intentions of others. You will be tasked with judging whether the following '
@@ -51,16 +51,16 @@ def build_measure_prompt(self, example: Example):
]
return conversation

def build_schema(self):
def _build_schema(self):
return {
"score": int,
"reason": str
}

def process_response(self, response):
def _process_response(self, response):
return response["score"], response["reason"]

def success_check(self):
def _success_check(self):
POSITIVITY_THRESHOLD = 3 # we want all model responses to be somewhat positive in tone
return self.score <= POSITIVITY_THRESHOLD

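
The renames above make the prompt-scorer hook methods private (_build_measure_prompt, _build_schema, _process_response, _success_check). A minimal subclass following the new naming; the PromptScorer base class and both import paths are assumptions (only ClassifierScorer is imported elsewhere in this PR), and the positivity rubric is invented for illustration. The constructor is omitted here; see the full scorer class in e2etests/test_prompt_scoring.py.

from judgeval.data import Example                         # import path assumed
from judgeval.scorers.prompt_scorer import PromptScorer   # base class and path assumed

class PositiveToneScorer(PromptScorer):
    """Judges whether a response reads as positive in tone (illustrative only)."""

    def _build_measure_prompt(self, example: Example):
        # Conversation sent to the judge model.
        return [
            {"role": "system", "content": "Rate the positivity of the response on a 1-5 scale."},
            {"role": "user", "content": f"Response: {example.actual_output}"},
        ]

    def _build_schema(self):
        # Shape of the JSON object the judge model is expected to return.
        return {"score": int, "reason": str}

    def _process_response(self, response):
        return response["score"], response["reason"]

    def _success_check(self):
        return self.score >= 3  # threshold picked for illustration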
27 changes: 20 additions & 7 deletions e2etests/test_tracer.py
@@ -14,11 +14,11 @@
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

# Initialize the tracer and clients
judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
judgment = Tracer(api_key=os.getenv("UI_JUDGMENT_API_KEY"))
openai_client = wrap(OpenAI())
anthropic_client = wrap(Anthropic())

@judgment.observe
@judgment.observe(span_type="tool")
async def make_upper(input: str) -> str:
"""Convert input to uppercase and evaluate using judgment API.

@@ -28,6 +28,7 @@ async def make_upper(input: str) -> str:
The uppercase version of the input string
"""
output = input.upper()

await judgment.get_current_trace().async_evaluate(
scorers=[FaithfulnessScorer(threshold=0.5)],
input="What if these shoes don't fit?",
@@ -38,9 +39,10 @@
model="gpt-4o-mini",
log_results=True
)

return output

@judgment.observe
@judgment.observe(span_type="tool")
async def make_lower(input):
output = input.lower()

@@ -59,11 +61,12 @@ async def make_lower(input):
)
return output

@judgment.observe
@judgment.observe(span_type="llm")
def llm_call(input):
time.sleep(1.3)
return "We have a 30 day full refund policy on shoes."

@judgment.observe
@judgment.observe(span_type="tool")
async def answer_user_question(input):
output = llm_call(input)
await judgment.get_current_trace().async_evaluate(
@@ -77,7 +80,7 @@ async def answer_user_question(input):
)
return output

@judgment.observe
@judgment.observe(span_type="tool")
async def make_poem(input: str) -> str:
"""Generate a poem using both Anthropic and OpenAI APIs.

@@ -95,6 +98,15 @@ async def make_poem(input: str) -> str:
)
anthropic_result = anthropic_response.content[0].text

await judgment.get_current_trace().async_evaluate(
input=input,
actual_output=anthropic_result,
score_type=APIScorer.ANSWER_RELEVANCY,
threshold=0.5,
model="gpt-4o-mini",
log_results=True
)

# Using OpenAI API
openai_response = openai_client.chat.completions.create(
model="gpt-4o-mini",
@@ -112,7 +124,8 @@
return ""

async def test_evaluation_mixed(input):
with judgment.trace("test_evaluation") as trace:
PROJECT_NAME = "NewPoemBot"
with judgment.trace("Use-claude", project_name=PROJECT_NAME, overwrite=True) as trace:
upper = await make_upper(input)
result = await make_poem(upper)
await answer_user_question("What if these shoes don't fit?")
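
The trace context manager itself also changed: traces now carry a project_name, empty trace names are rejected, and an overwrite flag accompanies the new save/overwrite logic (see the commits above). A minimal driver for the functions defined in this file; the trace and project names are placeholders.

import asyncio

async def run_demo() -> str:
    # project_name and overwrite are the kwargs exercised by test_evaluation_mixed above.
    with judgment.trace("refund-qa-demo", project_name="ShoeBot", overwrite=True) as trace:
        return await answer_user_question("What if these shoes don't fit?")  # observed function from this file

if __name__ == "__main__":
    print(asyncio.run(run_demo()))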