Add Span Level Evals (multi-step evaluation) #32

Merged: 22 commits, merged Jan 7, 2025
Commits
7d0b6df
Add function declaration for an async run evaluation function for the…
JCamyre Jan 5, 2025
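Per the message, this commit only adds a declaration for the async run-evaluation entry point. A minimal sketch of what such a declaration could look like; the function name and parameters are illustrative assumptions, not taken from the repository.

# Hypothetical sketch: declaration of an async entry point that submits one
# span-level evaluation. All names and parameters are assumptions.
from typing import List, Optional

async def run_evaluation(
    input: str,
    actual_output: str,
    score_type: str,
    threshold: float = 0.5,
    model: str = "gpt-4o-mini",
    log_results: bool = True,
    retrieval_context: Optional[List[str]] = None,
) -> dict:
    """Submit a single evaluation request and return the scorer's result."""
    raise NotImplementedError  # declaration only at this point in the PR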
9ffce76
Add sample span-level evaluation call to a traced function.
JCamyre Jan 5, 2025
a360428
Add function to asynchronously evaluate on the span-level, calling Ju…
JCamyre Jan 5, 2025
eacb092
Clean up Tracer's __init__ function. Add function to return the curre…
JCamyre Jan 5, 2025
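The test file below relies on judgment.get_current_trace(). A minimal sketch of how a tracer might expose the active trace through a context variable; only the method name comes from the test code, the internals here are assumptions.

# Sketch (assumed internals): keep the active trace in a contextvar so traced
# functions can reach it without passing it around explicitly.
import contextvars

_current_trace = contextvars.ContextVar("current_trace", default=None)

class TracerSketch:
    def get_current_trace(self):
        """Return the trace that is active in the current context, if any."""
        return _current_trace.get()

    def _set_current_trace(self, trace):
        _current_trace.set(trace)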
75a37dc
Add a better evaluation for testing. Fix tracer_id missing issue by c…
JCamyre Jan 5, 2025
10ebcc0
Add evaluation_result to TraceEntry.
JCamyre Jan 5, 2025
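A sketch of what attaching an evaluation result to a trace entry could look like; apart from evaluation_result, the field names are assumptions for illustration, not the repo's actual model.

# Illustrative sketch only: a trace entry that can optionally carry the result
# of a span-level evaluation once the async call completes.
from typing import Any, Optional
from pydantic import BaseModel

class TraceEntrySketch(BaseModel):
    function: str
    depth: int = 0
    timestamp: float = 0.0
    output: Any = None
    evaluation_result: Optional[dict] = None  # filled in when async_evaluate finishes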
dedd993
Fix some run eval logic. Add logic to record Evaluation results.
JCamyre Jan 5, 2025
dab6420
Add temp fix to handle out of order entries (due to async evaluations…
JCamyre Jan 5, 2025
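Async evaluations can complete after later entries have already been recorded, so a temporary fix could reorder entries by timestamp before printing or saving the trace. A hedged sketch, with the entry shape assumed:

# Sketch of one way to tolerate out-of-order entries: sort by the time each
# entry was recorded. The dict shape is an assumption for illustration.
def sort_entries(entries: list) -> list:
    return sorted(entries, key=lambda entry: entry["timestamp"])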
e88d8de
Mute unnecessary Pydantic type warnings.
JCamyre Jan 6, 2025
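One generic way to silence a noisy warning is a warnings filter; shown here only as a sketch, since the exact filter (category, message, or per-model config) used in the commit is not known.

# Generic sketch: suppress warnings whose message mentions Pydantic.
import warnings

warnings.filterwarnings("ignore", message=".*Pydantic.*")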
cb3ef4a
Small changes.
JCamyre Jan 6, 2025
70fe50c
Add an evaluation to make_lower function, add corresponding async/await.
JCamyre Jan 6, 2025
9260d63
Properly handle asynchronous functions' outputs. Remove _sort_entries…
JCamyre Jan 6, 2025
f151ce7
Add new traced function that uses the input and output in the evaluat…
JCamyre Jan 6, 2025
e33c2ad
Track and display evaluation time.
JCamyre Jan 6, 2025
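Timing an awaited evaluation is straightforward with time.perf_counter(); a minimal sketch, where run_eval stands in for whatever coroutine performs the evaluation.

# Minimal sketch: measure and report how long an awaited evaluation takes.
import time

async def timed_eval(run_eval):
    start = time.perf_counter()
    result = await run_eval()
    elapsed = time.perf_counter() - start
    print(f"Evaluation took {elapsed:.2f}s")
    return result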
36ed129
Improve style and formatting. Add comments. Use normal prompt to test…
JCamyre Jan 6, 2025
ca5ccc8
Change an evaluation to an appropriate metric given the Q&A. Change po…
JCamyre Jan 6, 2025
aa708dc
Add necessary imports, add comments, use literals to better structure…
JCamyre Jan 6, 2025
44135a4
Make to_dict more modular. Make cheeky boolean checking change.
JCamyre Jan 6, 2025
567a3f6
Modularize LLM API wrap function.
JCamyre Jan 6, 2025
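The tests call wrap(OpenAI()) and wrap(Anthropic()). A hedged sketch of how a modularized wrap function might dispatch to provider-specific helpers; the helper names and bodies are assumptions.

# Sketch (assumed structure): dispatch wrapping per client type instead of one
# monolithic function. Helper names are illustrative.
def _wrap_openai(client):
    # hypothetical: patch client.chat.completions.create to record spans
    return client

def _wrap_anthropic(client):
    # hypothetical: patch client.messages.create to record spans
    return client

def wrap_sketch(client):
    """Route each supported client to its provider-specific wrapper."""
    wrappers = {"OpenAI": _wrap_openai, "Anthropic": _wrap_anthropic}
    name = type(client).__name__
    if name not in wrappers:
        raise TypeError(f"Unsupported client type: {name}")
    return wrappers[name](client)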
e3fbd94
Remove from this PR, doesn't apply.
JCamyre Jan 6, 2025
0dcc2be
Mock requests so that unit tests pass.
JCamyre Jan 7, 2025
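A common pattern for this is patching requests.post in the unit tests so no network calls are made; a generic sketch, with the patch target and response shape assumed rather than taken from the repo's tests.

# Generic sketch of mocking HTTP calls in a unit test.
from unittest.mock import patch, MagicMock

def test_traced_function_without_network():
    fake_response = MagicMock(status_code=200)
    fake_response.json.return_value = {"score": 1.0}
    with patch("requests.post", return_value=fake_response):
        # call the code under test here; it sees the mocked response
        pass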
a2b7e3c
Remove unnecessary mocks. Add explicit mocks to specific tests when n…
JCamyre Jan 7, 2025
e2etests/test_tracer.py: 140 changes (108 additions, 32 deletions; changes shown from 20 commits)

Updated file:
# Standard library imports
import os
import time
import asyncio

# Third-party imports
from openai import OpenAI
from together import Together
from anthropic import Anthropic

# Local imports
from judgeval.common.tracer import Tracer, wrap
from judgeval.constants import APIScorer

# Initialize the tracer and clients
judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
openai_client = wrap(OpenAI())
anthropic_client = wrap(Anthropic())

@judgment.observe
async def make_upper(input: str) -> str:
    """Convert input to uppercase and evaluate using judgment API.

    Args:
        input: The input string to convert
    Returns:
        The uppercase version of the input string
    """
    output = input.upper()
    await judgment.get_current_trace().async_evaluate(
        input="What if these shoes don't fit?",
        actual_output="We offer a 30-day full refund at no extra cost.",
        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
        expected_output="We offer a 30-day full refund at no extra cost.",
        expected_tools=["refund"],
        score_type=APIScorer.FAITHFULNESS,
        threshold=0.5,
        model="gpt-4o-mini",
        log_results=True
    )
    return output

@judgment.observe
async def make_lower(input):
    output = input.lower()

    await judgment.get_current_trace().async_evaluate(
        input="How do I reset my password?",
        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
        context=["User Account"],
        retrieval_context=["Password reset instructions"],
        tools_called=["authentication"],
        expected_tools=["authentication"],
        additional_metadata={"difficulty": "medium"},
        score_type=APIScorer.ANSWER_RELEVANCY,
        threshold=0.5,
        model="gpt-4o-mini",
        log_results=True
    )
    return output

@judgment.observe
def llm_call(input):
    return "We have a 30 day full refund policy on shoes."

@judgment.observe
async def answer_user_question(input):
    output = llm_call(input)
    await judgment.get_current_trace().async_evaluate(
        input=input,
        actual_output=output,
        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
        expected_output="We offer a 30-day full refund at no extra cost.",
        score_type=APIScorer.ANSWER_RELEVANCY,
        threshold=0.5,
        model="gpt-4o-mini",
        log_results=True
    )
    return output

@judgment.observe
async def make_poem(input: str) -> str:
    """Generate a poem using both Anthropic and OpenAI APIs.

    Args:
        input: The prompt for poem generation
    Returns:
        Combined and lowercase version of both API responses
    """
    try:
        # Using Anthropic API
        anthropic_response = anthropic_client.messages.create(
            model="claude-3-sonnet-20240229",
            messages=[{"role": "user", "content": input}],
            max_tokens=30
        )
        anthropic_result = anthropic_response.content[0].text

        # Using OpenAI API
        openai_response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "Make a short sentence with the input."},
                {"role": "user", "content": input}
            ]
        )
        openai_result = openai_response.choices[0].message.content

        return await make_lower(f"{anthropic_result} {openai_result}")

    except Exception as e:
        print(f"Error generating poem: {e}")
        return ""

async def test_evaluation_mixed(input):
    with judgment.trace("test_evaluation") as trace:
        upper = await make_upper(input)
        result = await make_poem(upper)
        await answer_user_question("What if these shoes don't fit?")

        trace.print()
        trace.save()

    return result

if __name__ == "__main__":
    # Use a more meaningful test input
    test_input = "Write a poem about Nissan R32 GTR"
    asyncio.run(test_evaluation_mixed(test_input))