Skip to content

Commit 37545f7

Browse files
committed
Merge branch 'alex/add-unit-tests' of https://github.com/JudgmentLabs/judgeval into alex/add-unit-tests
2 parents e805737 + 95319e8 commit 37545f7

File tree

7 files changed

+272
-155
lines changed

7 files changed

+272
-155
lines changed

.github/workflows/ci.yaml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
name: CI

# Run the unit tests when a PR targeting main is opened, reopened, or updated,
# and again when a review is submitted, so merges wait on a passing test run.
on:
  pull_request:
    # `synchronize` re-runs CI when new commits are pushed to an open PR;
    # with only [opened, reopened], later pushes would never be tested.
    types: [opened, reopened, synchronize]
    branches:
      - main
  pull_request_review:
    # NOTE: GitHub Actions does not support a `branches` filter on
    # pull_request_review events, so none is declared here.
    types: [submitted]

jobs:
  run-tests:
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest]
        python-version:
          - "3.11"
    name: Test
    runs-on: ${{ matrix.os }}
    env:
      PYTHONPATH: "."
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          pip install pipenv
          pipenv install --dev

      - name: Run tests
        run: |
          pipenv run pytest

Pipfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,16 @@ python-dotenv = "==1.0.1"
1111
together = "*"
1212
fastapi = "*"
1313
uvicorn = "*"
14-
pytest = "*"
1514
deepeval = "*"
1615
supabase = "*"
1716
requests = "*"
1817
pandas = "*"
1918
anthropic = "*"
2019

2120
[dev-packages]
# Test-only dependencies; installed in CI via `pipenv install --dev`.
pytest = "*"
pytest-asyncio = "*"
pytest-mock = "*"

[requires]
python_version = "3.11"

e2etests/test_tracer.py

Lines changed: 46 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,56 @@
import os
import time

from openai import OpenAI
from together import Together
from anthropic import Anthropic
from judgeval.common.tracer import Tracer, wrap

# Initialize the tracer and the API clients.
# BUG FIX: `os` was used below via os.getenv() but never imported, which made
# this module raise NameError on import. `wrap` instruments each client so its
# API calls are captured in judgment traces.
judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
openai_client = wrap(OpenAI())
anthropic_client = wrap(Anthropic())
412

5-
# @tracer.observe(name="generate_movie_review", top_level=True)
6-
def generate_movie_review(summary: str) -> str:
@judgment.observe
def make_upper(input):
    """Return *input* converted to upper case (call is traced via judgment)."""
    upper_cased = input.upper()
    return upper_cased
716

8-
trace = tracer.start_trace()
9-
# Analyze key elements
10-
plot_quality = analyze_plot(summary)
11-
trace.print_trace()
12-
engagement = analyze_engagement(summary)
13-
originality = analyze_originality(summary)
14-
trace.print_trace()
15-
16-
# Generate final review
17-
review = compose_review(plot_quality, engagement, originality)
18-
return review
19-
20-
@tracer.observe(name="analyze_plot")
21-
def analyze_plot(summary: str) -> dict:
22-
# Analyze plot elements like structure, pacing, coherence
23-
return {
24-
"structure": 8, # 1-10 rating
25-
"pacing": 7,
26-
"coherence": 9,
27-
"notes": "Well structured plot with good pacing"
28-
}
29-
30-
@tracer.observe(name="analyze_engagement")
31-
def analyze_engagement(summary: str) -> dict:
32-
# Analyze how engaging/interesting the story seems
33-
return {
34-
"interest_level": 8,
35-
"emotional_impact": 7,
36-
"memorability": 8,
37-
"notes": "Engaging story with emotional resonance"
38-
}
39-
40-
@tracer.observe(name="analyze_originality")
41-
def analyze_originality(summary: str) -> dict:
42-
# Analyze uniqueness and creativity
43-
return {
44-
"uniqueness": 6,
45-
"creativity": 7,
46-
"innovation": 5,
47-
"notes": "Some fresh elements but follows familiar patterns"
48-
}
@judgment.observe
def make_lower(input):
    """Return *input* converted to lower case (call is traced via judgment)."""
    lower_cased = input.lower()
    return lower_cased
4920

50-
@tracer.observe(name="compose_review")
51-
def compose_review(plot: dict, engagement: dict, originality: dict) -> str:
52-
# Calculate overall score
53-
plot_score = sum([plot["structure"], plot["pacing"], plot["coherence"]]) / 3
54-
engagement_score = sum([engagement["interest_level"],
55-
engagement["emotional_impact"],
56-
engagement["memorability"]]) / 3
57-
originality_score = sum([originality["uniqueness"],
58-
originality["creativity"],
59-
originality["innovation"]]) / 3
@judgment.observe
def make_poem(input):
    """Generate text about *input* with Anthropic and OpenAI, then return the
    lower-cased concatenation of both completions."""
    # Ask Claude for a short (<= 30 token) completion of the raw input.
    claude_reply = anthropic_client.messages.create(
        model="claude-3-sonnet-20240229",
        messages=[{"role": "user", "content": input}],
        max_tokens=30,
    )
    claude_text = claude_reply.content[0].text

    # Ask OpenAI to build a short sentence around the same input.
    gpt_reply = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Make a short sentence with the input."},
            {"role": "user", "content": input},
        ],
    )
    gpt_text = gpt_reply.choices[0].message.content
    print(gpt_text)

    combined = claude_text + gpt_text
    return make_lower(combined)
8047

81-
result = generate_movie_review(summary)
def test_evaluation_mixed(input):
    """Drive the traced pipeline end to end, then print and persist the trace."""
    # NOTE(review): the original indentation was lost in this paste; this
    # assumes print/save run inside the trace context — confirm against repo.
    with judgment.trace("test_evaluation") as trace:
        poem = make_poem(make_upper(input))
        trace.print()
        trace.save()
        return poem


# Module-level smoke invocation (executes on import).
result3 = test_evaluation_mixed("hello the world is flat")

0 commit comments

Comments
 (0)