Skip to content

Commit 37545f7

Browse files
committed
Merge branch 'alex/add-unit-tests' of https://github.com/JudgmentLabs/judgeval into alex/add-unit-tests
2 parents e805737 + 95319e8 commit 37545f7

File tree

7 files changed

+272
-155
lines changed

7 files changed

+272
-155
lines changed

.github/workflows/ci.yaml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
name: CI

# Run the unit tests when a PR targeting main is opened, reopened, or updated,
# and again when a review is submitted, so merges wait on a passing test run.
on:
  pull_request:
    # `synchronize` re-runs CI when new commits are pushed to an open PR;
    # with only [opened, reopened], later pushes would never be tested.
    types: [opened, reopened, synchronize]
    branches:
      - main
  pull_request_review:
    # NOTE: GitHub Actions does not support a `branches` filter on
    # pull_request_review events, so none is declared here.
    types: [submitted]

jobs:
  run-tests:
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest]
        python-version:
          - "3.11"
    name: Test
    runs-on: ${{ matrix.os }}
    env:
      PYTHONPATH: "."
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          pip install pipenv
          pipenv install --dev

      - name: Run tests
        run: |
          pipenv run pytest

Pipfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,16 @@ python-dotenv = "==1.0.1"
1111
together = "*"
1212
fastapi = "*"
1313
uvicorn = "*"
14-
pytest = "*"
1514
deepeval = "*"
1615
supabase = "*"
1716
requests = "*"
1817
pandas = "*"
1918
anthropic = "*"
2019

2120
[dev-packages]
# Test-only dependencies; installed in CI via `pipenv install --dev`.
pytest = "*"
pytest-asyncio = "*"
pytest-mock = "*"

[requires]
python_version = "3.11"

e2etests/test_tracer.py

Lines changed: 46 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,56 @@
import os
import time

from openai import OpenAI
from together import Together
from anthropic import Anthropic
from judgeval.common.tracer import Tracer, wrap

# Initialize the tracer and the API clients.
# BUG FIX: `os` was used below via os.getenv() but never imported, which made
# this module raise NameError on import. `wrap` instruments each client so its
# API calls are captured in judgment traces.
judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
openai_client = wrap(OpenAI())
anthropic_client = wrap(Anthropic())
412

5-
# @tracer.observe(name="generate_movie_review", top_level=True)
6-
def generate_movie_review(summary: str) -> str:
@judgment.observe
def make_upper(input):
    """Return *input* converted to upper case (call is traced via judgment)."""
    upper_cased = input.upper()
    return upper_cased
716

8-
trace = tracer.start_trace()
9-
# Analyze key elements
10-
plot_quality = analyze_plot(summary)
11-
trace.print_trace()
12-
engagement = analyze_engagement(summary)
13-
originality = analyze_originality(summary)
14-
trace.print_trace()
15-
16-
# Generate final review
17-
review = compose_review(plot_quality, engagement, originality)
18-
return review
19-
20-
@tracer.observe(name="analyze_plot")
21-
def analyze_plot(summary: str) -> dict:
22-
# Analyze plot elements like structure, pacing, coherence
23-
return {
24-
"structure": 8, # 1-10 rating
25-
"pacing": 7,
26-
"coherence": 9,
27-
"notes": "Well structured plot with good pacing"
28-
}
29-
30-
@tracer.observe(name="analyze_engagement")
31-
def analyze_engagement(summary: str) -> dict:
32-
# Analyze how engaging/interesting the story seems
33-
return {
34-
"interest_level": 8,
35-
"emotional_impact": 7,
36-
"memorability": 8,
37-
"notes": "Engaging story with emotional resonance"
38-
}
39-
40-
@tracer.observe(name="analyze_originality")
41-
def analyze_originality(summary: str) -> dict:
42-
# Analyze uniqueness and creativity
43-
return {
44-
"uniqueness": 6,
45-
"creativity": 7,
46-
"innovation": 5,
47-
"notes": "Some fresh elements but follows familiar patterns"
48-
}
@judgment.observe
def make_lower(input):
    """Return *input* converted to lower case (call is traced via judgment)."""
    lower_cased = input.lower()
    return lower_cased
4920

50-
@tracer.observe(name="compose_review")
51-
def compose_review(plot: dict, engagement: dict, originality: dict) -> str:
52-
# Calculate overall score
53-
plot_score = sum([plot["structure"], plot["pacing"], plot["coherence"]]) / 3
54-
engagement_score = sum([engagement["interest_level"],
55-
engagement["emotional_impact"],
56-
engagement["memorability"]]) / 3
57-
originality_score = sum([originality["uniqueness"],
58-
originality["creativity"],
59-
originality["innovation"]]) / 3
@judgment.observe
def make_poem(input):
    """Generate text about *input* with Anthropic and OpenAI, then return the
    lower-cased concatenation of both completions."""
    # Ask Claude for a short (<= 30 token) completion of the raw input.
    claude_reply = anthropic_client.messages.create(
        model="claude-3-sonnet-20240229",
        messages=[{"role": "user", "content": input}],
        max_tokens=30,
    )
    claude_text = claude_reply.content[0].text

    # Ask OpenAI to build a short sentence around the same input.
    gpt_reply = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Make a short sentence with the input."},
            {"role": "user", "content": input},
        ],
    )
    gpt_text = gpt_reply.choices[0].message.content
    print(gpt_text)

    combined = claude_text + gpt_text
    return make_lower(combined)
8047

81-
result = generate_movie_review(summary)
def test_evaluation_mixed(input):
    """Drive the traced pipeline end to end, then print and persist the trace."""
    # NOTE(review): the original indentation was lost in this paste; this
    # assumes print/save run inside the trace context — confirm against repo.
    with judgment.trace("test_evaluation") as trace:
        poem = make_poem(make_upper(input))
        trace.print()
        trace.save()
        return poem


# Module-level smoke invocation (executes on import).
result3 = test_evaluation_mixed("hello the world is flat")

0 commit comments

Comments
 (0)