
Commit 4587969

Merge pull request #34 from JudgmentLabs/joseph/span-level-evals
Additional features for span-level (multi-step) evaluations
2 parents a2c541c + 22ebf64 commit 4587969

19 files changed: +437 additions, -200 deletions

Pipfile

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ pandas = "*"
 openai = "*"
 together = "*"
 anthropic = "*"
+patronus = "*"

 [dev-packages]
 pytest = "*"

demo/test_competitors.py

Lines changed: 96 additions & 0 deletions

@@ -0,0 +1,96 @@
+from dotenv import load_dotenv
+from patronus import Client
+import os
+import asyncio
+import time
+from openai import OpenAI
+from anthropic import Anthropic
+
+load_dotenv()
+
+PATRONUS_API_KEY = os.getenv("PATRONUS_API_KEY")
+
+client = Client(api_key=PATRONUS_API_KEY)
+
+# Initialize clients
+openai_client = OpenAI()
+anthropic_client = Anthropic()
+
+async def make_upper(input: str) -> str:
+    output = input.upper()
+    result = client.evaluate(
+        evaluator="answer-relevance",
+        criteria="patronus:answer-relevance",
+        evaluated_model_input=input,
+        evaluated_model_output=output,
+        threshold=0.5,
+        model="gpt-4o-mini",
+        log_results=True
+    )
+    return output
+
+def llm_call(input):
+    time.sleep(1.3)
+    return "We have a 30 day full refund policy on shoes."
+
+async def answer_user_question(input):
+    output = llm_call(input)
+    result = client.evaluate(
+        evaluator="answer-relevance",
+        criteria="patronus:answer-relevance",
+        evaluated_model_input=input,
+        evaluated_model_output=output,
+        evaluated_model_retrieved_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+        expected_output="We offer a 30-day full refund at no extra cost.",
+        threshold=0.5,
+        model="gpt-4o-mini",
+        log_results=True
+    )
+    return output
+
+async def make_poem(input: str) -> str:
+    try:
+        # Using Anthropic API
+        anthropic_response = anthropic_client.messages.create(
+            model="claude-3-sonnet-20240229",
+            messages=[{"role": "user", "content": input}],
+            max_tokens=30
+        )
+        anthropic_result = anthropic_response.content[0].text
+
+        result = client.evaluate(
+            evaluator="answer-relevance",
+            criteria="patronus:answer-relevance",
+            evaluated_model_input=input,
+            evaluated_model_output=anthropic_result,
+            threshold=0.5,
+            model="gpt-4o-mini",
+            log_results=True
+        )
+
+        # Using OpenAI API
+        openai_response = openai_client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "Make a short sentence with the input."},
+                {"role": "user", "content": input}
+            ]
+        )
+        openai_result = openai_response.choices[0].message.content
+
+        return f"{anthropic_result} {openai_result}".lower()
+
+    except Exception as e:
+        print(f"Error generating poem: {e}")
+        return ""
+
+async def test_evaluation_mixed(input):
+    upper = await make_upper(input)
+    result = await make_poem(upper)
+    await answer_user_question("What if these shoes don't fit?")
+    return result
+
+if __name__ == "__main__":
+    test_input = "Write a poem about Nissan R32 GTR"
+    asyncio.run(test_evaluation_mixed(test_input))
+
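
This demo mirrors the multi-step workflow exercised in e2etests/test_tracer.py (make_upper, llm_call, answer_user_question, make_poem), but routes each evaluation through the Patronus client for comparison. Assuming the Pipfile above has been synced with pipenv and the keys are available (PATRONUS_API_KEY via .env, plus OPENAI_API_KEY and ANTHROPIC_API_KEY read by the respective clients), it can be run with "pipenv install" followed by "pipenv run python demo/test_competitors.py".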

docs/notebooks/prompt_scorer.ipynb

Lines changed: 1 addition & 1 deletion

@@ -157,7 +157,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,

e2etests/judgment_client_test.py

Lines changed: 34 additions & 36 deletions

@@ -16,6 +16,8 @@
 import random
 import string

+from judgeval.scorers.prompt_scorer import ClassifierScorer
+
 load_dotenv()

 def get_client():
@@ -35,36 +37,32 @@ def test_dataset(client: JudgmentClient):
     print(dataset)

 def test_run_eval(client: JudgmentClient):
+    # Single step in our workflow, an outreach Sales Agent

     example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
+        input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.",
+        actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex",
+        retrieval_context=["TechCorp launched AI analytics platform in 2024", "Sarah Chen is CEO, ex-Google executive", "Current client base: 50+ enterprise customers"],
     )

     example2 = Example(
-        input="How do I reset my password?",
-        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        name="Password Reset",
-        context=["User Account"],
-        retrieval_context=["Password reset instructions"],
-        tools_called=["authentication"],
-        expected_tools=["authentication"],
-        additional_metadata={"difficulty": "medium"}
+        input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.",
+        actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex",
+        expected_output="A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans",
+        context=["Business Development"],
+        retrieval_context=["GreenEnergy Solutions won 2023 sustainability award", "New solar technology 30% more efficient", "Planning European market expansion"],
     )

     scorer = FaithfulnessScorer(threshold=0.5)
     scorer2 = HallucinationScorer(threshold=0.5)
     c_scorer = CustomFaithfulnessMetric(threshold=0.6)

-    PROJECT_NAME = "test_project_JOSEPH"
-    EVAL_RUN_NAME = "yomadude"
+    PROJECT_NAME = "OutreachWorkflow"
+    EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt"

-    _ = client.run_evaluation(
+    client.run_evaluation(
         examples=[example1, example2],
-        scorers=[scorer, c_scorer],
+        scorers=[scorer, scorer2],
         model="QWEN",
         metadata={"batch": "test"},
         project_name=PROJECT_NAME,
@@ -146,8 +144,6 @@ def test_override_eval(client: JudgmentClient):
         if "already exists" not in str(e):
             raise
         print(f"Successfully caught expected error: {e}")
-
-

 def test_evaluate_dataset(client: JudgmentClient):

@@ -194,8 +190,10 @@ def test_classifier_scorer(client: JudgmentClient):
         examples=[example1],
         scorers=[classifier_scorer, faithfulness_scorer],
         model="QWEN",
+        log_results=True,
+        eval_run_name="ToneScorerTest",
+        project_name="ToneScorerTest",
     )
-    print(res)

 if __name__ == "__main__":
     # Test client functionality
@@ -204,29 +202,29 @@ def test_classifier_scorer(client: JudgmentClient):
     print("Client initialized successfully")
     print("*" * 40)

-    print("Testing dataset creation, pushing, and pulling")
-    test_dataset(ui_client)
-    print("Dataset creation, pushing, and pulling successful")
-    print("*" * 40)
+    # print("Testing dataset creation, pushing, and pulling")
+    # test_dataset(ui_client)
+    # print("Dataset creation, pushing, and pulling successful")
+    # print("*" * 40)

     print("Testing evaluation run")
     test_run_eval(ui_client)
     print("Evaluation run successful")
     print("*" * 40)

-    print("Testing evaluation run override")
-    test_override_eval(client)
-    print("Evaluation run override successful")
-    print("*" * 40)
+    # print("Testing evaluation run override")
+    # test_override_eval(client)
+    # print("Evaluation run override successful")
+    # print("*" * 40)

-    print("Testing dataset evaluation")
-    test_evaluate_dataset(ui_client)
-    print("Dataset evaluation successful")
-    print("*" * 40)
+    # print("Testing dataset evaluation")
+    # test_evaluate_dataset(ui_client)
+    # print("Dataset evaluation successful")
+    # print("*" * 40)

-    print("Testing classifier scorer")
-    test_classifier_scorer(ui_client)
-    print("Classifier scorer test successful")
-    print("*" * 40)
+    # print("Testing classifier scorer")
+    # test_classifier_scorer(ui_client)
+    # print("Classifier scorer test successful")
+    # print("*" * 40)

     print("All tests passed successfully")

e2etests/test_prompt_scoring.py

Lines changed: 4 additions & 4 deletions

@@ -36,7 +36,7 @@ def __init__(
         )
         self.score = 0.0

-    def build_measure_prompt(self, example: Example):
+    def _build_measure_prompt(self, example: Example):
         SYSTEM_ROLE = (
             'You are a great judge of emotional intelligence. You understand the feelings '
             'and intentions of others. You will be tasked with judging whether the following '
@@ -51,16 +51,16 @@ def build_measure_prompt(self, example: Example):
         ]
         return conversation

-    def build_schema(self):
+    def _build_schema(self):
         return {
             "score": int,
             "reason": str
         }

-    def process_response(self, response):
+    def _process_response(self, response):
         return response["score"], response["reason"]

-    def success_check(self):
+    def _success_check(self):
         POSITIVITY_THRESHOLD = 3 # we want all model responses to be somewhat positive in tone
         return self.score <= POSITIVITY_THRESHOLD
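
The underscore prefixes mark these scorer hooks as internal overrides rather than public API. To show how the four renamed hooks fit together, here is a self-contained toy; the base class is a stand-in written for this illustration, not judgeval's actual scorer base class:

    # Stand-in base class for illustration only; judgeval's real scorer base differs.
    class DemoPromptScorer:
        def __init__(self):
            self.score = 0.0

        def run(self, example: dict, judge_response: dict):
            # A real scorer would send _build_measure_prompt(...) to a judge model
            # and validate the reply against _build_schema(); here the "judge"
            # response is passed in as a plain dict.
            _ = self._build_measure_prompt(example)
            self.score, reason = self._process_response(judge_response)
            return self._success_check(), reason

    class PositivityScorer(DemoPromptScorer):
        def _build_measure_prompt(self, example: dict):
            return [
                {"role": "system", "content": "Judge the emotional positivity of the response."},
                {"role": "user", "content": example["actual_output"]},
            ]

        def _build_schema(self):
            return {"score": int, "reason": str}

        def _process_response(self, response: dict):
            return response["score"], response["reason"]

        def _success_check(self):
            POSITIVITY_THRESHOLD = 3  # mirror the threshold used in the test above
            return self.score <= POSITIVITY_THRESHOLD

    if __name__ == "__main__":
        ok, reason = PositivityScorer().run(
            {"actual_output": "Happy to help, let me fix that for you!"},
            {"score": 2, "reason": "warm, helpful tone"},
        )
        print(ok, reason)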

e2etests/test_tracer.py

Lines changed: 20 additions & 7 deletions

@@ -14,11 +14,11 @@
 from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer

 # Initialize the tracer and clients
-judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"))
+judgment = Tracer(api_key=os.getenv("UI_JUDGMENT_API_KEY"))
 openai_client = wrap(OpenAI())
 anthropic_client = wrap(Anthropic())

-@judgment.observe
+@judgment.observe(span_type="tool")
 async def make_upper(input: str) -> str:
     """Convert input to uppercase and evaluate using judgment API.

@@ -28,6 +28,7 @@ async def make_upper(input: str) -> str:
         The uppercase version of the input string
     """
     output = input.upper()
+
     await judgment.get_current_trace().async_evaluate(
         scorers=[FaithfulnessScorer(threshold=0.5)],
         input="What if these shoes don't fit?",
@@ -38,9 +39,10 @@ async def make_upper(input: str) -> str:
         model="gpt-4o-mini",
         log_results=True
     )
+
     return output

-@judgment.observe
+@judgment.observe(span_type="tool")
 async def make_lower(input):
     output = input.lower()

@@ -59,11 +61,12 @@ async def make_lower(input):
     )
     return output

-@judgment.observe
+@judgment.observe(span_type="llm")
 def llm_call(input):
+    time.sleep(1.3)
     return "We have a 30 day full refund policy on shoes."

-@judgment.observe
+@judgment.observe(span_type="tool")
 async def answer_user_question(input):
     output = llm_call(input)
     await judgment.get_current_trace().async_evaluate(
@@ -77,7 +80,7 @@ async def answer_user_question(input):
     )
     return output

-@judgment.observe
+@judgment.observe(span_type="tool")
 async def make_poem(input: str) -> str:
     """Generate a poem using both Anthropic and OpenAI APIs.

@@ -95,6 +98,15 @@ async def make_poem(input: str) -> str:
         )
         anthropic_result = anthropic_response.content[0].text

+        await judgment.get_current_trace().async_evaluate(
+            input=input,
+            actual_output=anthropic_result,
+            score_type=APIScorer.ANSWER_RELEVANCY,
+            threshold=0.5,
+            model="gpt-4o-mini",
+            log_results=True
+        )
+
         # Using OpenAI API
         openai_response = openai_client.chat.completions.create(
             model="gpt-4o-mini",
@@ -112,7 +124,8 @@ async def make_poem(input: str) -> str:
         return ""

 async def test_evaluation_mixed(input):
-    with judgment.trace("test_evaluation") as trace:
+    PROJECT_NAME = "NewPoemBot"
+    with judgment.trace("Use-claude", project_name=PROJECT_NAME, overwrite=True) as trace:
         upper = await make_upper(input)
         result = await make_poem(upper)
         await answer_user_question("What if these shoes don't fit?")
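
This file is the clearest picture of the span-level (multi-step) evaluation flow named in the PR title: workflow steps are decorated as typed spans, model clients are wrapped with wrap() so their calls land in the trace, and evaluations attach to the current trace from inside a step. A minimal sketch of the pattern, assuming the judgeval API exactly as used in this diff (the Tracer import path below is an assumption; the file's import lines are not part of these hunks):

    import os
    import asyncio

    from judgeval.tracer import Tracer            # assumed import path, not shown in the diff
    from judgeval.scorers import AnswerRelevancyScorer

    judgment = Tracer(api_key=os.getenv("UI_JUDGMENT_API_KEY"))

    @judgment.observe(span_type="tool")           # each decorated step becomes a typed span
    async def answer_user_question(question: str) -> str:
        answer = "We have a 30 day full refund policy on shoes."
        # Evaluate this step against the current trace instead of a standalone run.
        await judgment.get_current_trace().async_evaluate(
            scorers=[AnswerRelevancyScorer(threshold=0.5)],
            input=question,
            actual_output=answer,
            model="gpt-4o-mini",
            log_results=True,
        )
        return answer

    async def main():
        # Group the spans under a named trace tied to a project.
        with judgment.trace("Use-claude", project_name="NewPoemBot", overwrite=True):
            await answer_user_question("What if these shoes don't fit?")

    if __name__ == "__main__":
        asyncio.run(main())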
