
Commit 72aeefe

Merge branch 'main' into add_cursor_rules
2 parents 58add1e + f56a9ba commit 72aeefe

14 files changed (+1276, -248 lines)

Pipfile

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
-litellm = "==1.38.12"
+litellm = "==1.61.15"
 python-dotenv = "==1.0.1"
 requests = "*"
 pandas = "*"
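After bumping the pin, the lockfile and environment need to be refreshed (e.g. with pipenv). As a minimal sanity check, the sketch below confirms which litellm version actually got installed; it uses only the standard library and assumes the active environment is the one pipenv manages.

# Sketch: verify the installed litellm version matches the new Pipfile pin (==1.61.15).
from importlib.metadata import PackageNotFoundError, version

try:
    print("litellm installed:", version("litellm"))  # expected: 1.61.15
except PackageNotFoundError:
    print("litellm is not installed in the active environment")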

Pipfile.lock

Lines changed: 345 additions & 200 deletions
Some generated files are not rendered by default; the diff for this file is omitted.

README.md

Lines changed: 6 additions & 6 deletions
@@ -9,7 +9,7 @@
 
 <br>
 
-## [🌐 Landing Page](https://www.judgmentlabs.ai/)[Twitter/X](https://x.com/JudgmentLabs)[💼 LinkedIn](https://www.linkedin.com/company/judgmentlabs)[📚 Docs](https://judgment.mintlify.app/getting_started)[🚀 Demos](https://www.youtube.com/@AlexShan-j3o)[🎮 Discord](https://discord.gg/taAufyhf)
+## [🌐 Landing Page](https://www.judgmentlabs.ai/)[Twitter/X](https://x.com/JudgmentLabs)[💼 LinkedIn](https://www.linkedin.com/company/judgmentlabs)[📚 Docs](https://docs.judgmentlabs.ai/introduction)[🚀 Demos](https://www.youtube.com/@AlexShan-j3o)[🎮 Discord](https://discord.gg/taAufyhf)
 </div>
 
 ## Judgeval: open-source testing, monitoring, and optimization for AI agents
@@ -18,7 +18,7 @@ Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It
 
 Judgeval gets you started in five minutes, after which you'll be ready to use all of its features as your agent becomes more complex. Judgeval is natively connected to the [Judgment Platform](https://www.judgmentlabs.ai/) for free and you can export your data and self-host at any time.
 
-We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and allow custom eval integrations for any use case. Check out our quickstarts below or our [setup guide](https://judgment.mintlify.app/getting_started) to get started.
+We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and allow custom eval integrations for any use case. Check out our quickstarts below or our [setup guide](https://docs.judgmentlabs.ai/getting-started) to get started.
 
 Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
 
@@ -94,7 +94,7 @@ def main():
 main()
 ```
 
-[Click here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation.
+[Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.
 
 ### 📝 Offline Evaluations
 
@@ -123,7 +123,7 @@ results = client.run_evaluation(
 print(results)
 ```
 
-[Click here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation.
+[Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-experiment) for a more detailed explanation.
 
 ### 📡 Online Evaluations
 
@@ -163,7 +163,7 @@ def main():
 main()
 ```
 
-[Click here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation.
+[Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.
 
 ## 🏢 Self-Hosting
 
@@ -175,7 +175,7 @@ Run Judgment on your own infrastructure: we provide comprehensive self-hosting c
 * Access Judgment through your own custom domain
 
 ### Getting Started
-1. Check out our [self-hosting documentation](https://judgment.mintlify.app/self_hosting/get_started) for detailed setup instructions, along with how your self-hosted instance can be accessed
+1. Check out our [self-hosting documentation](https://docs.judgmentlabs.ai/self-hosting/get_started) for detailed setup instructions, along with how your self-hosted instance can be accessed
 2. Use the [Judgment CLI](https://github.com/JudgmentLabs/judgment-cli) to deploy your self-hosted environment
 3. After your self-hosted instance is setup, make sure the `JUDGMENT_API_URL` environmental variable is set to your self-hosted backend endpoint
 
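A minimal sketch of step 3, assuming the SDK picks up `JUDGMENT_API_URL` from the environment when the client is constructed, as the README implies; the endpoint below is a placeholder, not a real URL.

import os

from judgeval.judgment_client import JudgmentClient

# Placeholder endpoint for a self-hosted backend; replace with your own deployment's URL.
os.environ.setdefault("JUDGMENT_API_URL", "https://judgment.example.internal")

client = JudgmentClient(
    judgment_api_key=os.environ["JUDGMENT_API_KEY"],
    organization_id=os.environ["ORGANIZATION_ID"],
)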

src/demo/async_evaluation_example.py

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
#!/usr/bin/env python3
"""
Examples demonstrating how to use async evaluation in multiple ways.
"""

import asyncio
import os
import time
from typing import List

from judgeval.data import Example, ScoringResult
from judgeval.judgment_client import JudgmentClient

# Get Judgment API key from environment (replace with your actual API key)
JUDGMENT_API_KEY = os.environ.get("JUDGMENT_API_KEY", "your_api_key_here")
ORGANIZATION_ID = os.environ.get("ORGANIZATION_ID", "your_organization_id_here")

# Initialize the JudgmentClient
judgment_client = JudgmentClient(judgment_api_key=JUDGMENT_API_KEY, organization_id=ORGANIZATION_ID)


async def example_direct_await():
    """
    Example of directly awaiting the Task returned by run_evaluation with async_execution=True.
    This is the simplest approach and blocks until evaluation is complete.
    """
    print("\n=== Example: Direct Await ===")

    # Create example list
    examples = [
        Example(
            input="What is the capital of France?",
            actual_output="The capital of France is Paris.",
            expected_output="Paris"
        ),
        Example(
            input="What is the capital of Italy?",
            actual_output="Rome is the capital of Italy.",
            expected_output="Rome"
        )
    ]

    # Set up scorers
    from judgeval.scorers import AnswerCorrectnessScorer
    scorers = [AnswerCorrectnessScorer(threshold=0.9)]

    # Start evaluation asynchronously and get a Task object
    print("Starting evaluation...")
    task = judgment_client.run_evaluation(
        examples=examples,
        scorers=scorers,
        model="gpt-4o-mini",
        project_name="async-examples",
        eval_run_name="async-example-direct",
        override=True,
        async_execution=True
    )

    # Directly await the task - this will block until the evaluation is done
    print("Awaiting results...")
    results = await task

    print(f"Evaluation completed! Received {len(results)} results")

    # Process the results
    print(results)


async def example_with_other_work():
    """
    Example of running other work while evaluation is in progress.
    Shows how to check task status and get results when ready.
    """
    print("\n=== Example: Do Other Work While Evaluating ===")

    # Create example list
    examples = [
        Example(
            input="What is the tallest mountain in the world?",
            actual_output="Mount Everest is the tallest mountain in the world.",
            expected_output="Mount Everest"
        ),
        Example(
            input="What is the largest ocean?",
            actual_output="The Pacific Ocean is the largest ocean on Earth.",
            expected_output="Pacific Ocean"
        )
    ]

    # Set up scorers
    from judgeval.scorers import AnswerCorrectnessScorer
    scorers = [AnswerCorrectnessScorer(threshold=0.9)]

    # Start evaluation asynchronously and get a Task object
    print("Starting evaluation...")
    task = judgment_client.run_evaluation(
        examples=examples,
        scorers=scorers,
        model="gpt-4o-mini",
        project_name="async-examples",
        eval_run_name="async-example-other-work",
        override=True,
        async_execution=True
    )

    # Do other work while evaluation is running
    print("Doing other work while evaluation runs in the background...")

    # Simulate other work with a few iterations
    for i in range(1, 4):
        print(f" Doing work iteration {i}...")
        await asyncio.sleep(2)  # Simulate work with a delay

        # Check if the evaluation is done
        if task.done():
            print(" Evaluation completed during other work!")
            break
        else:
            print(" Evaluation still running...")

    # Get the results when ready
    try:
        if not task.done():
            print("Waiting for evaluation to complete...")

        results = await task  # Will return immediately if already done

        print(results)

    except Exception as e:
        print(f"Error in evaluation: {str(e)}")
        if task.exception():
            print(f"Task exception: {task.exception()}")


async def main():
    """Run the examples."""
    # Run the first example: direct await
    await example_direct_await()

    # Run the second example: do other work while evaluating
    await example_with_other_work()


if __name__ == "__main__":
    asyncio.run(main())
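One further pattern that follows from the demo above (a sketch, not part of this commit): since `run_evaluation(..., async_execution=True)` returns awaitable tasks, several runs can be started back to back and awaited together with `asyncio.gather`. This reuses the module-level `judgment_client`, `Example`, and `asyncio` imports from the file above; the run names are illustrative.

async def example_gather_two_runs():
    """Sketch: await two async evaluation runs concurrently with asyncio.gather."""
    from judgeval.scorers import AnswerCorrectnessScorer

    scorers = [AnswerCorrectnessScorer(threshold=0.9)]
    example = Example(
        input="What is the capital of Japan?",
        actual_output="Tokyo is the capital of Japan.",
        expected_output="Tokyo",
    )

    # Kick off two evaluation runs without waiting on either one.
    task_a = judgment_client.run_evaluation(
        examples=[example],
        scorers=scorers,
        model="gpt-4o-mini",
        project_name="async-examples",
        eval_run_name="async-example-batch-a",  # illustrative run name
        override=True,
        async_execution=True,
    )
    task_b = judgment_client.run_evaluation(
        examples=[example],
        scorers=scorers,
        model="gpt-4o-mini",
        project_name="async-examples",
        eval_run_name="async-example-batch-b",  # illustrative run name
        override=True,
        async_execution=True,
    )

    # Wait for both evaluations to finish and collect their results.
    results_a, results_b = await asyncio.gather(task_a, task_b)
    print(results_a)
    print(results_b)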

src/demo/sequence_test.py

Lines changed: 1 addition & 1 deletion
@@ -159,7 +159,7 @@ def generate_itinerary(destination, start_date, end_date):
 judgment.assert_test(
     project_name="travel_agent_demo",
     examples=[example],
-    scorers=[ToolOrderScorer(threshold=0.5)],
+    scorers=[ToolOrderScorer()],
     model="gpt-4.1-mini",
     function=generate_itinerary,
     tracer=tracer,
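For reference (a hedged note, not part of the commit): dropping `threshold=0.5` means the test now relies on whatever default threshold `ToolOrderScorer` defines, and an explicit cutoff can still be passed when a specific bar is wanted. The import path below is assumed to mirror the other scorers in this repo.

from judgeval.scorers import ToolOrderScorer  # assumed import path, matching the other scorers

default_scorer = ToolOrderScorer()                # relies on the scorer's built-in default threshold
explicit_scorer = ToolOrderScorer(threshold=0.5)  # the pre-change behavior, with an explicit cutoff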
