
Commit 8d83f32

Merge pull request #33 from JudgmentLabs/joseph/eval-run-name-uniqueness
Make evaluation run names unique
2 parents e51b9fe + 82b011c commit 8d83f32

3 files changed: +174 -34 lines changed
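In short: once results are logged, an evaluation run name can no longer be silently reused within a project. Re-running with an eval_run_name that already exists now raises a ValueError unless the caller passes the new override=True flag (the check is also skipped when log_results=False). A minimal sketch of the caller-facing behavior, assuming a configured JudgmentClient plus example/scorer objects set up as in the test diff below; the project and run names here are placeholders:

    # First run logs results under this name.
    client.run_evaluation(
        examples=[example1],
        scorers=[scorer],
        model="QWEN",
        metadata={"batch": "test"},
        project_name="demo_project",
        eval_run_name="demo_run",
        log_results=True,
    )

    # Re-using the same name without override is now rejected.
    try:
        client.run_evaluation(
            examples=[example1],
            scorers=[scorer],
            model="QWEN",
            metadata={"batch": "test"},
            project_name="demo_project",
            eval_run_name="demo_run",
            log_results=True,
            override=False,
        )
    except ValueError as e:
        assert "already exists" in str(e)  # duplicate run name was rejected

    # override=True skips the duplicate-name check and the run proceeds.
    client.run_evaluation(
        examples=[example1],
        scorers=[scorer],
        model="QWEN",
        metadata={"batch": "test"},
        project_name="demo_project",
        eval_run_name="demo_run",
        log_results=True,
        override=True,
    )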


e2etests/judgment_client_test.py

Lines changed: 82 additions & 4 deletions
@@ -11,6 +11,8 @@
 from judgeval.playground import CustomFaithfulnessMetric
 from judgeval.data.datasets.dataset import EvalDataset
 from dotenv import load_dotenv
+import random
+import string

 load_dotenv()

@@ -58,21 +60,92 @@ def test_run_eval(client: JudgmentClient):
     PROJECT_NAME = "test_project_JOSEPH"
     EVAL_RUN_NAME = "yomadude"

-    actual_eval_run_name, _ = client.run_evaluation(
+    _ = client.run_evaluation(
         examples=[example1, example2],
         scorers=[scorer, c_scorer],
         model="QWEN",
         metadata={"batch": "test"},
         project_name=PROJECT_NAME,
         eval_run_name=EVAL_RUN_NAME,
         log_results=True,
+        override=True,
     )

-    print(f"{actual_eval_run_name=}")
+    results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
+    # print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)

-    results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=actual_eval_run_name)
-    print(f"Evaluation results for {actual_eval_run_name} from database:", results)
+def test_override_eval(client: JudgmentClient):
+    example1 = Example(
+        input="What if these shoes don't fit?",
+        actual_output="We offer a 30-day full refund at no extra cost.",
+        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
+    )
+
+    scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)

+    PROJECT_NAME = "test_eval_run_naming_collisions"
+    EVAL_RUN_NAME = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
+
+    # First run should succeed
+    client.run_evaluation(
+        examples=[example1],
+        scorers=[scorer],
+        model="QWEN",
+        metadata={"batch": "test"},
+        project_name=PROJECT_NAME,
+        eval_run_name=EVAL_RUN_NAME,
+        log_results=True,
+        override=False,
+    )
+
+    # Second run with log_results=False should succeed
+    client.run_evaluation(
+        examples=[example1],
+        scorers=[scorer],
+        model="QWEN",
+        metadata={"batch": "test"},
+        project_name=PROJECT_NAME,
+        eval_run_name=EVAL_RUN_NAME,
+        log_results=False,
+        override=False,
+    )
+
+    # Third run with override=True should succeed
+    try:
+        client.run_evaluation(
+            examples=[example1],
+            scorers=[scorer],
+            model="QWEN",
+            metadata={"batch": "test"},
+            project_name=PROJECT_NAME,
+            eval_run_name=EVAL_RUN_NAME,
+            log_results=True,
+            override=True,
+        )
+    except ValueError as e:
+        print(f"Unexpected error in override run: {e}")
+        raise
+
+    # Final non-override run should fail
+    try:
+        client.run_evaluation(
+            examples=[example1],
+            scorers=[scorer],
+            model="QWEN",
+            metadata={"batch": "test"},
+            project_name=PROJECT_NAME,
+            eval_run_name=EVAL_RUN_NAME,
+            log_results=True,
+            override=False,
+        )
+        raise AssertionError("Expected ValueError was not raised")
+    except ValueError as e:
+        if "already exists" not in str(e):
+            raise
+        print(f"Successfully caught expected error: {e}")
+
+

 def test_evaluate_dataset(client: JudgmentClient):

@@ -139,6 +212,11 @@ def test_classifier_scorer(client: JudgmentClient):
     print("Evaluation run successful")
     print("*" * 40)

+    print("Testing evaluation run override")
+    test_override_eval(client)
+    print("Evaluation run override successful")
+    print("*" * 40)
+
     print("Testing dataset evaluation")
     test_evaluate_dataset(ui_client)
     print("Dataset evaluation successful")

judgeval/judgment_client.py

Lines changed: 2 additions & 1 deletion
@@ -44,6 +44,7 @@ def run_evaluation(
         log_results: bool = False,
         project_name: str = "",
         eval_run_name: str = "",
+        override: bool = False,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -60,7 +61,7 @@ def run_evaluation(
                 metadata=metadata,
                 judgment_api_key=self.judgment_api_key
             )
-            return run_eval(eval)
+            return run_eval(eval, override)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")

judgeval/run_evaluation.py

Lines changed: 90 additions & 29 deletions
@@ -18,6 +18,7 @@
 from judgeval.scorers.score import a_execute_scoring

 from judgeval.constants import (
+    ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     APIScorer,
@@ -56,6 +57,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         details = response.json().get("detail", "No details provided")
         raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
     # Check if the response status code is not 2XX
+    # Add check for the duplicate eval run name
     if not response.ok:
         error_message = response_data.get('detail', 'An unknown error occurred.')
         error(f"Error: {error_message=}")
@@ -128,7 +130,83 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
         )
     return results

-def run_eval(evaluation_run: EvaluationRun):
+def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
+    """
+    Checks if an evaluation run name already exists for a given project.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+
+    Raises:
+        ValueError: If the evaluation run name already exists
+        JudgmentAPIError: If there's an API error during the check
+    """
+    try:
+        response = requests.post(
+            f"{ROOT_API}/eval-run-name-exists/",
+            json={
+                "eval_name": eval_name,
+                "project_name": project_name,
+                "judgment_api_key": judgment_api_key,
+            }
+        )
+
+        if response.status_code == 409:
+            error(f"Evaluation run name '{eval_name}' already exists for this project")
+            raise ValueError(f"Evaluation run name '{eval_name}' already exists for this project")
+
+        if not response.ok:
+            response_data = response.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error checking eval run name: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check if eval run name exists: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
+
+def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
+    """
+    Logs evaluation results to the Judgment API database.
+
+    Args:
+        merged_results (List[ScoringResult]): The results to log
+        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+    Raises:
+        JudgmentAPIError: If there's an API error during logging
+        ValueError: If there's a validation error with the results
+    """
+    try:
+        res = requests.post(
+            JUDGMENT_EVAL_LOG_API_URL,
+            json={
+                "results": [result.to_dict() for result in merged_results],
+                "judgment_api_key": evaluation_run.judgment_api_key,
+                "project_name": evaluation_run.project_name,
+                "eval_name": evaluation_run.eval_name,
+            }
+        )
+
+        if not res.ok:
+            response_data = res.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error {res.status_code}: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        if "ui_results_url" in res.json():
+            rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
+
+    except requests.exceptions.RequestException as e:
+        error(f"Request failed while saving evaluation results to DB: {str(e)}")
+        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+    except Exception as e:
+        error(f"Failed to save evaluation results to DB: {str(e)}")
+        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -150,6 +228,15 @@ def run_eval(evaluation_run: EvaluationRun):
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
     """
+
+    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
+    if not override and evaluation_run.log_results:
+        check_eval_run_name_exists(
+            evaluation_run.eval_name,
+            evaluation_run.project_name,
+            evaluation_run.judgment_api_key
+        )
+
     # Set example IDs if not already set
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
@@ -262,39 +349,13 @@ def run_eval(evaluation_run: EvaluationRun):

     info(f"Successfully merged {len(merged_results)} results")

-    actual_eval_run_name = evaluation_run.eval_name
     if evaluation_run.log_results:
-        try:
-            res = requests.post(
-                JUDGMENT_EVAL_LOG_API_URL,
-                json={
-                    "results": [result.to_dict() for result in merged_results],
-                    "judgment_api_key": evaluation_run.judgment_api_key,
-                    "project_name": evaluation_run.project_name,
-                    "eval_name": evaluation_run.eval_name,
-                }
-            )
-            if not res.ok:
-                response_data = res.json()
-                error_message = response_data.get('detail', 'An unknown error occurred.')
-                error(f"Error {res.status_code}: {error_message}")
-                raise Exception(f"Error {res.status_code}: {error_message}")
-            else:
-                actual_eval_run_name = res.json()["eval_results_name"]
-                if "ui_results_url" in res.json():
-                    rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
-
-        except requests.exceptions.RequestException as e:
-            error(f"Request failed while saving evaluation results to DB: {str(e)}")
-            raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-        except Exception as e:
-            error(f"Failed to save evaluation results to DB: {str(e)}")
-            raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+        log_evaluation_results(merged_results, evaluation_run)

     for i, result in enumerate(merged_results):
         if not result.scorers_data: # none of the scorers could be executed on this example
             info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
-    return actual_eval_run_name, merged_results
+    return merged_results


 if __name__ == "__main__":
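A caller-facing detail worth noting alongside the diff above: run_eval, and therefore JudgmentClient.run_evaluation, no longer returns the eval run name. Previously the logging path read the final name back from the log endpoint's eval_results_name field and returned it together with the results; now that duplicate names are rejected (or explicitly overridden) up front, only the list of ScoringResult objects is returned. A before/after sketch of the calling convention, with placeholder names:

    # Before this commit (see the old test code above):
    # actual_eval_run_name, results = client.run_evaluation(..., eval_run_name="demo_run", log_results=True)

    # After this commit: the run keeps the name you supplied, and only the results come back.
    results = client.run_evaluation(
        examples=[example1],
        scorers=[scorer],
        model="QWEN",
        metadata={"batch": "test"},
        project_name="demo_project",
        eval_run_name="demo_run",
        log_results=True,
        override=True,
    )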
