diff --git a/e2etests/judgment_client_test.py b/e2etests/judgment_client_test.py
index 31d7dd79..852500d7 100644
--- a/e2etests/judgment_client_test.py
+++ b/e2etests/judgment_client_test.py
@@ -11,6 +11,8 @@ from judgeval.playground import CustomFaithfulnessMetric
 from judgeval.data.datasets.dataset import EvalDataset
 from dotenv import load_dotenv
+import random
+import string
 
 load_dotenv()
 
@@ -58,7 +60,7 @@ def test_run_eval(client: JudgmentClient):
     PROJECT_NAME = "test_project_JOSEPH"
     EVAL_RUN_NAME = "yomadude"
 
-    actual_eval_run_name, _ = client.run_evaluation(
+    _ = client.run_evaluation(
         examples=[example1, example2],
         scorers=[scorer, c_scorer],
         model="QWEN",
@@ -66,13 +68,84 @@ def test_run_eval(client: JudgmentClient):
         project_name=PROJECT_NAME,
         eval_run_name=EVAL_RUN_NAME,
         log_results=True,
+        override=True,
     )
-    print(f"{actual_eval_run_name=}")
+    results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
+    # print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)
 
-    results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=actual_eval_run_name)
-    print(f"Evaluation results for {actual_eval_run_name} from database:", results)
 
+def test_override_eval(client: JudgmentClient):
+    example1 = Example(
+        input="What if these shoes don't fit?",
+        actual_output="We offer a 30-day full refund at no extra cost.",
+        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
+    )
+
+    scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)
+    PROJECT_NAME = "test_eval_run_naming_collisions"
+    EVAL_RUN_NAME = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
+
+    # First run should succeed
+    client.run_evaluation(
+        examples=[example1],
+        scorers=[scorer],
+        model="QWEN",
+        metadata={"batch": "test"},
+        project_name=PROJECT_NAME,
+        eval_run_name=EVAL_RUN_NAME,
+        log_results=True,
+        override=False,
+    )
+
+    # Second run with log_results=False should succeed
+    client.run_evaluation(
+        examples=[example1],
+        scorers=[scorer],
+        model="QWEN",
+        metadata={"batch": "test"},
+        project_name=PROJECT_NAME,
+        eval_run_name=EVAL_RUN_NAME,
+        log_results=False,
+        override=False,
+    )
+
+    # Third run with override=True should succeed
+    try:
+        client.run_evaluation(
+            examples=[example1],
+            scorers=[scorer],
+            model="QWEN",
+            metadata={"batch": "test"},
+            project_name=PROJECT_NAME,
+            eval_run_name=EVAL_RUN_NAME,
+            log_results=True,
+            override=True,
+        )
+    except ValueError as e:
+        print(f"Unexpected error in override run: {e}")
+        raise
+
+    # Final non-override run should fail
+    try:
+        client.run_evaluation(
+            examples=[example1],
+            scorers=[scorer],
+            model="QWEN",
+            metadata={"batch": "test"},
+            project_name=PROJECT_NAME,
+            eval_run_name=EVAL_RUN_NAME,
+            log_results=True,
+            override=False,
+        )
+        raise AssertionError("Expected ValueError was not raised")
+    except ValueError as e:
+        if "already exists" not in str(e):
+            raise
+        print(f"Successfully caught expected error: {e}")
+
+
 def test_evaluate_dataset(client: JudgmentClient):
@@ -139,6 +212,11 @@ def test_classifier_scorer(client: JudgmentClient):
     print("Evaluation run successful")
     print("*" * 40)
 
+    print("Testing evaluation run override")
+    test_override_eval(client)
+    print("Evaluation run override successful")
+    print("*" * 40)
+
     print("Testing dataset evaluation")
     test_evaluate_dataset(ui_client)
     print("Dataset evaluation successful")
diff --git a/judgeval/judgment_client.py b/judgeval/judgment_client.py
index f610ebae..fe4636a1 100644
--- a/judgeval/judgment_client.py
+++ b/judgeval/judgment_client.py
@@ -44,6 +44,7 @@ def run_evaluation(
         log_results: bool = False,
         project_name: str = "",
         eval_run_name: str = "",
+        override: bool = False,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -60,7 +61,7 @@ def run_evaluation(
                 metadata=metadata,
                 judgment_api_key=self.judgment_api_key
             )
-            return run_eval(eval)
+            return run_eval(eval, override)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
diff --git a/judgeval/run_evaluation.py b/judgeval/run_evaluation.py
index 7bd0d38c..c564018a 100644
--- a/judgeval/run_evaluation.py
+++ b/judgeval/run_evaluation.py
@@ -18,6 +18,7 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
+    ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     APIScorer,
@@ -56,6 +57,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         details = response.json().get("detail", "No details provided")
         raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
     # Check if the response status code is not 2XX
+    # Add check for the duplicate eval run name
     if not response.ok:
         error_message = response_data.get('detail', 'An unknown error occurred.')
         error(f"Error: {error_message=}")
@@ -128,7 +130,83 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     )
     return results
 
-def run_eval(evaluation_run: EvaluationRun):
+def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
+    """
+    Checks if an evaluation run name already exists for a given project.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+
+    Raises:
+        ValueError: If the evaluation run name already exists
+        JudgmentAPIError: If there's an API error during the check
+    """
+    try:
+        response = requests.post(
+            f"{ROOT_API}/eval-run-name-exists/",
+            json={
+                "eval_name": eval_name,
+                "project_name": project_name,
+                "judgment_api_key": judgment_api_key,
+            }
+        )
+
+        if response.status_code == 409:
+            error(f"Evaluation run name '{eval_name}' already exists for this project")
+            raise ValueError(f"Evaluation run name '{eval_name}' already exists for this project")
+
+        if not response.ok:
+            response_data = response.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error checking eval run name: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check if eval run name exists: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
+
+def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
+    """
+    Logs evaluation results to the Judgment API database.
+
+    Args:
+        merged_results (List[ScoringResult]): The results to log
+        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+    Raises:
+        JudgmentAPIError: If there's an API error during logging
+        ValueError: If there's a validation error with the results
+    """
+    try:
+        res = requests.post(
+            JUDGMENT_EVAL_LOG_API_URL,
+            json={
+                "results": [result.to_dict() for result in merged_results],
+                "judgment_api_key": evaluation_run.judgment_api_key,
+                "project_name": evaluation_run.project_name,
+                "eval_name": evaluation_run.eval_name,
+            }
+        )
+
+        if not res.ok:
+            response_data = res.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error {res.status_code}: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        if "ui_results_url" in res.json():
+            rprint(f"\nšŸ” You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
+
+    except requests.exceptions.RequestException as e:
+        error(f"Request failed while saving evaluation results to DB: {str(e)}")
+        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+    except Exception as e:
+        error(f"Failed to save evaluation results to DB: {str(e)}")
+        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -150,6 +228,15 @@ def run_eval(evaluation_run: EvaluationRun):
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
     """
+
+    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
+    if not override and evaluation_run.log_results:
+        check_eval_run_name_exists(
+            evaluation_run.eval_name,
+            evaluation_run.project_name,
+            evaluation_run.judgment_api_key
+        )
+
     # Set example IDs if not already set
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
@@ -262,39 +349,13 @@ def run_eval(evaluation_run: EvaluationRun):
     info(f"Successfully merged {len(merged_results)} results")
 
-    actual_eval_run_name = evaluation_run.eval_name
     if evaluation_run.log_results:
-        try:
-            res = requests.post(
-                JUDGMENT_EVAL_LOG_API_URL,
-                json={
-                    "results": [result.to_dict() for result in merged_results],
-                    "judgment_api_key": evaluation_run.judgment_api_key,
-                    "project_name": evaluation_run.project_name,
-                    "eval_name": evaluation_run.eval_name,
-                }
-            )
-            if not res.ok:
-                response_data = res.json()
-                error_message = response_data.get('detail', 'An unknown error occurred.')
-                error(f"Error {res.status_code}: {error_message}")
-                raise Exception(f"Error {res.status_code}: {error_message}")
-            else:
-                actual_eval_run_name = res.json()["eval_results_name"]
-                if "ui_results_url" in res.json():
-                    rprint(f"\nšŸ” You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
-
-        except requests.exceptions.RequestException as e:
-            error(f"Request failed while saving evaluation results to DB: {str(e)}")
-            raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-        except Exception as e:
-            error(f"Failed to save evaluation results to DB: {str(e)}")
-            raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+        log_evaluation_results(merged_results, evaluation_run)
 
     for i, result in enumerate(merged_results):
         if not result.scorers_data:  # none of the scorers could be executed on this example
             info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
 
-    return actual_eval_run_name, merged_results
+    return merged_results
 
 if __name__ == "__main__":
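
Usage sketch (not part of the patch): a minimal illustration of how the new `override` flag interacts with the duplicate-name check added in `run_eval`, mirroring the behavior exercised by `test_override_eval`. It assumes `client`, `scorer`, and `example` are constructed exactly as in the existing e2e test (imports not shown here), and the project/run names are placeholders.

# Assumes: client = JudgmentClient(...), scorer = a FAITHFULNESS JudgmentScorer,
# and example = an Example, all built as in test_override_eval above.

# A logged run under a fresh name passes the new check_eval_run_name_exists() gate.
client.run_evaluation(
    examples=[example],
    scorers=[scorer],
    model="QWEN",
    project_name="demo_project",   # placeholder
    eval_run_name="demo_run",      # placeholder
    log_results=True,
    override=False,
)

# Reusing the same name is allowed when override=True (or when log_results=False,
# since run_eval only performs the check for logged, non-override runs).
client.run_evaluation(
    examples=[example],
    scorers=[scorer],
    model="QWEN",
    project_name="demo_project",
    eval_run_name="demo_run",
    log_results=True,
    override=True,
)

# Reusing the name with override=False and log_results=True raises ValueError
# from check_eval_run_name_exists() before any scoring work begins.
try:
    client.run_evaluation(
        examples=[example],
        scorers=[scorer],
        model="QWEN",
        project_name="demo_project",
        eval_run_name="demo_run",
        log_results=True,
        override=False,
    )
except ValueError as e:
    assert "already exists" in str(e)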