Make evaluation run names unique #33


Merged: 8 commits on Jan 10, 2025
86 changes: 82 additions & 4 deletions e2etests/judgment_client_test.py
@@ -11,6 +11,8 @@
from judgeval.playground import CustomFaithfulnessMetric
from judgeval.data.datasets.dataset import EvalDataset
from dotenv import load_dotenv
import random
import string
Contributor commented:
What does this do


load_dotenv()

@@ -58,21 +60,92 @@ def test_run_eval(client: JudgmentClient):
PROJECT_NAME = "test_project_JOSEPH"
EVAL_RUN_NAME = "yomadude"

actual_eval_run_name, _ = client.run_evaluation(
_ = client.run_evaluation(
examples=[example1, example2],
scorers=[scorer, c_scorer],
model="QWEN",
metadata={"batch": "test"},
project_name=PROJECT_NAME,
eval_run_name=EVAL_RUN_NAME,
log_results=True,
override=True,
)

print(f"{actual_eval_run_name=}")
results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
# print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)

results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=actual_eval_run_name)
print(f"Evaluation results for {actual_eval_run_name} from database:", results)
def test_override_eval(client: JudgmentClient):
example1 = Example(
input="What if these shoes don't fit?",
actual_output="We offer a 30-day full refund at no extra cost.",
retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
)

scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)

PROJECT_NAME = "test_eval_run_naming_collisions"
EVAL_RUN_NAME = ''.join(random.choices(string.ascii_letters + string.digits, k=12))

# First run should succeed
client.run_evaluation(
examples=[example1],
scorers=[scorer],
model="QWEN",
metadata={"batch": "test"},
project_name=PROJECT_NAME,
eval_run_name=EVAL_RUN_NAME,
log_results=True,
override=False,
)

# Second run with log_results=False should succeed
client.run_evaluation(
examples=[example1],
scorers=[scorer],
model="QWEN",
metadata={"batch": "test"},
project_name=PROJECT_NAME,
eval_run_name=EVAL_RUN_NAME,
log_results=False,
override=False,
)

# Third run with override=True should succeed
try:
client.run_evaluation(
examples=[example1],
scorers=[scorer],
model="QWEN",
metadata={"batch": "test"},
project_name=PROJECT_NAME,
eval_run_name=EVAL_RUN_NAME,
log_results=True,
override=True,
)
except ValueError as e:
print(f"Unexpected error in override run: {e}")
raise

# Final non-override run should fail
try:
client.run_evaluation(
examples=[example1],
scorers=[scorer],
model="QWEN",
metadata={"batch": "test"},
project_name=PROJECT_NAME,
eval_run_name=EVAL_RUN_NAME,
log_results=True,
override=False,
)
raise AssertionError("Expected ValueError was not raised")
except ValueError as e:
if "already exists" not in str(e):
raise
print(f"Successfully caught expected error: {e}")



def test_evaluate_dataset(client: JudgmentClient):

@@ -139,6 +212,11 @@ def test_classifier_scorer(client: JudgmentClient):
print("Evaluation run successful")
print("*" * 40)

print("Testing evaluation run override")
test_override_eval(client)
print("Evaluation run override successful")
print("*" * 40)

print("Testing dataset evaluation")
test_evaluate_dataset(ui_client)
print("Dataset evaluation successful")
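As an aside, the new test avoids name collisions by randomizing EVAL_RUN_NAME. A minimal standalone sketch of the same idea (a hypothetical helper, not part of this diff):

```python
import random
import string
from datetime import datetime, timezone

def unique_eval_run_name(prefix: str = "eval") -> str:
    """Build a run name unlikely to collide: UTC timestamp plus a short random suffix."""
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
    return f"{prefix}-{stamp}-{suffix}"

print(unique_eval_run_name())  # e.g. eval-20250110-153045-Ab3dE9
```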
3 changes: 2 additions & 1 deletion judgeval/judgment_client.py
@@ -44,6 +44,7 @@ def run_evaluation(
log_results: bool = False,
project_name: str = "",
eval_run_name: str = "",
override: bool = False,
) -> List[ScoringResult]:
"""
Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -60,7 +61,7 @@
metadata=metadata,
judgment_api_key=self.judgment_api_key
)
return run_eval(eval)
return run_eval(eval, override)
except ValueError as e:
raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")

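For context, a minimal sketch of how calling code might use the new `override` flag on `JudgmentClient.run_evaluation`. It assumes `client`, `example1`, and `scorer` are set up as in the e2e test above; the project and run names are placeholders, not taken from this diff:

```python
# First write under a given name; with log_results=True and override=False,
# this fails if the run name already exists for the project.
results = client.run_evaluation(
    examples=[example1],
    scorers=[scorer],
    model="QWEN",
    metadata={"batch": "demo"},
    project_name="my_project",
    eval_run_name="my_run",
    log_results=True,
    override=False,
)

# Re-running the same name requires override=True; otherwise a ValueError
# mentioning "already exists" is raised before any scoring happens.
results = client.run_evaluation(
    examples=[example1],
    scorers=[scorer],
    model="QWEN",
    metadata={"batch": "demo"},
    project_name="my_project",
    eval_run_name="my_run",
    log_results=True,
    override=True,
)
```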
119 changes: 90 additions & 29 deletions judgeval/run_evaluation.py
@@ -18,6 +18,7 @@
from judgeval.scorers.score import a_execute_scoring

from judgeval.constants import (
ROOT_API,
JUDGMENT_EVAL_API_URL,
JUDGMENT_EVAL_LOG_API_URL,
APIScorer,
@@ -56,6 +57,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
details = response.json().get("detail", "No details provided")
raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
# Check if the response status code is not 2XX
# Add check for the duplicate eval run name
if not response.ok:
error_message = response_data.get('detail', 'An unknown error occurred.')
error(f"Error: {error_message=}")
@@ -128,7 +130,83 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
)
return results

def run_eval(evaluation_run: EvaluationRun):
def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
"""
Checks if an evaluation run name already exists for a given project.

Args:
eval_name (str): Name of the evaluation run
project_name (str): Name of the project
judgment_api_key (str): API key for authentication

Raises:
ValueError: If the evaluation run name already exists
JudgmentAPIError: If there's an API error during the check
"""
try:
response = requests.post(
f"{ROOT_API}/eval-run-name-exists/",
json={
"eval_name": eval_name,
"project_name": project_name,
"judgment_api_key": judgment_api_key,
}
)

if response.status_code == 409:
error(f"Evaluation run name '{eval_name}' already exists for this project")
raise ValueError(f"Evaluation run name '{eval_name}' already exists for this project")

if not response.ok:
response_data = response.json()
error_message = response_data.get('detail', 'An unknown error occurred.')
error(f"Error checking eval run name: {error_message}")
raise JudgmentAPIError(error_message)

except requests.exceptions.RequestException as e:
error(f"Failed to check if eval run name exists: {str(e)}")
raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")

def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
"""
Logs evaluation results to the Judgment API database.

Args:
merged_results (List[ScoringResult]): The results to log
evaluation_run (EvaluationRun): The evaluation run containing project info and API key

Raises:
JudgmentAPIError: If there's an API error during logging
ValueError: If there's a validation error with the results
"""
try:
res = requests.post(
JUDGMENT_EVAL_LOG_API_URL,
json={
"results": [result.to_dict() for result in merged_results],
"judgment_api_key": evaluation_run.judgment_api_key,
"project_name": evaluation_run.project_name,
"eval_name": evaluation_run.eval_name,
}
)

if not res.ok:
response_data = res.json()
error_message = response_data.get('detail', 'An unknown error occurred.')
error(f"Error {res.status_code}: {error_message}")
raise JudgmentAPIError(error_message)

if "ui_results_url" in res.json():
rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")

except requests.exceptions.RequestException as e:
error(f"Request failed while saving evaluation results to DB: {str(e)}")
raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
except Exception as e:
error(f"Failed to save evaluation results to DB: {str(e)}")
raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")

def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
"""
Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -150,6 +228,15 @@
Returns:
List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
"""

# Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
if not override and evaluation_run.log_results:
check_eval_run_name_exists(
evaluation_run.eval_name,
evaluation_run.project_name,
evaluation_run.judgment_api_key
)

# Set example IDs if not already set
debug("Initializing examples with IDs and timestamps")
for idx, example in enumerate(evaluation_run.examples):
@@ -262,39 +349,13 @@ def run_eval(evaluation_run: EvaluationRun):

info(f"Successfully merged {len(merged_results)} results")

actual_eval_run_name = evaluation_run.eval_name
if evaluation_run.log_results:
try:
res = requests.post(
JUDGMENT_EVAL_LOG_API_URL,
json={
"results": [result.to_dict() for result in merged_results],
"judgment_api_key": evaluation_run.judgment_api_key,
"project_name": evaluation_run.project_name,
"eval_name": evaluation_run.eval_name,
}
)
if not res.ok:
response_data = res.json()
error_message = response_data.get('detail', 'An unknown error occurred.')
error(f"Error {res.status_code}: {error_message}")
raise Exception(f"Error {res.status_code}: {error_message}")
else:
actual_eval_run_name = res.json()["eval_results_name"]
if "ui_results_url" in res.json():
rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")

except requests.exceptions.RequestException as e:
error(f"Request failed while saving evaluation results to DB: {str(e)}")
raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
except Exception as e:
error(f"Failed to save evaluation results to DB: {str(e)}")
raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
log_evaluation_results(merged_results, evaluation_run)

for i, result in enumerate(merged_results):
if not result.scorers_data: # none of the scorers could be executed on this example
info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
return actual_eval_run_name, merged_results
return merged_results


if __name__ == "__main__":
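To summarize the new behavior in `run_eval`: when `log_results` is True and `override` is False, the name pre-check runs first, and a duplicate name surfaces as a ValueError containing "already exists". A rough sketch of how calling code might react to that; the retry helper is hypothetical and not part of this PR, and the import path is assumed from the repo layout shown above:

```python
from judgeval.run_evaluation import run_eval  # assumed import path for this module

def run_eval_with_unique_name(evaluation_run, max_attempts: int = 3):
    """Retry with a numeric suffix when the chosen eval run name is already taken."""
    base_name = evaluation_run.eval_name
    for attempt in range(max_attempts):
        try:
            return run_eval(evaluation_run, override=False)
        except ValueError as e:
            if "already exists" not in str(e):
                raise  # unrelated validation error; don't mask it
            # Assumes EvaluationRun allows reassigning eval_name; otherwise rebuild the object.
            evaluation_run.eval_name = f"{base_name}-{attempt + 1}"
    raise RuntimeError(f"Could not find a free eval run name after {max_attempts} attempts")
```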