
Commit 8d83f32

Merge pull request #33 from JudgmentLabs/joseph/eval-run-name-uniqueness
Make evaluation run names unique
2 parents e51b9fe + 82b011c commit 8d83f32

3 files changed: +174 -34 lines changed
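In short: once results are logged, an evaluation run name can no longer be silently reused within a project. Re-running with an eval_run_name that already exists now raises a ValueError unless the caller passes the new override=True flag (the check is also skipped when log_results=False). A minimal sketch of the caller-facing behavior, assuming a configured JudgmentClient plus example/scorer objects set up as in the test diff below; the project and run names here are placeholders:

    # First run logs results under this name.
    client.run_evaluation(
        examples=[example1],
        scorers=[scorer],
        model="QWEN",
        metadata={"batch": "test"},
        project_name="demo_project",
        eval_run_name="demo_run",
        log_results=True,
    )

    # Re-using the same name without override is now rejected.
    try:
        client.run_evaluation(
            examples=[example1],
            scorers=[scorer],
            model="QWEN",
            metadata={"batch": "test"},
            project_name="demo_project",
            eval_run_name="demo_run",
            log_results=True,
            override=False,
        )
    except ValueError as e:
        assert "already exists" in str(e)  # duplicate run name was rejected

    # override=True skips the duplicate-name check and the run proceeds.
    client.run_evaluation(
        examples=[example1],
        scorers=[scorer],
        model="QWEN",
        metadata={"batch": "test"},
        project_name="demo_project",
        eval_run_name="demo_run",
        log_results=True,
        override=True,
    )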


e2etests/judgment_client_test.py

Lines changed: 82 additions & 4 deletions
@@ -11,6 +11,8 @@
 from judgeval.playground import CustomFaithfulnessMetric
 from judgeval.data.datasets.dataset import EvalDataset
 from dotenv import load_dotenv
+import random
+import string

 load_dotenv()

@@ -58,21 +60,92 @@ def test_run_eval(client: JudgmentClient):
     PROJECT_NAME = "test_project_JOSEPH"
     EVAL_RUN_NAME = "yomadude"

-    actual_eval_run_name, _ = client.run_evaluation(
+    _ = client.run_evaluation(
         examples=[example1, example2],
         scorers=[scorer, c_scorer],
         model="QWEN",
         metadata={"batch": "test"},
         project_name=PROJECT_NAME,
         eval_run_name=EVAL_RUN_NAME,
         log_results=True,
+        override=True,
     )

-    print(f"{actual_eval_run_name=}")
+    results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
+    # print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)

-    results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=actual_eval_run_name)
-    print(f"Evaluation results for {actual_eval_run_name} from database:", results)
+def test_override_eval(client: JudgmentClient):
+    example1 = Example(
+        input="What if these shoes don't fit?",
+        actual_output="We offer a 30-day full refund at no extra cost.",
+        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
+    )
+
+    scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.FAITHFULNESS)

+    PROJECT_NAME = "test_eval_run_naming_collisions"
+    EVAL_RUN_NAME = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
+
+    # First run should succeed
+    client.run_evaluation(
+        examples=[example1],
+        scorers=[scorer],
+        model="QWEN",
+        metadata={"batch": "test"},
+        project_name=PROJECT_NAME,
+        eval_run_name=EVAL_RUN_NAME,
+        log_results=True,
+        override=False,
+    )
+
+    # Second run with log_results=False should succeed
+    client.run_evaluation(
+        examples=[example1],
+        scorers=[scorer],
+        model="QWEN",
+        metadata={"batch": "test"},
+        project_name=PROJECT_NAME,
+        eval_run_name=EVAL_RUN_NAME,
+        log_results=False,
+        override=False,
+    )
+
+    # Third run with override=True should succeed
+    try:
+        client.run_evaluation(
+            examples=[example1],
+            scorers=[scorer],
+            model="QWEN",
+            metadata={"batch": "test"},
+            project_name=PROJECT_NAME,
+            eval_run_name=EVAL_RUN_NAME,
+            log_results=True,
+            override=True,
+        )
+    except ValueError as e:
+        print(f"Unexpected error in override run: {e}")
+        raise
+
+    # Final non-override run should fail
+    try:
+        client.run_evaluation(
+            examples=[example1],
+            scorers=[scorer],
+            model="QWEN",
+            metadata={"batch": "test"},
+            project_name=PROJECT_NAME,
+            eval_run_name=EVAL_RUN_NAME,
+            log_results=True,
+            override=False,
+        )
+        raise AssertionError("Expected ValueError was not raised")
+    except ValueError as e:
+        if "already exists" not in str(e):
+            raise
+        print(f"Successfully caught expected error: {e}")
+
+

 def test_evaluate_dataset(client: JudgmentClient):

@@ -139,6 +212,11 @@ def test_classifier_scorer(client: JudgmentClient):
     print("Evaluation run successful")
     print("*" * 40)

+    print("Testing evaluation run override")
+    test_override_eval(client)
+    print("Evaluation run override successful")
+    print("*" * 40)
+
     print("Testing dataset evaluation")
     test_evaluate_dataset(ui_client)
     print("Dataset evaluation successful")

judgeval/judgment_client.py

Lines changed: 2 additions & 1 deletion
@@ -44,6 +44,7 @@ def run_evaluation(
         log_results: bool = False,
         project_name: str = "",
         eval_run_name: str = "",
+        override: bool = False,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -60,7 +61,7 @@ def run_evaluation(
                 metadata=metadata,
                 judgment_api_key=self.judgment_api_key
             )
-            return run_eval(eval)
+            return run_eval(eval, override)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")

judgeval/run_evaluation.py

Lines changed: 90 additions & 29 deletions
@@ -18,6 +18,7 @@
 from judgeval.scorers.score import a_execute_scoring

 from judgeval.constants import (
+    ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     APIScorer,
@@ -56,6 +57,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         details = response.json().get("detail", "No details provided")
         raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
     # Check if the response status code is not 2XX
+    # Add check for the duplicate eval run name
     if not response.ok:
         error_message = response_data.get('detail', 'An unknown error occurred.')
         error(f"Error: {error_message=}")
@@ -128,7 +130,83 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
         )
     return results

-def run_eval(evaluation_run: EvaluationRun):
+def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
+    """
+    Checks if an evaluation run name already exists for a given project.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+
+    Raises:
+        ValueError: If the evaluation run name already exists
+        JudgmentAPIError: If there's an API error during the check
+    """
+    try:
+        response = requests.post(
+            f"{ROOT_API}/eval-run-name-exists/",
+            json={
+                "eval_name": eval_name,
+                "project_name": project_name,
+                "judgment_api_key": judgment_api_key,
+            }
+        )
+
+        if response.status_code == 409:
+            error(f"Evaluation run name '{eval_name}' already exists for this project")
+            raise ValueError(f"Evaluation run name '{eval_name}' already exists for this project")
+
+        if not response.ok:
+            response_data = response.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error checking eval run name: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check if eval run name exists: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
+
+def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
+    """
+    Logs evaluation results to the Judgment API database.
+
+    Args:
+        merged_results (List[ScoringResult]): The results to log
+        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+    Raises:
+        JudgmentAPIError: If there's an API error during logging
+        ValueError: If there's a validation error with the results
+    """
+    try:
+        res = requests.post(
+            JUDGMENT_EVAL_LOG_API_URL,
+            json={
+                "results": [result.to_dict() for result in merged_results],
+                "judgment_api_key": evaluation_run.judgment_api_key,
+                "project_name": evaluation_run.project_name,
+                "eval_name": evaluation_run.eval_name,
+            }
+        )
+
+        if not res.ok:
+            response_data = res.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error {res.status_code}: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        if "ui_results_url" in res.json():
+            rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
+
+    except requests.exceptions.RequestException as e:
+        error(f"Request failed while saving evaluation results to DB: {str(e)}")
+        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+    except Exception as e:
+        error(f"Failed to save evaluation results to DB: {str(e)}")
+        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -150,6 +228,15 @@ def run_eval(evaluation_run: EvaluationRun):
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
     """
+
+    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
+    if not override and evaluation_run.log_results:
+        check_eval_run_name_exists(
+            evaluation_run.eval_name,
+            evaluation_run.project_name,
+            evaluation_run.judgment_api_key
+        )
+
     # Set example IDs if not already set
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
@@ -262,39 +349,13 @@ def run_eval(evaluation_run: EvaluationRun):

     info(f"Successfully merged {len(merged_results)} results")

-    actual_eval_run_name = evaluation_run.eval_name
     if evaluation_run.log_results:
-        try:
-            res = requests.post(
-                JUDGMENT_EVAL_LOG_API_URL,
-                json={
-                    "results": [result.to_dict() for result in merged_results],
-                    "judgment_api_key": evaluation_run.judgment_api_key,
-                    "project_name": evaluation_run.project_name,
-                    "eval_name": evaluation_run.eval_name,
-                }
-            )
-            if not res.ok:
-                response_data = res.json()
-                error_message = response_data.get('detail', 'An unknown error occurred.')
-                error(f"Error {res.status_code}: {error_message}")
-                raise Exception(f"Error {res.status_code}: {error_message}")
-            else:
-                actual_eval_run_name = res.json()["eval_results_name"]
-                if "ui_results_url" in res.json():
-                    rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
-
-        except requests.exceptions.RequestException as e:
-            error(f"Request failed while saving evaluation results to DB: {str(e)}")
-            raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-        except Exception as e:
-            error(f"Failed to save evaluation results to DB: {str(e)}")
-            raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+        log_evaluation_results(merged_results, evaluation_run)

     for i, result in enumerate(merged_results):
         if not result.scorers_data: # none of the scorers could be executed on this example
             info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
-    return actual_eval_run_name, merged_results
+    return merged_results


 if __name__ == "__main__":
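A caller-facing detail worth noting alongside the diff above: run_eval, and therefore JudgmentClient.run_evaluation, no longer returns the eval run name. Previously the logging path read the final name back from the log endpoint's eval_results_name field and returned it together with the results; now that duplicate names are rejected (or explicitly overridden) up front, only the list of ScoringResult objects is returned. A before/after sketch of the calling convention, with placeholder names:

    # Before this commit (see the old test code above):
    # actual_eval_run_name, results = client.run_evaluation(..., eval_run_name="demo_run", log_results=True)

    # After this commit: the run keeps the name you supplied, and only the results come back.
    results = client.run_evaluation(
        examples=[example1],
        scorers=[scorer],
        model="QWEN",
        metadata={"batch": "test"},
        project_name="demo_project",
        eval_run_name="demo_run",
        log_results=True,
        override=True,
    )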
