
Commit b245399

Author: Judgment Release Bot
[Bump Minor Version] Release: Merge staging to main
2 parents 069c812 + a0c3ff8 · commit b245399

25 files changed: +1087 -999 lines

pyproject.toml

5 additions & 0 deletions

@@ -29,12 +29,17 @@ dependencies = [
     "langchain-openai",
     "langchain-anthropic",
     "langchain-core",
+    "click<8.2.0",
+    "typer>=0.9.0",
 ]
 
 [project.urls]
 Homepage = "https://github.com/JudgmentLabs/judgeval"
 Issues = "https://github.com/JudgmentLabs/judgeval/issues"
 
+[project.scripts]
+judgeval = "judgeval.cli:app"
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
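
The new [project.scripts] entry exposes a "judgeval" console script that resolves to judgeval.cli:app, and the added typer>=0.9.0 dependency (with click<8.2.0 pinned; Typer is built on Click) suggests that app is a Typer application. The CLI module itself is not part of this diff, so the following is only a minimal sketch of what src/judgeval/cli.py could look like; the "version" command and the callback are invented here for illustration and are not taken from this commit.

# Hypothetical sketch of src/judgeval/cli.py (not from this commit).
# Assumes the console script entry judgeval = "judgeval.cli:app" points at a Typer app.
import importlib.metadata

import typer

app = typer.Typer(help="judgeval command-line interface")


@app.callback()
def main() -> None:
    """Top-level CLI group; keeps commands like 'version' as subcommands."""


@app.command()
def version() -> None:
    """Print the installed judgeval version (illustrative command only)."""
    typer.echo(importlib.metadata.version("judgeval"))


if __name__ == "__main__":
    app()

On install, hatchling generates the judgeval executable from this entry point, so the hypothetical command above would be invoked as "judgeval version".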

src/e2etests/test_all_scorers.py

0 additions & 141 deletions
@@ -9,9 +9,7 @@
     FaithfulnessScorer,
     InstructionAdherenceScorer,
     ExecutionOrderScorer,
-    PromptScorer,
 )
-from uuid import uuid4
 from judgeval.data import Example
 from judgeval.constants import DEFAULT_TOGETHER_MODEL
 
@@ -32,7 +30,6 @@ def test_ac_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
     print_debug_on_failure(res[0])
 
@@ -58,7 +55,6 @@ def test_ar_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
 
     print_debug_on_failure(res[0])
@@ -101,7 +97,6 @@ def test_faithfulness_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
 
     print_debug_on_failure(res[0])
@@ -127,7 +122,6 @@ def test_instruction_adherence_scorer(client: JudgmentClient, project_name: str)
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
 
     print_debug_on_failure(res[0])
@@ -160,146 +154,11 @@ def test_execution_order_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
 
     assert not res[0].success
 
 
-def test_prompt_scorer_without_options(client: JudgmentClient, project_name: str):
-    """Test prompt scorer functionality."""
-
-    prompt_scorer = PromptScorer.create(
-        name=f"Test Prompt Scorer Without Options {uuid4()}",
-        prompt="Question: {{input}}\nResponse: {{actual_output}}\n\nIs this response relevant to the question?",
-    )
-
-    relevant_example = Example(
-        input="What's the weather in New York?",
-        actual_output="The weather in New York is sunny.",
-    )
-
-    irrelevant_example = Example(
-        input="What's the capital of France?",
-        actual_output="The mitochondria is the powerhouse of the cell, and did you know that honey never spoils?",
-    )
-
-    # Run evaluation
-    res = client.run_evaluation(
-        examples=[relevant_example, irrelevant_example],
-        scorers=[prompt_scorer],
-        model=DEFAULT_TOGETHER_MODEL,
-        project_name=project_name,
-        eval_run_name="test-run-prompt-scorer-without-options",
-        override=True,
-    )
-
-    # Verify results
-    assert res[0].success, "Relevant example should pass classification"
-    assert not res[1].success, "Irrelevant example should fail classification"
-
-    print_debug_on_failure(res[0])
-    print_debug_on_failure(res[1])
-
-
-def test_prompt_scorer_with_options(client: JudgmentClient, project_name: str):
-    """Test prompt scorer functionality."""
-    # Creating a prompt scorer from SDK
-    prompt_scorer = PromptScorer.create(
-        name=f"Test Prompt Scorer {uuid4()}",
-        prompt="Question: {{input}}\nResponse: {{actual_output}}\n\nIs this response helpful?",
-        options={"yes": 1.0, "no": 0.0},
-    )
-
-    # Update the options with helpfulness classification choices
-    prompt_scorer.set_options(
-        {
-            "yes": 1.0,  # Helpful response
-            "no": 0.0,  # Unhelpful response
-        }
-    )
-
-    # Create test examples
-    helpful_example = Example(
-        input="What's the capital of France?",
-        actual_output="The capital of France is Paris.",
-    )
-
-    unhelpful_example = Example(
-        input="What's the capital of France?",
-        actual_output="I don't know much about geography, but I think it might be somewhere in Europe.",
-    )
-
-    # Run evaluation
-    res = client.run_evaluation(
-        examples=[helpful_example, unhelpful_example],
-        scorers=[prompt_scorer],
-        model=DEFAULT_TOGETHER_MODEL,
-        project_name=project_name,
-        eval_run_name="test-run-prompt-scorer-with-options",
-        override=True,
-    )
-
-    # Verify results
-    assert res[0].success, "Helpful example should pass classification"
-    assert not res[1].success, "Unhelpful example should fail classification"
-
-    # Print debug info if any test fails
-    print_debug_on_failure(res[0])
-    print_debug_on_failure(res[1])
-
-
-def test_custom_prompt_scorer(client: JudgmentClient, project_name: str):
-    """Test custom prompt scorer functionality."""
-    # Creating a custom prompt scorer from SDK
-    # Creating a prompt scorer from SDK
-    prompt_scorer = PromptScorer.create(
-        name=f"Test Prompt Scorer {uuid4()}",
-        prompt="Comparison A: {{comparison_a}}\n Comparison B: {{comparison_b}}\n\n Which candidate is better for a teammate?",
-        options={"comparison_a": 1.0, "comparison_b": 0.0},
-    )
-
-    prompt_scorer.set_options(
-        {
-            "comparison_a": 1.0,
-            "comparison_b": 0.0,
-        }
-    )
-
-    class ComparisonExample(Example):
-        comparison_a: str
-        comparison_b: str
-
-    # Create test examples
-    example1 = ComparisonExample(
-        comparison_a="Mike loves to play basketball because he passes with his teammates.",
-        comparison_b="Mike likes to play 1v1 basketball because he likes to show off his skills.",
-    )
-
-    example2 = ComparisonExample(
-        comparison_a="Mike loves to play singles tennis because he likes to only hit by himself and not with a partner and is selfish.",
-        comparison_b="Mike likes to play doubles tennis because he likes to coordinate with his partner.",
-    )
-
-    # Run evaluation
-    res = client.run_evaluation(
-        examples=[example1, example2],
-        scorers=[prompt_scorer],
-        model=DEFAULT_TOGETHER_MODEL,
-        project_name=project_name,
-        eval_run_name="test-custom-prompt-scorer",
-        override=True,
-    )
-
-    # Verify results
-    assert res[0].success, "Example 1 should pass classification"
-    assert not res[1].success, "Example 2 should fail classification"
-
-    # Print debug info if any test fails
-    print_debug_on_failure(res[0])
-    print_debug_on_failure(res[1])
-
-
 def print_debug_on_failure(result) -> bool:
     """
     Helper function to print debug info only on test failure
