     FaithfulnessScorer,
     InstructionAdherenceScorer,
     ExecutionOrderScorer,
-    PromptScorer,
 )
-from uuid import uuid4
 from judgeval.data import Example
 from judgeval.constants import DEFAULT_TOGETHER_MODEL

@@ -32,7 +30,6 @@ def test_ac_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )
     print_debug_on_failure(res[0])

@@ -58,7 +55,6 @@ def test_ar_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )

     print_debug_on_failure(res[0])
@@ -101,7 +97,6 @@ def test_faithfulness_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )

     print_debug_on_failure(res[0])
@@ -127,7 +122,6 @@ def test_instruction_adherence_scorer(client: JudgmentClient, project_name: str)
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )

     print_debug_on_failure(res[0])
@@ -160,146 +154,11 @@ def test_execution_order_scorer(client: JudgmentClient, project_name: str):
         model=DEFAULT_TOGETHER_MODEL,
         project_name=project_name,
         eval_run_name=EVAL_RUN_NAME,
-        override=True,
     )

     assert not res[0].success


-def test_prompt_scorer_without_options(client: JudgmentClient, project_name: str):
-    """Test prompt scorer functionality."""
-
-    prompt_scorer = PromptScorer.create(
-        name=f"Test Prompt Scorer Without Options {uuid4()}",
-        prompt="Question: {{input}}\nResponse: {{actual_output}}\n\nIs this response relevant to the question?",
-    )
-
-    relevant_example = Example(
-        input="What's the weather in New York?",
-        actual_output="The weather in New York is sunny.",
-    )
-
-    irrelevant_example = Example(
-        input="What's the capital of France?",
-        actual_output="The mitochondria is the powerhouse of the cell, and did you know that honey never spoils?",
-    )
-
-    # Run evaluation
-    res = client.run_evaluation(
-        examples=[relevant_example, irrelevant_example],
-        scorers=[prompt_scorer],
-        model=DEFAULT_TOGETHER_MODEL,
-        project_name=project_name,
-        eval_run_name="test-run-prompt-scorer-without-options",
-        override=True,
-    )
-
-    # Verify results
-    assert res[0].success, "Relevant example should pass classification"
-    assert not res[1].success, "Irrelevant example should fail classification"
-
-    print_debug_on_failure(res[0])
-    print_debug_on_failure(res[1])
-
-
-def test_prompt_scorer_with_options(client: JudgmentClient, project_name: str):
-    """Test prompt scorer functionality."""
-    # Creating a prompt scorer from SDK
-    prompt_scorer = PromptScorer.create(
-        name=f"Test Prompt Scorer {uuid4()}",
-        prompt="Question: {{input}}\nResponse: {{actual_output}}\n\nIs this response helpful?",
-        options={"yes": 1.0, "no": 0.0},
-    )
-
-    # Update the options with helpfulness classification choices
-    prompt_scorer.set_options(
-        {
-            "yes": 1.0,  # Helpful response
-            "no": 0.0,  # Unhelpful response
-        }
-    )
-
-    # Create test examples
-    helpful_example = Example(
-        input="What's the capital of France?",
-        actual_output="The capital of France is Paris.",
-    )
-
-    unhelpful_example = Example(
-        input="What's the capital of France?",
-        actual_output="I don't know much about geography, but I think it might be somewhere in Europe.",
-    )
-
-    # Run evaluation
-    res = client.run_evaluation(
-        examples=[helpful_example, unhelpful_example],
-        scorers=[prompt_scorer],
-        model=DEFAULT_TOGETHER_MODEL,
-        project_name=project_name,
-        eval_run_name="test-run-prompt-scorer-with-options",
-        override=True,
-    )
-
-    # Verify results
-    assert res[0].success, "Helpful example should pass classification"
-    assert not res[1].success, "Unhelpful example should fail classification"
-
-    # Print debug info if any test fails
-    print_debug_on_failure(res[0])
-    print_debug_on_failure(res[1])
-
-
-def test_custom_prompt_scorer(client: JudgmentClient, project_name: str):
-    """Test custom prompt scorer functionality."""
-    # Creating a custom prompt scorer from SDK
-    # Creating a prompt scorer from SDK
-    prompt_scorer = PromptScorer.create(
-        name=f"Test Prompt Scorer {uuid4()}",
-        prompt="Comparison A: {{comparison_a}}\nComparison B: {{comparison_b}}\n\nWhich candidate is better for a teammate?",
-        options={"comparison_a": 1.0, "comparison_b": 0.0},
-    )
-
-    prompt_scorer.set_options(
-        {
-            "comparison_a": 1.0,
-            "comparison_b": 0.0,
-        }
-    )
-
-    class ComparisonExample(Example):
-        comparison_a: str
-        comparison_b: str
-
-    # Create test examples
-    example1 = ComparisonExample(
-        comparison_a="Mike loves to play basketball because he passes with his teammates.",
-        comparison_b="Mike likes to play 1v1 basketball because he likes to show off his skills.",
-    )
-
-    example2 = ComparisonExample(
-        comparison_a="Mike loves to play singles tennis because he likes to only hit by himself and not with a partner and is selfish.",
-        comparison_b="Mike likes to play doubles tennis because he likes to coordinate with his partner.",
-    )
-
-    # Run evaluation
-    res = client.run_evaluation(
-        examples=[example1, example2],
-        scorers=[prompt_scorer],
-        model=DEFAULT_TOGETHER_MODEL,
-        project_name=project_name,
-        eval_run_name="test-custom-prompt-scorer",
-        override=True,
-    )
-
-    # Verify results
-    assert res[0].success, "Example 1 should pass classification"
-    assert not res[1].success, "Example 2 should fail classification"
-
-    # Print debug info if any test fails
-    print_debug_on_failure(res[0])
-    print_debug_on_failure(res[1])
-
-
 def print_debug_on_failure(result) -> bool:
     """
     Helper function to print debug info only on test failure