@@ -471,7 +471,7 @@ async def get_evaluation_status(eval_name: str, project_name: str, judgment_api_
        error(f"Failed to check evaluation status: {str(e)}")
        raise JudgmentAPIError(f"Failed to check evaluation status: {str(e)}")

-async def _poll_evaluation_until_complete(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, poll_interval_seconds: int = 5, original_examples: Optional[List[Example]] = None, expected_scorers: Optional[List[Union[str, Any]]] = None) -> List[ScoringResult]:
+async def _poll_evaluation_until_complete(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, poll_interval_seconds: int = 5, original_examples: Optional[List[Example]] = None) -> List[ScoringResult]:
    """
    Polls until the evaluation is complete and returns the results.
@@ -483,8 +483,6 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
        poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
        original_examples (List[Example], optional): The original examples sent for evaluation.
            If provided, will match results with original examples.
-        expected_scorers (List[Union[str, Any]], optional): List of expected scorer names or scorer objects.
-            Used to verify all scorer data is present.

    Returns:
        List[ScoringResult]: The evaluation results
@@ -496,19 +494,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
        for example in original_examples:
            original_example_map[example.example_id] = example

-    # Extract expected scorer names if provided
-    expected_scorer_names = []
-    if expected_scorers:
-        for scorer in expected_scorers:
-            if isinstance(scorer, str):
-                expected_scorer_names.append(scorer)
-            elif hasattr(scorer, 'name'):
-                expected_scorer_names.append(scorer.name)
-            elif hasattr(scorer, 'score_type') and hasattr(scorer.score_type, 'value'):
-                expected_scorer_names.append(scorer.score_type.value)
-
-    debug(f"Expecting results for these scorers: {expected_scorer_names}")
-
+    # Remove the expected scorer names extraction and checking
+    # We'll instead verify all examples have consistent scorer data

    while True:
        poll_count += 1
        try:
@@ -567,6 +554,7 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud

            if "examples" in result_data:
                examples_data = result_data.get("examples", [])
+

                info(f"Successfully fetched {len(examples_data)} results for evaluation '{eval_name}'")

@@ -576,6 +564,7 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
                has_invalid_results = False
                for example_data in examples_data:
                    example_id = example_data.get("example_id")
+
                    if example_id not in original_example_map:
                        warning(f"Server returned example with ID {example_id} not found in original examples. " +
                                f"This indicates stale or incorrect data. Continuing to poll...")
@@ -594,32 +583,28 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
                                f"This indicates incomplete data. Continuing to poll...")
                        await asyncio.sleep(poll_interval_seconds)
                        continue
-
-                # Verify all scorer data is present if expected_scorer_names is provided
-                if expected_scorer_names:
-                    has_incomplete_scorer_data = False
+
+                # Collect all example IDs from scorer data
+                scorer_example_ids = set()
                for example_data in examples_data:
                    scorer_data_list = example_data.get("scorer_data", [])
-
-                    # Extract scorer names from the retrieved data
-                    retrieved_scorer_names = set()
                    for scorer_data in scorer_data_list:
-                        name = scorer_data.get("name")
-                        if name:
-                            retrieved_scorer_names.add(name)
-
-                    # Check if all expected scorers are present
-                    missing_scorers = set(expected_scorer_names) - retrieved_scorer_names
-                    if missing_scorers:
-                        example_id = example_data.get("example_id", "unknown")
-                        warning(f"Example {example_id} is missing scorer data for: {missing_scorers}. " +
-                                f"Continuing to poll for complete data...")
-                        has_incomplete_scorer_data = True
-                        break
+                        if "example_id" in scorer_data:
+                            scorer_example_ids.add(scorer_data["example_id"])
+
+                # Get the set of original example IDs
+                original_example_ids = set(original_example_map.keys())
+
+                # Check if the sets are equal
+                missing_in_scorer = original_example_ids - scorer_example_ids
+                extra_in_scorer = scorer_example_ids - original_example_ids

-                # If any example has incomplete scorer data, continue polling
-                if has_incomplete_scorer_data:
-                    info("Detected incomplete scorer data. Waiting before polling again...")
+                if missing_in_scorer or extra_in_scorer:
+                    if missing_in_scorer:
+                        warning(f"Examples missing in scorer data: {missing_in_scorer}")
+                    if extra_in_scorer:
+                        warning(f"Extra examples in scorer data: {extra_in_scorer}")
+                    info("Detected mismatched example IDs in scorer data. Waiting before polling again...")
                    await asyncio.sleep(poll_interval_seconds)
                    continue

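For reference, here is a minimal standalone sketch of the completeness check this hunk introduces. The loop and set arithmetic mirror the diff; the example IDs, inputs, and scorer name below are made up purely for illustration.

```python
# Hypothetical submitted examples, keyed by example_id (illustrative data only).
original_example_map = {
    "ex-1": {"input": "What is 2 + 2?"},
    "ex-2": {"input": "Name a prime number."},
}

# Simulated server response: scorer data has not arrived yet for "ex-2".
examples_data = [
    {"example_id": "ex-1", "scorer_data": [{"example_id": "ex-1", "name": "faithfulness"}]},
    {"example_id": "ex-2", "scorer_data": []},
]

# Collect every example ID that appears in the returned scorer data.
scorer_example_ids = set()
for example_data in examples_data:
    for scorer_data in example_data.get("scorer_data", []):
        if "example_id" in scorer_data:
            scorer_example_ids.add(scorer_data["example_id"])

# Compare against the IDs that were originally submitted.
original_example_ids = set(original_example_map.keys())
missing_in_scorer = original_example_ids - scorer_example_ids  # {"ex-2"}
extra_in_scorer = scorer_example_ids - original_example_ids    # set()

# Mirrors the polling loop: any mismatch means "keep waiting".
if missing_in_scorer or extra_in_scorer:
    print(f"Incomplete results, keep polling: missing={missing_in_scorer}, extra={extra_in_scorer}")
else:
    print("All scorer data present; results can be returned.")
```

Unlike the old per-scorer-name check, this only requires that scorer data exists for every submitted example, so it still works when the scorer list is not known at poll time.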
@@ -807,8 +792,7 @@ async def _async_evaluation_workflow():
            project_name=evaluation_run.project_name,
            judgment_api_key=evaluation_run.judgment_api_key,
            organization_id=evaluation_run.organization_id,
-            original_examples=evaluation_run.examples,  # Pass the original examples
-            expected_scorers=evaluation_run.scorers  # Pass the expected scorers for verification
+            original_examples=evaluation_run.examples  # Pass the original examples
        )

        # Create and return a task that can be awaited
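A rough sketch of the simplified call in isolation; the `eval_name` keyword comes from the function signature, but the `evaluation_run.eval_name` field and the wrapper function are assumptions for illustration, not part of this diff.

```python
# Hypothetical driver around the updated helper: only original_examples is
# forwarded for result matching, and there is no expected_scorers argument.
async def poll_results_demo(evaluation_run):
    return await _poll_evaluation_until_complete(
        eval_name=evaluation_run.eval_name,  # assumed field name, not shown in this hunk
        project_name=evaluation_run.project_name,
        judgment_api_key=evaluation_run.judgment_api_key,
        organization_id=evaluation_run.organization_id,
        poll_interval_seconds=5,
        original_examples=evaluation_run.examples,
    )
```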