Commit 6e34acb

Merge pull request #26 from JudgmentLabs/alex/add-unit-tests
Add UT for Data and Scorers library
2 parents 685f864 + 50e2c9a commit 6e34acb

20 files changed: +2697 -75 lines

Pipfile

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,8 @@ uvicorn = "*"
 deepeval = "*"
 supabase = "*"
 requests = "*"
+pandas = "*"
+anthropic = "*"
 
 [dev-packages]
 pytest = "*"

judgeval/constants.py

Lines changed: 8 additions & 2 deletions
@@ -5,7 +5,7 @@
 from enum import Enum
 import litellm
 
-class APIScorer(Enum):
+class APIScorer(str, Enum):
     """
     Collection of proprietary scorers implemented by Judgment.
@@ -20,7 +20,13 @@ class APIScorer(Enum):
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
     TOOL_CORRECTNESS = "tool_correctness"
-
+
+    @classmethod
+    def _missing_(cls, value):
+        # Handle case-insensitive lookup
+        for member in cls:
+            if member.value == value.lower():
+                return member
 
 ROOT_API = "http://127.0.0.1:8000"
 # ROOT_API = "https://api.judgmentlabs.ai" # TODO replace this with the actual API root
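
The str mixin plus the _missing_ hook make APIScorer values usable directly as strings and tolerant of uppercase lookups. A minimal, self-contained sketch of the intended behavior, assuming only the members shown in this hunk:

from enum import Enum

class APIScorer(str, Enum):
    CONTEXTUAL_RELEVANCY = "contextual_relevancy"
    CONTEXTUAL_PRECISION = "contextual_precision"
    TOOL_CORRECTNESS = "tool_correctness"

    @classmethod
    def _missing_(cls, value):
        # Enum calls this when a by-value lookup fails; lowercasing the
        # input lets "TOOL_CORRECTNESS" resolve to the same member.
        for member in cls:
            if member.value == value.lower():
                return member

assert APIScorer("TOOL_CORRECTNESS") is APIScorer.TOOL_CORRECTNESS
# Because of the str mixin, members compare equal to their raw values,
# which keeps JSON payloads simple.
assert APIScorer.TOOL_CORRECTNESS == "tool_correctness"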

judgeval/data/api_example.py

Lines changed: 26 additions & 30 deletions
@@ -13,28 +13,24 @@ class ProcessExample(BaseModel):
     """
     name: str
     input: Optional[str] = None
-    actual_output: Optional[str] = Field(None, alias="actualOutput")
-    expected_output: Optional[str] = Field(None, alias="expectedOutput")
-    context: Optional[list] = Field(None)
-    retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
-    tools_called: Optional[list] = Field(None, alias="toolsCalled")
-    expected_tools: Optional[list] = Field(None, alias="expectedTools")
+    actual_output: Optional[str] = None
+    expected_output: Optional[str] = None
+    context: Optional[list] = None
+    retrieval_context: Optional[list] = None
+    tools_called: Optional[list] = None
+    expected_tools: Optional[list] = None
 
     # make these optional, not all test cases in a conversation will be evaluated
-    success: Union[bool, None] = Field(None)
-    scorers_data: Union[List[ScorerData], None] = Field(
-        None, alias="scorersData"
-    )
-    run_duration: Union[float, None] = Field(None, alias="runDuration")
-    evaluation_cost: Union[float, None] = Field(None, alias="evaluationCost")
+    success: Optional[bool] = None
+    scorers_data: Optional[List[ScorerData]] = None
+    run_duration: Optional[float] = None
+    evaluation_cost: Optional[float] = None
 
-    order: Union[int, None] = Field(None)
+    order: Optional[int] = None
     # These should map 1 to 1 from golden
-    additional_metadata: Optional[Dict] = Field(
-        None, alias="additionalMetadata"
-    )
-    comments: Optional[str] = Field(None)
-    trace_id: Optional[str] = Field(None)
+    additional_metadata: Optional[Dict] = None
+    comments: Optional[str] = None
+    trace_id: Optional[str] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     def update_scorer_data(self, scorer_data: ScorerData):
@@ -65,12 +61,12 @@ def update_run_duration(self, run_duration: float):
     @model_validator(mode="before")
     def check_input(cls, values: Dict[str, Any]):
         input = values.get("input")
-        actual_output = values.get("actualOutput")
+        actual_output = values.get("actual_output")
 
         if (input is None or actual_output is None):
-            error(f"Validation error: Required fields missing. input={input}, actualOutput={actual_output}")
+            error(f"Validation error: Required fields missing. input={input}, actual_output={actual_output}")
             raise ValueError(
-                "'input' and 'actualOutput' must be provided."
+                "'input' and 'actual_output' must be provided."
             )
 
         return values
@@ -97,18 +93,18 @@ def create_process_example(
     process_ex = ProcessExample(
         name=name,
        input=example.input,
-        actualOutput=example.actual_output,
-        expectedOutput=example.expected_output,
+        actual_output=example.actual_output,
+        expected_output=example.expected_output,
         context=example.context,
-        retrievalContext=example.retrieval_context,
-        toolsCalled=example.tools_called,
-        expectedTools=example.expected_tools,
+        retrieval_context=example.retrieval_context,
+        tools_called=example.tools_called,
+        expected_tools=example.expected_tools,
         success=success,
-        scorersData=scorers_data,
-        runDuration=None,
-        evaluationCost=None,
+        scorers_data=scorers_data,
+        run_duration=None,
+        evaluation_cost=None,
         order=order,
-        additionalMetadata=example.additional_metadata,
+        additional_metadata=example.additional_metadata,
         trace_id=example.trace_id
     )
     return process_ex
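
With the camelCase aliases gone, ProcessExample is constructed and validated purely with snake_case field names, and the before-validator rejects examples missing input or actual_output. An illustrative sketch (field values are hypothetical; it assumes ProcessExample is importable from the module shown above):

from judgeval.data.api_example import ProcessExample

ok = ProcessExample(
    name="summarize-ticket",                      # hypothetical data
    input="Summarize the support ticket",
    actual_output="The ticket reports a login failure.",
)

try:
    ProcessExample(name="bad", input="Summarize the support ticket")
except ValueError as err:
    # check_input raised because actual_output was not provided;
    # pydantic surfaces it as a validation error.
    print(err)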

judgeval/data/datasets/utils.py

Lines changed: 9 additions & 0 deletions
@@ -14,6 +14,11 @@ def examples_to_ground_truths(examples: List[Example]) -> List[GroundTruthExampl
     Returns:
         List[GroundTruthExample]: A list of `GroundTruthExample` objects.
     """
+
+    if not isinstance(examples, list):
+        raise TypeError("Input should be a list of `Example` objects")
+
+    ground_truths = []
     ground_truths = []
     for e in examples:
         g_truth = {
@@ -45,6 +50,10 @@ def ground_truths_to_examples(
     Returns:
         List[Example]: A list of `Example` objects.
     """
+
+    if not isinstance(ground_truths, list):
+        raise TypeError("Input should be a list of `GroundTruthExample` objects")
+
     examples = []
     for index, ground_truth in enumerate(ground_truths):
         e = Example(
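
Both helpers now fail fast on the wrong container type instead of failing somewhere inside the loop. A quick sketch of the new guard's behavior (assuming the helpers are importable from the module shown above):

from judgeval.data.datasets.utils import examples_to_ground_truths

try:
    examples_to_ground_truths("not a list")  # deliberately the wrong type
except TypeError as err:
    print(err)  # Input should be a list of `Example` objects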

judgeval/data/example.py

Lines changed: 0 additions & 35 deletions
@@ -37,41 +37,6 @@ class Example(BaseModel):
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
 
-    def __post_init__(self):
-        # Ensure `context` is None or a list of strings
-        if self.context is not None:
-            if not isinstance(self.context, list) or not all(
-                isinstance(item, str) for item in self.context
-            ):
-                raise TypeError("'context' must be None or a list of strings")
-
-        # Ensure `retrieval_context` is None or a list of strings
-        if self.retrieval_context is not None:
-            if not isinstance(self.retrieval_context, list) or not all(
-                isinstance(item, str) for item in self.retrieval_context
-            ):
-                raise TypeError(
-                    "'retrieval_context' must be None or a list of strings"
-                )
-
-        # Ensure `tools_called` is None or a list of strings
-        if self.tools_called is not None:
-            if not isinstance(self.tools_called, list) or not all(
-                isinstance(item, str) for item in self.tools_called
-            ):
-                raise TypeError(
-                    "'tools_called' must be None or a list of strings"
-                )
-
-        # Ensure `expected_tools` is None or a list of strings
-        if self.expected_tools is not None:
-            if not isinstance(self.expected_tools, list) or not all(
-                isinstance(item, str) for item in self.expected_tools
-            ):
-                raise TypeError(
-                    "'expected_tools' must be None or a list of strings"
-                )
-
     def __init__(self, **data):
         super().__init__(**data)
         # Set timestamp if not provided
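
The deleted __post_init__ is a dataclasses hook, so Pydantic's BaseModel never invoked it in the first place; the removal drops dead code rather than behavior. A small standalone illustration of that difference:

from dataclasses import dataclass
from pydantic import BaseModel

@dataclass
class AsDataclass:
    x: int

    def __post_init__(self):
        print("dataclass hook runs")

class AsModel(BaseModel):
    x: int

    def __post_init__(self):
        print("never reached")

AsDataclass(x=1)  # prints "dataclass hook runs"
AsModel(x=1)      # prints nothing; BaseModel ignores __post_init__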

judgeval/scorers/custom_scorer.py

Lines changed: 2 additions & 3 deletions
@@ -9,7 +9,6 @@
 from abc import abstractmethod
 
 from judgeval.common.logger import debug, info, warning, error
-from judgeval.data import Example
 from judgeval.judges import judgevalJudge
 from judgeval.judges.utils import create_judge
 
@@ -84,7 +83,7 @@ def _add_model(self, model: Optional[Union[str, List[str], judgevalJudge]] = Non
         self.evaluation_model = self.model.get_model_name()
 
     @abstractmethod
-    def score_example(self, example: Example, *args, **kwargs) -> float:
+    def score_example(self, example, *args, **kwargs) -> float:
         """
         Measures the score on a single example
         """
@@ -93,7 +92,7 @@ def score_example(self, example: Example, *args, **kwargs) -> float:
         raise NotImplementedError("You must implement the `score` method in your custom scorer")
 
     @abstractmethod
-    async def a_score_example(self, example: Example, *args, **kwargs) -> float:
+    async def a_score_example(self, example, *args, **kwargs) -> float:
         """
         Asynchronously measures the score on a single example
         """

judgeval/scorers/prompt_scorer.py

Lines changed: 2 additions & 2 deletions
@@ -68,7 +68,7 @@ def score_example(
         """
         Synchronous method for scoring an example using the prompt criteria.
         """
-        with scorer_progress_meter(self, _show_indicator=_show_indicator):
+        with scorer_progress_meter(self, display_meter=_show_indicator):
             if self.async_mode:
                 loop = get_or_create_event_loop()
                 loop.run_until_complete(
@@ -217,7 +217,7 @@ def enforce_prompt_format(self, judge_prompt: List[dict], schema: dict):
         # create formatting string for schema enforcement
         # schema is a map between key and type of the value
         for key, key_type in schema.items():
-            SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type}), '
+            SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type.__name__}), '
         SCHEMA_ENFORCEMENT_PROMPT = SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}"  # remove trailing comma and space
         judge_prompt[0]["content"] += SCHEMA_ENFORCEMENT_PROMPT
         return judge_prompt
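
The diff does not show how SCHEMA_ENFORCEMENT_PROMPT is initialized, so the sketch below reproduces only the loop, assuming the schema maps keys to Python types; key_type.__name__ renders "float" rather than "<class 'float'>" in the judge prompt:

schema = {"score": float, "reason": str}  # hypothetical schema

suffix = ""
for key, key_type in schema.items():
    # __name__ gives the bare type name instead of the full class repr
    suffix += f'"{key}": <{key}> ({key_type.__name__}), '
suffix = suffix[:-2] + "}"  # remove trailing comma and space

print(suffix)
# "score": <score> (float), "reason": <reason> (str)}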

judgeval/scorers/score.py

Lines changed: 10 additions & 3 deletions
@@ -273,8 +273,15 @@ async def a_execute_scoring(
     semaphore = asyncio.Semaphore(max_concurrent)
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
-        async with semaphore:
-            return await func(*args, **kwargs)
+        try:
+            async with semaphore:
+                return await func(*args, **kwargs)
+        except Exception as e:
+            error(f"Error executing function: {e}")
+            if kwargs.get('ignore_errors', False):
+                # Return None when ignoring errors
+                return None
+            raise
 
     if verbose_mode is not None:
         for scorer in scorers:
@@ -406,7 +413,7 @@ async def a_eval_examples_helper(
     # the results and update the process example with the scorer data
     for scorer in scorers:
         # At this point, the scorer has been executed and already contains data.
-        if scorer.skipped:
+        if getattr(scorer, 'skipped', False):
             continue
 
         scorer_data = create_scorer_data(scorer)  # Fetch scorer data from completed scorer evaluation
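
The wrapper above bounds concurrency with a shared semaphore and, when the caller passes ignore_errors, turns a failed scorer run into a None result instead of cancelling the whole batch. A standalone sketch of the same pattern (names are illustrative; here ignore_errors is a plain parameter rather than a key read from the wrapped call's kwargs):

import asyncio

async def run_all(coro_fns, max_concurrent=5, ignore_errors=True):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def execute_with_semaphore(coro_fn):
        try:
            async with semaphore:      # at most max_concurrent in flight
                return await coro_fn()
        except Exception as e:
            print(f"Error executing function: {e}")
            if ignore_errors:
                return None            # keep the other tasks running
            raise

    return await asyncio.gather(*(execute_with_semaphore(fn) for fn in coro_fns))

async def flaky(i):
    if i == 2:
        raise RuntimeError("boom")
    return i

print(asyncio.run(run_all([lambda i=i: flaky(i) for i in range(4)])))
# -> [0, 1, None, 3]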
