JudgmentLabs · abhishekg999 · May 26, 2025 · May 21, 2025 · May 22, 2025 · May 24, 2025
diff --git a/docs/monitoring/tracing.mdx b/docs/monitoring/tracing.mdx
@@ -87,7 +87,7 @@ const client = wrap(new OpenAI());
 #### `@observe` (Python) / `observe()` (Typescript) ####
 The `@observe` decorator (Python) or the `observe()` higher-order function (Typescript) wraps your functions/tools and captures metadata surrounding your function calls, such as:
 - Latency
-- Input/Output
+- Input/Output/Error
 - Span type (e.g. `retriever`, `tool`, `LLM call`, etc.)
 
 Here's an example of using the observer mechanism:

diff --git a/src/judgeval/common/tracer.py b/src/judgeval/common/tracer.py
@@ -498,7 +498,27 @@ def record_output(self, output: Any):
             return span # Return the created entry
         # Removed else block - original didn't have one
         return None # Return None if no span_id found
-
+
+    def record_error(self, error: Any):
+        """Attach error details to the span that is current in this context.
+
+        Args:
+            error: Error payload to store on the span's ``error`` attribute.
+
+        Returns:
+            The updated span, or None when there is no current span ID or the
+            ID is not present in ``span_id_to_span``.
+        """
+        current_span_id = current_span_var.get()
+        if not current_span_id:
+            return None
+        # Error recording runs while the caller's exception is being handled;
+        # an unknown span ID must not raise KeyError and mask that exception.
+        span = self.span_id_to_span.get(current_span_id)
+        if span is not None:
+            span.error = error
+        return span
+
     def add_span(self, span: TraceSpan):
         """Add a trace span to this trace context"""
         self.trace_spans.append(span)
@@ -677,7 +685,30 @@ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
     def delete(self):
         return self.trace_manager_client.delete_trace(self.trace_id)
 
-
+
+def _capture_exception_for_trace(
+    current_trace: Optional['TraceClient'],
+    exc_info: Tuple[Optional[type], Optional[BaseException], Optional[types.TracebackType]],
+) -> None:
+    """Record the exception described by ``exc_info`` on ``current_trace``.
+
+    Args:
+        current_trace: Active trace client; nothing is recorded when it is falsy.
+        exc_info: ``(type, value, traceback)`` triple, e.g. from ``sys.exc_info()``.
+            Any element may be None; a placeholder is stored in its place.
+    """
+    if not current_trace:
+        return
+
+    exc_type, exc_value, exc_traceback_obj = exc_info
+    formatted_exception = {
+        "type": exc_type.__name__ if exc_type is not None else "UnknownExceptionType",
+        "message": str(exc_value) if exc_value is not None else "No exception message",
+        "traceback": traceback.format_tb(exc_traceback_obj) if exc_traceback_obj is not None else [],
+    }
+    current_trace.record_error(formatted_exception)
+
+
 class _DeepTracer:
     _instance: Optional["_DeepTracer"] = None
     _lock: threading.Lock = threading.Lock()
@@ -869,16 +887,11 @@ def _trace(self, frame: types.FrameType, event: str, arg: Any):
                 current_span_var.reset(frame.f_locals["_judgment_span_token"])
 
         elif event == "exception":
-            exc_type, exc_value, exc_traceback = arg
-            formatted_exception = {
-                "type": exc_type.__name__,
-                "message": str(exc_value),
-                "traceback": traceback.format_tb(exc_traceback)
-            }
-            current_trace = current_trace_var.get()
-            current_trace.record_output({
-                "error": formatted_exception
-            })
+            exc_type = arg[0]
+            # Skip iterator-exhaustion signals, but fall through so `self._trace` is still returned.
+            if not issubclass(exc_type, (StopIteration, StopAsyncIteration, GeneratorExit)):
+                _capture_exception_for_trace(current_trace, arg)
+
 
         return self._trace
 
@@ -1154,8 +1167,12 @@ async def async_wrapper(*args, **kwargs):
                                 with _DeepTracer():
                                     result = await func(*args, **kwargs)
                             else:
-                                result = await func(*args, **kwargs)
-
+                                try:
+                                    result = await func(*args, **kwargs)
+                                except Exception:
+                                    _capture_exception_for_trace(current_trace, sys.exc_info())
+                                    raise  # bare re-raise: `raise e` would add this line to the traceback
+
                             # Record output
                             span.record_output(result)
                         return result
@@ -1175,7 +1192,11 @@ async def async_wrapper(*args, **kwargs):
                             with _DeepTracer():
                                 result = await func(*args, **kwargs)
                         else:
-                            result = await func(*args, **kwargs)
+                            try:
+                                result = await func(*args, **kwargs)
+                            except Exception:
+                                _capture_exception_for_trace(current_trace, sys.exc_info())
+                                raise  # bare re-raise: `raise e` would add this line to the traceback
 
                         span.record_output(result)
                     return result
@@ -1221,7 +1242,11 @@ def wrapper(*args, **kwargs):
                                 with _DeepTracer():
                                     result = func(*args, **kwargs)
                             else:
-                                result = func(*args, **kwargs)
+                                try:
+                                    result = func(*args, **kwargs)
+                                except Exception:
+                                    _capture_exception_for_trace(current_trace, sys.exc_info())
+                                    raise  # bare re-raise: `raise e` would add this line to the traceback
 
                             # Record output
                             span.record_output(result)
@@ -1243,7 +1268,11 @@ def wrapper(*args, **kwargs):
                             with _DeepTracer():
                                 result = func(*args, **kwargs)
                         else:
-                            result = func(*args, **kwargs)
+                            try:
+                                result = func(*args, **kwargs)
+                            except Exception:
+                                _capture_exception_for_trace(current_trace, sys.exc_info())
+                                raise  # bare re-raise: `raise e` would add this line to the traceback
 
                         span.record_output(result)
                     return result

diff --git a/src/judgeval/data/trace.py b/src/judgeval/data/trace.py
@@ -14,6 +14,7 @@ class TraceSpan(BaseModel):
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
+    error: Optional[Dict[str, Any]] = None
     output: Optional[Any] = None
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
@@ -26,10 +27,10 @@ def model_dump(self, **kwargs):
             "span_id": self.span_id,
             "trace_id": self.trace_id,
             "depth": self.depth,
-#             "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
             "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
-            "inputs": self._serialize_inputs(),
-            "output": self._serialize_output(),
+            "inputs": self._serialize_value(self.inputs) or {},  # keep {} (not null) when inputs is unset, as before
+            "output": self._serialize_value(self.output),
+            "error": self._serialize_value(self.error),
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
             "parent_span_id": self.parent_span_id,
             "function": self.function,
@@ -42,30 +43,6 @@ def print_span(self):
         indent = "  " * self.depth
         parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
-
-    def _serialize_inputs(self) -> dict:
-        """Helper method to serialize input data safely."""
-        if self.inputs is None:
-            return {}
-
-        serialized_inputs = {}
-        for key, value in self.inputs.items():
-            if isinstance(value, BaseModel):
-                serialized_inputs[key] = value.model_dump()
-            elif isinstance(value, (list, tuple)):
-                # Handle lists/tuples of arguments
-                serialized_inputs[key] = [
-                    item.model_dump() if isinstance(item, BaseModel)
-                    else None if not self._is_json_serializable(item)
-                    else item
-                    for item in value
-                ]
-            else:
-                if self._is_json_serializable(value):
-                    serialized_inputs[key] = value
-                else:
-                    serialized_inputs[key] = self.safe_stringify(value, self.function)
-        return serialized_inputs
 
     def _is_json_serializable(self, obj: Any) -> bool:
         """Helper method to check if an object is JSON serializable."""
@@ -94,9 +71,9 @@ def safe_stringify(self, output, function_name):
         )
         return None
 
-    def _serialize_output(self) -> Any:
-        """Helper method to serialize output data safely."""
-        if self.output is None:
+    def _serialize_value(self, value: Any) -> Any:
+        """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
+        if value is None:
             return None
 
         def serialize_value(value):
@@ -117,8 +94,8 @@ def serialize_value(value):
                     # Fallback to safe stringification
                     return self.safe_stringify(value, self.function)
 
-        # Start serialization with the top-level output
-        return serialize_value(self.output)
+        # Start serialization with the top-level value
+        return serialize_value(value)
 
 class Trace(BaseModel):
     trace_id: str