Skip to content

Commit 6a7b924

Browse files
committed
cumulative cost tracking
1 parent 2f4e1a2 commit 6a7b924

File tree

4 files changed

+152
-37
lines changed


src/judgeval/tracer/__init__.py

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,14 @@
6060
Cls = TypeVar("Cls", bound=Type)
6161
ApiClient = TypeVar("ApiClient", bound=Any)
6262

63-
_current_agent_context: ContextVar[Optional[Dict[str, str]]] = ContextVar(
63+
_current_agent_context: ContextVar[Optional[Dict[str, str | bool]]] = ContextVar(
6464
"current_agent_context", default=None
6565
)
6666

67+
_current_cost_context: ContextVar[Optional[Dict[str, float]]] = ContextVar(
68+
"current_cost_context", default=None
69+
)
70+
6771

6872
def resolve_project_id(
6973
api_key: str, organization_id: str, project_name: str
@@ -207,7 +211,27 @@ def get_current_span(self):
207211
def get_tracer(self):
208212
return self.tracer
209213

210-
def _add_agent_attributes_to_span(
214+
def get_current_agent_context(self):
    """Return the module-level ContextVar holding the current agent context.

    Note: returns the ContextVar itself, not its value — callers use
    ``.get()`` / ``.set()`` on the result.
    """
    agent_context_var = _current_agent_context
    return agent_context_var
217+
def get_current_cost_context(self):
    """Return the module-level ContextVar holding the cumulative-cost context.

    Note: returns the ContextVar itself, not its value — callers use
    ``.get()`` / ``.set()`` / ``.reset()`` on the result.
    """
    cost_context_var = _current_cost_context
    return cost_context_var
220+
def add_cost_to_current_context(self, cost: Optional[float]) -> None:
    """Add *cost* to the current cost context and mirror the running total
    onto the active span.

    Args:
        cost: Incremental LLM cost in USD. ``None`` is treated as ``0.0`` —
            callers pass ``usage.total_cost_usd``, which providers may leave
            unset, and the previous implementation raised ``TypeError``
            (``float + None``) in that case.

    Silently does nothing when no cost context has been seeded (i.e. we are
    not inside a span context manager that installed one).
    """
    if cost is None:
        # Some providers report no pricing; treat missing cost as zero
        # instead of crashing the traced call.
        cost = 0.0
    current_cost_context = _current_cost_context.get()
    if current_cost_context is not None:
        current_cumulative_cost = current_cost_context.get("cumulative_cost", 0.0)
        new_cumulative_cost = float(current_cumulative_cost) + cost
        current_cost_context["cumulative_cost"] = new_cumulative_cost

        # Keep the span attribute in sync so the cumulative cost is visible
        # on the trace even before the span closes.
        span = get_current_span()
        if span and span.is_recording():
            span.set_attribute(
                AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST, new_cumulative_cost
            )
233+
234+
def add_agent_attributes_to_span(
211235
self, span, attributes: Optional[Dict[str, Any]] = None
212236
):
213237
"""Add agent ID, class name, and instance name to span if they exist in context"""
@@ -238,7 +262,7 @@ def _add_agent_attributes_to_span(
238262
current_agent_context["is_agent_entry_point"],
239263
)
240264
current_agent_context["is_agent_entry_point"] = (
241-
"false" # only true for entry point to agent
265+
False # only true for entry point to agent
242266
)
243267

244268
def _wrap_sync(
@@ -248,7 +272,7 @@ def _wrap_sync(
248272
def wrapper(*args, **kwargs):
249273
n = name or f.__qualname__
250274
with sync_span_context(self, n, attributes) as span:
251-
self._add_agent_attributes_to_span(span, attributes)
275+
self.add_agent_attributes_to_span(span, attributes)
252276
try:
253277
span.set_attribute(
254278
AttributeKeys.JUDGMENT_INPUT,
@@ -276,7 +300,7 @@ def _wrap_async(
276300
async def wrapper(*args, **kwargs):
277301
n = name or f.__qualname__
278302
with sync_span_context(self, n, attributes) as span:
279-
self._add_agent_attributes_to_span(span, attributes)
303+
self.add_agent_attributes_to_span(span, attributes)
280304
try:
281305
span.set_attribute(
282306
AttributeKeys.JUDGMENT_INPUT,
@@ -390,7 +414,7 @@ async def async_wrapper(*args, **kwargs):
390414
agent_context["parent_agent_id"] = current_agent_context[
391415
"agent_id"
392416
]
393-
agent_context["is_agent_entry_point"] = "true"
417+
agent_context["is_agent_entry_point"] = True
394418
token = _current_agent_context.set(agent_context)
395419
try:
396420
return await f(*args, **kwargs)
@@ -418,7 +442,7 @@ def sync_wrapper(*args, **kwargs):
418442
agent_context["parent_agent_id"] = current_agent_context[
419443
"agent_id"
420444
]
421-
agent_context["is_agent_entry_point"] = "true"
445+
agent_context["is_agent_entry_point"] = True
422446
token = _current_agent_context.set(agent_context)
423447
try:
424448
return f(*args, **kwargs)

src/judgeval/tracer/keys.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class AttributeKeys:
2020
JUDGMENT_AGENT_CLASS_NAME = "judgment.agent_class_name"
2121
JUDGMENT_AGENT_INSTANCE_NAME = "judgment.agent_instance_name"
2222
JUDGMENT_IS_AGENT_ENTRY_POINT = "judgment.is_agent_entry_point"
23+
JUDGMENT_CUMULATIVE_LLM_COST = "judgment.cumulative_llm_cost"
2324

2425
# GenAI-specific attributes (semantic conventions)
2526
GEN_AI_PROMPT = gen_ai_attributes.GEN_AI_PROMPT
@@ -34,6 +35,10 @@ class AttributeKeys:
3435
GEN_AI_REQUEST_MAX_TOKENS = gen_ai_attributes.GEN_AI_REQUEST_MAX_TOKENS
3536
GEN_AI_RESPONSE_FINISH_REASONS = gen_ai_attributes.GEN_AI_RESPONSE_FINISH_REASONS
3637

38+
# GenAI-specific attributes (custom namespace)
39+
GEN_AI_USAGE_TOTAL_TOKENS = "gen_ai.usage.total_tokens"
40+
GEN_AI_USAGE_TOTAL_COST = "gen_ai.usage.total_cost"
41+
3742

3843
class ResourceKeys:
3944
SERVICE_NAME = ResourceAttributes.SERVICE_NAME

src/judgeval/tracer/llm/__init__.py

Lines changed: 80 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22
import functools
3-
import sys
43
from typing import Callable, Tuple, Optional, Any, TYPE_CHECKING
54
from functools import wraps
65
from judgeval.data.trace import TraceUsage
@@ -55,6 +54,9 @@ def wrapper(*args, **kwargs):
5554
with sync_span_context(
5655
tracer, span_name, {AttributeKeys.SPAN_TYPE: "llm"}
5756
) as span:
57+
tracer.add_agent_attributes_to_span(
58+
span, {AttributeKeys.SPAN_TYPE: "llm"}
59+
)
5860
span.set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
5961
try:
6062
response = function(*args, **kwargs)
@@ -76,6 +78,18 @@ def wrapper(*args, **kwargs):
7678
AttributeKeys.GEN_AI_USAGE_COMPLETION_TOKENS,
7779
usage.completion_tokens,
7880
)
81+
if usage.total_tokens:
82+
span.set_attribute(
83+
AttributeKeys.GEN_AI_USAGE_TOTAL_TOKENS,
84+
usage.total_tokens,
85+
)
86+
if usage.total_cost_usd:
87+
span.set_attribute(
88+
AttributeKeys.GEN_AI_USAGE_TOTAL_COST,
89+
usage.total_cost_usd,
90+
)
91+
# Add cost to cumulative context tracking
92+
tracer.add_cost_to_current_context(usage.total_cost_usd)
7993
return response
8094
except Exception as e:
8195
span.record_exception(e)
@@ -89,6 +103,9 @@ async def wrapper(*args, **kwargs):
89103
async with async_span_context(
90104
tracer, span_name, {AttributeKeys.SPAN_TYPE: "llm"}
91105
) as span:
106+
tracer.add_agent_attributes_to_span(
107+
span, {AttributeKeys.SPAN_TYPE: "llm"}
108+
)
92109
span.set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
93110
try:
94111
response = await function(*args, **kwargs)
@@ -110,6 +127,17 @@ async def wrapper(*args, **kwargs):
110127
AttributeKeys.GEN_AI_USAGE_COMPLETION_TOKENS,
111128
usage.completion_tokens,
112129
)
130+
if usage.total_tokens:
131+
span.set_attribute(
132+
AttributeKeys.GEN_AI_USAGE_TOTAL_TOKENS,
133+
usage.total_tokens,
134+
)
135+
if usage.total_cost_usd:
136+
span.set_attribute(
137+
AttributeKeys.GEN_AI_USAGE_TOTAL_COST,
138+
usage.total_cost_usd,
139+
)
140+
tracer.add_cost_to_current_context(usage.total_cost_usd)
113141
return response
114142
except Exception as e:
115143
span.record_exception(e)
@@ -160,9 +188,9 @@ async def wrapper(*args, **kwargs):
160188
)
161189

162190
assert google_genai_Client is not None, "Google GenAI client not found"
163-
assert (
164-
google_genai_AsyncClient is not None
165-
), "Google GenAI async client not found"
191+
assert google_genai_AsyncClient is not None, (
192+
"Google GenAI async client not found"
193+
)
166194
if isinstance(client, google_genai_Client):
167195
setattr(client.models, "generate_content", wrapped(original_create))
168196
elif isinstance(client, google_genai_AsyncClient):
@@ -225,9 +253,9 @@ def _get_client_config(client: ApiClient) -> tuple[str, Callable]:
225253
)
226254

227255
assert google_genai_Client is not None, "Google GenAI client not found"
228-
assert (
229-
google_genai_AsyncClient is not None
230-
), "Google GenAI async client not found"
256+
assert google_genai_AsyncClient is not None, (
257+
"Google GenAI async client not found"
258+
)
231259
if isinstance(client, google_genai_Client):
232260
return "GOOGLE_API_CALL", client.models.generate_content
233261
elif isinstance(client, google_genai_AsyncClient):
@@ -269,9 +297,9 @@ def _format_output_data(
269297
assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
270298
assert openai_ChatCompletion is not None, "OpenAI chat completion not found"
271299
assert openai_Response is not None, "OpenAI response not found"
272-
assert (
273-
openai_ParsedChatCompletion is not None
274-
), "OpenAI parsed chat completion not found"
300+
assert openai_ParsedChatCompletion is not None, (
301+
"OpenAI parsed chat completion not found"
302+
)
275303

276304
if isinstance(client, openai_OpenAI) or isinstance(client, openai_AsyncOpenAI):
277305
if isinstance(response, openai_ChatCompletion):
@@ -318,7 +346,11 @@ def _format_output_data(
318346
else 0
319347
)
320348
output0 = response.output[0]
321-
if hasattr(output0, "content") and output0.content and hasattr(output0.content, "__iter__"): # type: ignore[attr-defined]
349+
if (
350+
hasattr(output0, "content")
351+
and output0.content
352+
and hasattr(output0.content, "__iter__")
353+
): # type: ignore[attr-defined]
322354
message_content = "".join(
323355
seg.text # type: ignore[attr-defined]
324356
for seg in output0.content # type: ignore[attr-defined]
@@ -346,9 +378,23 @@ def _format_output_data(
346378
client, together_AsyncTogether
347379
):
348380
model_name = (response.model or "") if hasattr(response, "model") else ""
349-
prompt_tokens = response.usage.prompt_tokens if hasattr(response.usage, "prompt_tokens") and response.usage.prompt_tokens is not None else 0 # type: ignore[attr-defined]
350-
completion_tokens = response.usage.completion_tokens if hasattr(response.usage, "completion_tokens") and response.usage.completion_tokens is not None else 0 # type: ignore[attr-defined]
351-
message_content = response.choices[0].message.content if hasattr(response, "choices") else None # type: ignore[attr-defined]
381+
prompt_tokens = (
382+
response.usage.prompt_tokens
383+
if hasattr(response.usage, "prompt_tokens")
384+
and response.usage.prompt_tokens is not None
385+
else 0
386+
) # type: ignore[attr-defined]
387+
completion_tokens = (
388+
response.usage.completion_tokens
389+
if hasattr(response.usage, "completion_tokens")
390+
and response.usage.completion_tokens is not None
391+
else 0
392+
) # type: ignore[attr-defined]
393+
message_content = (
394+
response.choices[0].message.content
395+
if hasattr(response, "choices")
396+
else None
397+
) # type: ignore[attr-defined]
352398

353399
if model_name:
354400
return message_content, _create_usage(
@@ -366,9 +412,9 @@ def _format_output_data(
366412
)
367413

368414
assert google_genai_Client is not None, "Google GenAI client not found"
369-
assert (
370-
google_genai_AsyncClient is not None
371-
), "Google GenAI async client not found"
415+
assert google_genai_AsyncClient is not None, (
416+
"Google GenAI async client not found"
417+
)
372418
if isinstance(client, google_genai_Client) or isinstance(
373419
client, google_genai_AsyncClient
374420
):
@@ -467,9 +513,23 @@ def _format_output_data(
467513
assert groq_AsyncGroq is not None, "Groq async client not found"
468514
if isinstance(client, groq_Groq) or isinstance(client, groq_AsyncGroq):
469515
model_name = (response.model or "") if hasattr(response, "model") else ""
470-
prompt_tokens = response.usage.prompt_tokens if hasattr(response.usage, "prompt_tokens") and response.usage.prompt_tokens is not None else 0 # type: ignore[attr-defined]
471-
completion_tokens = response.usage.completion_tokens if hasattr(response.usage, "completion_tokens") and response.usage.completion_tokens is not None else 0 # type: ignore[attr-defined]
472-
message_content = response.choices[0].message.content if hasattr(response, "choices") else None # type: ignore[attr-defined]
516+
prompt_tokens = (
517+
response.usage.prompt_tokens
518+
if hasattr(response.usage, "prompt_tokens")
519+
and response.usage.prompt_tokens is not None
520+
else 0
521+
) # type: ignore[attr-defined]
522+
completion_tokens = (
523+
response.usage.completion_tokens
524+
if hasattr(response.usage, "completion_tokens")
525+
and response.usage.completion_tokens is not None
526+
else 0
527+
) # type: ignore[attr-defined]
528+
message_content = (
529+
response.choices[0].message.content
530+
if hasattr(response, "choices")
531+
else None
532+
) # type: ignore[attr-defined]
473533

474534
if model_name:
475535
return message_content, _create_usage(

src/judgeval/tracer/managers.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from contextlib import asynccontextmanager, contextmanager
44
from typing import TYPE_CHECKING, Dict, Optional
5+
from judgeval.tracer.keys import AttributeKeys
56

67
if TYPE_CHECKING:
78
from judgeval.tracer import Tracer
@@ -16,11 +17,24 @@ def sync_span_context(
1617
if span_attributes is None:
1718
span_attributes = {}
1819

19-
with tracer.get_tracer().start_as_current_span(
20-
name=name,
21-
attributes=span_attributes,
22-
) as span:
23-
yield span
20+
current_cost_context = tracer.get_current_cost_context()
21+
22+
cost_context = {"cumulative_cost": 0.0}
23+
24+
cost_token = current_cost_context.set(cost_context)
25+
26+
try:
27+
with tracer.get_tracer().start_as_current_span(
28+
name=name,
29+
attributes=span_attributes,
30+
) as span:
31+
# Set initial cumulative cost attribute
32+
span.set_attribute(AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST, 0.0)
33+
yield span
34+
finally:
35+
current_cost_context.reset(cost_token)
36+
child_cost = float(cost_context.get("cumulative_cost", 0.0))
37+
tracer.add_cost_to_current_context(child_cost)
2438

2539

2640
@asynccontextmanager
@@ -30,8 +44,20 @@ async def async_span_context(
3044
if span_attributes is None:
3145
span_attributes = {}
3246

33-
with tracer.get_tracer().start_as_current_span(
34-
name=name,
35-
attributes=span_attributes,
36-
) as span:
37-
yield span
47+
current_cost_context = tracer.get_current_cost_context()
48+
49+
cost_context = {"cumulative_cost": 0.0}
50+
51+
cost_token = current_cost_context.set(cost_context)
52+
53+
try:
54+
with tracer.get_tracer().start_as_current_span(
55+
name=name,
56+
attributes=span_attributes,
57+
) as span:
58+
span.set_attribute(AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST, 0.0)
59+
yield span
60+
finally:
61+
current_cost_context.reset(cost_token)
62+
child_cost = float(cost_context.get("cumulative_cost", 0.0))
63+
tracer.add_cost_to_current_context(child_cost)

0 commit comments

Comments
 (0)