Skip to content

Commit f9d6189

Browse files
authored
Merge pull request #73 from JudgmentLabs/trace-token-aggregations
add: count total tokens for trace runs
2 parents 5ae3f77 + dd4bbb4 commit f9d6189

File tree

2 files changed

+47
-11
lines changed

2 files changed

+47
-11
lines changed

src/e2etests/test_tracer.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -122,20 +122,38 @@ async def make_poem(input: str) -> str:
122122
print(f"Error generating poem: {e}")
123123
return ""
124124

125+
async def test_token_counting(trace_data: dict):
126+
"""Test that token counts are properly aggregated from different LLM API calls."""
127+
# Verify token counts exist and are properly aggregated
128+
token_counts = trace_data["token_counts"]
129+
assert token_counts["prompt_tokens"] > 0, "Prompt tokens should be counted"
130+
assert token_counts["completion_tokens"] > 0, "Completion tokens should be counted"
131+
assert token_counts["total_tokens"] > 0, "Total tokens should be counted"
132+
assert token_counts["total_tokens"] == (
133+
token_counts["prompt_tokens"] + token_counts["completion_tokens"]
134+
), "Total tokens should be equal to the sum of prompt and completion tokens"
135+
136+
# Print token counts for verification
137+
print("\nToken Count Results:")
138+
print(f"Prompt Tokens: {token_counts['prompt_tokens']}")
139+
print(f"Completion Tokens: {token_counts['completion_tokens']}")
140+
print(f"Total Tokens: {token_counts['total_tokens']}")
141+
125142
async def test_evaluation_mixed(input):
    """Run the mixed-model poem workflow inside a trace and verify its token counts.

    NOTE(review): relies on module-level `judgment`, `make_upper`, `make_poem`,
    and `answer_user_question` defined elsewhere in this file.
    """
    project = "TestingPoemBot"
    with judgment.trace("Use-claude-hehexd123", project_name=project, overwrite=True) as trace:
        uppercased = await make_upper(input)
        poem = await make_poem(uppercased)
        await answer_user_question("What if these shoes don't fit?")

        # Save trace data and test token counting
        _trace_id, trace_data = trace.save()
        await test_token_counting(trace_data)

        trace.print()
        return poem
137155

138156
if __name__ == "__main__":
    # Run both tests: the trace workflow invokes the token-count checks internally.
    asyncio.run(test_evaluation_mixed("Write a poem about Nissan R32 GTR"))

src/judgeval/common/tracer.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,24 @@ def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str,
383383
raw_entries = [entry.to_dict() for entry in self.entries]
384384
condensed_entries = self.condense_trace(raw_entries)
385385

386+
# Calculate total token counts from LLM API calls
387+
total_prompt_tokens = 0
388+
total_completion_tokens = 0
389+
total_tokens = 0
390+
391+
for entry in condensed_entries:
392+
if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
393+
usage = entry["output"].get("usage", {})
394+
# Handle OpenAI/Together format
395+
if "prompt_tokens" in usage:
396+
total_prompt_tokens += usage.get("prompt_tokens", 0)
397+
total_completion_tokens += usage.get("completion_tokens", 0)
398+
# Handle Anthropic format
399+
elif "input_tokens" in usage:
400+
total_prompt_tokens += usage.get("input_tokens", 0)
401+
total_completion_tokens += usage.get("output_tokens", 0)
402+
total_tokens += usage.get("total_tokens", 0)
403+
386404
# Create trace document
387405
trace_data = {
388406
"trace_id": self.trace_id,
@@ -392,10 +410,10 @@ def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str,
392410
"created_at": datetime.fromtimestamp(self.start_time).isoformat(),
393411
"duration": total_duration,
394412
"token_counts": {
395-
"prompt_tokens": 0, # Dummy value
396-
"completion_tokens": 0, # Dummy value
397-
"total_tokens": 0, # Dummy value
398-
}, # TODO: Add token counts
413+
"prompt_tokens": total_prompt_tokens,
414+
"completion_tokens": total_completion_tokens,
415+
"total_tokens": total_tokens,
416+
},
399417
"entries": condensed_entries,
400418
"empty_save": empty_save,
401419
"overwrite": overwrite

0 commit comments

Comments
 (0)