JudgmentLabs
diff --git a/‎pyproject.toml
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/e2etests/test_tracer.py
Lines changed: 1 addition & 0 deletions b/‎src/e2etests/test_tracer.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/judgeval/common/tracer/core.py
Lines changed: 171 additions & 1 deletion b/‎src/judgeval/common/tracer/core.py
Lines changed: 171 additions & 1 deletion
diff --git a/‎src/judgeval/common/tracer/trace_manager.py
Lines changed: 6 additions & 1 deletion b/‎src/judgeval/common/tracer/trace_manager.py
Lines changed: 6 additions & 1 deletion
diff --git a/‎src/judgeval/common/trainer/__init__.py
Lines changed: 5 additions & 0 deletions b/‎src/judgeval/common/trainer/__init__.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/judgeval/common/trainer/config.py
Lines changed: 125 additions & 0 deletions b/‎src/judgeval/common/trainer/config.py
Lines changed: 125 additions & 0 deletions
@@ -31,6 +31,7 @@ dependencies = [
     "langchain-core",
     "click<8.2.0",
     "typer>=0.9.0",
+    "fireworks-ai>=0.19.18",
 ]
 
 [project.urls]
 
@@ -90,6 +90,7 @@ def validate_trace_token_counts(
         "TOGETHER_API_CALL",
         "GOOGLE_API_CALL",
         "GROQ_API_CALL",
+        "FIREWORKS_TRAINABLE_MODEL_CALL",
     }
 
     for span in trace_spans:
 
@@ -815,6 +815,8 @@ def __init__(
         == "true",
         enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower()
         == "true",
+        show_trace_urls: bool = os.getenv("JUDGMENT_SHOW_TRACE_URLS", "true").lower()
+        == "true",
         # S3 configuration
         use_s3: bool = False,
         s3_bucket_name: Optional[str] = None,
@@ -859,6 +861,7 @@ def __init__(
             self.traces: List[Trace] = []
             self.enable_monitoring: bool = enable_monitoring
             self.enable_evaluations: bool = enable_evaluations
+            self.show_trace_urls: bool = show_trace_urls
             self.class_identifiers: Dict[
                 str, str
             ] = {}  # Dictionary to store class identifiers
@@ -1731,6 +1734,93 @@ def _cleanup_on_exit(self):
                     f"Error during background service shutdown: {e}"
                 )
 
+    def trace_to_message_history(
+        self, trace: Union[Trace, TraceClient]
+    ) -> List[Dict[str, str]]:
+        """
+        Rebuild a chat-style message list from the spans of a trace.
+
+        Spans are visited in ascending ``created_at`` order (a span without that
+        attribute sorts as 0). The ``messages`` entry of the first LLM span that
+        has a non-empty one seeds the history; after that, every LLM output is
+        appended as an ``assistant`` message and every user or tool output as a
+        ``user`` message. Spans of any other type contribute nothing.
+
+        Args:
+            trace: Trace or TraceClient instance to extract messages from
+
+        Returns:
+            List of message dictionaries with 'role' and 'content' keys
+
+        Raises:
+            ValueError: If ``trace`` is falsy
+        """
+        if not trace:
+            raise ValueError("No trace provided")
+
+        # A TraceClient is read directly; any other object is tolerated even
+        # when it carries no ``trace_spans`` attribute.
+        if isinstance(trace, TraceClient) or hasattr(trace, "trace_spans"):
+            all_spans = trace.trace_spans
+        else:
+            all_spans = []
+
+        ordered_spans = sorted(all_spans, key=lambda s: getattr(s, "created_at", 0))
+        history = []
+        prompt_seeded = False
+
+        for span in ordered_spans:
+            has_output = span.output is not None
+            kind = span.span_type
+
+            if kind == "llm":
+                # Only the first LLM span with a non-empty "messages" input
+                # contributes its prompt (typically system and user messages).
+                if not prompt_seeded and hasattr(span, "inputs") and span.inputs:
+                    prompt = span.inputs.get("messages", [])
+                    if prompt:
+                        prompt_seeded = True
+                        history.extend(
+                            {"role": entry["role"], "content": entry["content"]}
+                            for entry in prompt
+                            if isinstance(entry, dict)
+                            and "role" in entry
+                            and "content" in entry
+                        )
+
+                # The span output is the assistant's reply.
+                if has_output:
+                    history.append({"role": "assistant", "content": str(span.output)})
+
+            elif has_output and kind in ("user", "tool"):
+                # Tool responses are recorded as user messages, same as real
+                # user turns (common pattern in training).
+                history.append({"role": "user", "content": str(span.output)})
+
+        return history
+
+    def get_current_message_history(self) -> List[Dict[str, str]]:
+        """
+        Return the message history extracted from the current trace.
+
+        Looks up the trace via ``get_current_trace()`` and hands it to
+        ``trace_to_message_history``.
+
+        Returns:
+            List of message dictionaries from the current trace context
+
+        Raises:
+            ValueError: If ``get_current_trace()`` returns a falsy value
+        """
+        active_trace = self.get_current_trace()
+        if active_trace:
+            return self.trace_to_message_history(active_trace)
+
+        raise ValueError("No current trace found")
+
 
 def _get_current_trace(
     trace_across_async_contexts: bool = Tracer.trace_across_async_contexts,
@@ -1746,7 +1836,7 @@ def wrap(
 ) -> Any:
     """
     Wraps an API client to add tracing capabilities.
-    Supports OpenAI, Together, Anthropic, and Google GenAI clients.
+    Supports OpenAI, Together, Anthropic, Google GenAI clients, and TrainableModel.
     Patches both '.create' and Anthropic's '.stream' methods using a wrapper class.
     """
     (
@@ -1871,6 +1961,39 @@ async def wrapper(*args, **kwargs):
             setattr(client.chat.completions, "create", wrapped(original_create))
         elif isinstance(client, (groq_AsyncGroq)):
             setattr(client.chat.completions, "create", wrapped_async(original_create))
+
+    # Check for TrainableModel from judgeval.common.trainer
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            # Define a wrapper function that can be reapplied to new model instances
+            def wrap_model_instance(model_instance):
+                """Wrap a model instance with tracing functionality"""
+                if hasattr(model_instance, "chat") and hasattr(
+                    model_instance.chat, "completions"
+                ):
+                    if hasattr(model_instance.chat.completions, "create"):
+                        setattr(
+                            model_instance.chat.completions,
+                            "create",
+                            wrapped(model_instance.chat.completions.create),
+                        )
+                    if hasattr(model_instance.chat.completions, "acreate"):
+                        setattr(
+                            model_instance.chat.completions,
+                            "acreate",
+                            wrapped_async(model_instance.chat.completions.acreate),
+                        )
+
+            # Register the wrapper function with the TrainableModel
+            client._register_tracer_wrapper(wrap_model_instance)
+
+            # Apply wrapping to the current model
+            wrap_model_instance(client._current_model)
+    except ImportError:
+        pass  # TrainableModel not available
+
     return client
 
 
@@ -1977,6 +2100,22 @@ def _get_client_config(
             return "GROQ_API_CALL", client.chat.completions.create, None, None, None
         elif isinstance(client, (groq_AsyncGroq)):
             return "GROQ_API_CALL", client.chat.completions.create, None, None, None
+
+    # Check for TrainableModel
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            return (
+                "FIREWORKS_TRAINABLE_MODEL_CALL",
+                client._current_model.chat.completions.create,
+                None,
+                None,
+                None,
+            )
+    except ImportError:
+        pass  # TrainableModel not available
+
     raise ValueError(f"Unsupported client type: {type(client)}")
 
 
@@ -2155,6 +2294,37 @@ def _format_output_data(
                 cache_creation_input_tokens,
             )
 
+    # Check for TrainableModel
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            # TrainableModel uses Fireworks LLM internally, so response format should be similar to OpenAI
+            if (
+                hasattr(response, "model")
+                and hasattr(response, "usage")
+                and hasattr(response, "choices")
+            ):
+                model_name = response.model
+                prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.completion_tokens if response.usage else 0
+                )
+                message_content = response.choices[0].message.content
+
+                # Use LiteLLM cost calculation with fireworks_ai prefix
+                # LiteLLM supports Fireworks AI models for cost calculation when prefixed with "fireworks_ai/"
+                fireworks_model_name = f"fireworks_ai/{model_name}"
+                return message_content, _create_usage(
+                    fireworks_model_name,
+                    prompt_tokens,
+                    completion_tokens,
+                    cache_read_input_tokens,
+                    cache_creation_input_tokens,
+                )
+    except ImportError:
+        pass  # TrainableModel not available
+
     judgeval_logger.warning(f"Unsupported client type: {type(client)}")
     return None, None
 
 
@@ -71,7 +71,12 @@ def upsert_trace(
 
         server_response = self.api_client.upsert_trace(trace_data)
 
-        if not offline_mode and show_link and "ui_results_url" in server_response:
+        if (
+            not offline_mode
+            and show_link
+            and "ui_results_url" in server_response
+            and self.tracer.show_trace_urls
+        ):
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={server_response['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
 
 
@@ -0,0 +1,5 @@
+from .trainer import JudgmentTrainer
+from .config import TrainerConfig, ModelConfig
+from .trainable_model import TrainableModel
+
+__all__ = ["JudgmentTrainer", "TrainerConfig", "ModelConfig", "TrainableModel"]
@@ -0,0 +1,125 @@
+from dataclasses import dataclass
+from typing import Optional, Dict, Any
+import json
+
+
+@dataclass
+class TrainerConfig:
+    """Parameter bundle for JudgmentTrainer.
+
+    The three identifier fields have no defaults and must be supplied by the
+    caller; every other field falls back to the default declared here.
+    """
+
+    # Required (no defaults)
+    deployment_id: str
+    user_id: str
+    model_id: str
+
+    # Optional, with defaults
+    base_model_name: str = "qwen2p5-7b-instruct"
+    rft_provider: str = "fireworks"
+    num_steps: int = 5
+    # Number of rollouts/generations per input prompt
+    num_generations_per_prompt: int = 5
+    # Number of input prompts to sample per training step
+    num_prompts_per_step: int = 4
+    concurrency: int = 100
+    epochs: int = 1
+    learning_rate: float = 1e-5
+    accelerator_count: int = 1
+    accelerator_type: str = "NVIDIA_A100_80GB"
+    temperature: float = 1.5
+    max_tokens: int = 50
+    enable_addons: bool = True
+
+
+@dataclass
+class ModelConfig:
+    """
+    Configuration class for storing and loading trained model state.
+
+    This class enables persistence of trained models so they can be loaded
+    and used later without retraining. It round-trips through a plain dict,
+    a JSON string, or a UTF-8 encoded JSON file.
+
+    Example usage:
+        trainer = JudgmentTrainer(config)
+        model_config = trainer.train(agent_function, scorers, prompts)
+
+        # Save the trained model configuration
+        model_config.save_to_file("my_trained_model.json")
+
+        # Later, load and use the trained model
+        loaded_config = ModelConfig.load_from_file("my_trained_model.json")
+        trained_model = TrainableModel.from_model_config(loaded_config)
+
+        # Use the trained model for inference
+        response = trained_model.chat.completions.create(
+            model="current",  # Uses the loaded trained model
+            messages=[{"role": "user", "content": "Hello!"}]
+        )
+    """
+
+    # Base model configuration
+    base_model_name: str
+    deployment_id: str
+    user_id: str
+    model_id: str
+    enable_addons: bool
+
+    # Training state
+    current_step: int
+    total_steps: int
+
+    # Current model information
+    current_model_name: Optional[str] = None
+    is_trained: bool = False
+
+    # Training parameters used (for reference)
+    training_params: Optional[Dict[str, Any]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert ModelConfig to dictionary for serialization.
+
+        Returns:
+            A new dict with one entry per field. ``training_params`` is the
+            same object held by the instance, not a copy.
+        """
+        return {
+            "base_model_name": self.base_model_name,
+            "deployment_id": self.deployment_id,
+            "user_id": self.user_id,
+            "model_id": self.model_id,
+            "enable_addons": self.enable_addons,
+            "current_step": self.current_step,
+            "total_steps": self.total_steps,
+            "current_model_name": self.current_model_name,
+            "is_trained": self.is_trained,
+            "training_params": self.training_params,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ModelConfig":
+        """Create ModelConfig from dictionary.
+
+        Args:
+            data: Mapping of field names to values. Missing keys fall back to
+                the defaults spelled out in the body below; unknown keys are
+                ignored.
+
+        Returns:
+            A new ModelConfig populated from ``data``.
+        """
+        return cls(
+            base_model_name=data.get("base_model_name", "qwen2p5-7b-instruct"),
+            deployment_id=data.get("deployment_id", "my-base-deployment"),
+            user_id=data.get("user_id", ""),
+            model_id=data.get("model_id", ""),
+            enable_addons=data.get("enable_addons", True),
+            current_step=data.get("current_step", 0),
+            total_steps=data.get("total_steps", 0),
+            current_model_name=data.get("current_model_name"),
+            is_trained=data.get("is_trained", False),
+            training_params=data.get("training_params"),
+        )
+
+    def to_json(self) -> str:
+        """Convert ModelConfig to a JSON string indented by two spaces."""
+        return json.dumps(self.to_dict(), indent=2)
+
+    @classmethod
+    def from_json(cls, json_str: str) -> "ModelConfig":
+        """Create ModelConfig from JSON string.
+
+        Args:
+            json_str: JSON document that decodes to an object; it is passed
+                to ``from_dict`` after parsing.
+
+        Raises:
+            json.JSONDecodeError: If ``json_str`` is not valid JSON.
+        """
+        return cls.from_dict(json.loads(json_str))
+
+    def save_to_file(self, filepath: str) -> None:
+        """Save ModelConfig to a JSON file.
+
+        Args:
+            filepath: Destination path; an existing file is overwritten.
+        """
+        # Pin the encoding so the file does not depend on the platform locale.
+        with open(filepath, "w", encoding="utf-8") as f:
+            f.write(self.to_json())
+
+    @classmethod
+    def load_from_file(cls, filepath: str) -> "ModelConfig":
+        """Load ModelConfig from a JSON file.
+
+        Args:
+            filepath: Path of a UTF-8 encoded JSON file.
+
+        Returns:
+            The ModelConfig described by the file contents.
+        """
+        # JSON files are UTF-8 by convention; reading with the locale default
+        # can fail on non-ASCII content on platforms with another default.
+        with open(filepath, "r", encoding="utf-8") as f:
+            json_str = f.read()
+        return cls.from_json(json_str)
Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,7 @@ dependencies = [`
`31`	`31`	`"langchain-core",`
`32`	`32`	`"click<8.2.0",`
`33`	`33`	`"typer>=0.9.0",`
	`34`	`+ "fireworks-ai>=0.19.18",`
`34`	`35`	`]`
`35`	`36`
`36`	`37`	`[project.urls]`
Original file line number	Diff line number	Diff line change
`@@ -90,6 +90,7 @@ def validate_trace_token_counts(`
`90`	`90`	`"TOGETHER_API_CALL",`
`91`	`91`	`"GOOGLE_API_CALL",`
`92`	`92`	`"GROQ_API_CALL",`
	`93`	`+ "FIREWORKS_TRAINABLE_MODEL_CALL",`
`93`	`94`	`}`
`94`	`95`
`95`	`96`	`for span in trace_spans:`