xjpang
diff --git a/‎docs/backend/server_arguments.md
Lines changed: 1 addition & 1 deletion b/‎docs/backend/server_arguments.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/sglang/srt/configs/__init__.py
Lines changed: 8 additions & 0 deletions b/‎python/sglang/srt/configs/__init__.py
Lines changed: 8 additions & 0 deletions
diff --git a/‎python/sglang/srt/configs/model_config.py
Lines changed: 3 additions & 0 deletions b/‎python/sglang/srt/configs/model_config.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎python/sglang/srt/configs/step3_vl.py
Lines changed: 172 additions & 0 deletions b/‎python/sglang/srt/configs/step3_vl.py
Lines changed: 172 additions & 0 deletions
diff --git a/‎python/sglang/srt/conversation.py
Lines changed: 23 additions & 0 deletions b/‎python/sglang/srt/conversation.py
Lines changed: 23 additions & 0 deletions
diff --git a/‎python/sglang/srt/function_call/function_call_parser.py
Lines changed: 2 additions & 0 deletions b/‎python/sglang/srt/function_call/function_call_parser.py
Lines changed: 2 additions & 0 deletions
@@ -148,7 +148,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | `--file-storage-path` | The path of the file storage in backend. | sglang_storage |
 | `--enable-cache-report` | Return number of cached tokens in usage.prompt_tokens_details for each openai request. | False |
 | `--reasoning-parser` | Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}. | None |
-| `--tool-call-parser` | Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and 'kimi_k2'. | None |
+| `--tool-call-parser` | Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'. | None |
 
 ## Data parallelism
 
 
@@ -5,6 +5,11 @@
 from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.kimi_vl import KimiVLConfig
 from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.configs.step3_vl import (
+    Step3TextConfig,
+    Step3VisionEncoderConfig,
+    Step3VLConfig,
+)
 
 __all__ = [
     "ExaoneConfig",
@@ -14,4 +19,7 @@
     "MultiModalityConfig",
     "KimiVLConfig",
     "MoonViTConfig",
+    "Step3VLConfig",
+    "Step3TextConfig",
+    "Step3VisionEncoderConfig",
 ]
@@ -335,6 +335,8 @@ def get_total_num_kv_heads(self) -> int:
             "num_key_value_heads",
             # For ChatGLM:
             "multi_query_group_num",
+            # For Step3
+            "num_attention_groups",
         ]
         for attr in attributes:
             num_kv_heads = getattr(self.hf_text_config, attr, None)
@@ -644,6 +646,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
     "InternS1ForConditionalGeneration",
     "Phi4MMForCausalLM",
     "VILAForConditionalGeneration",
+    "Step3VLForConditionalGeneration",
 ]
 
 
 
@@ -0,0 +1,172 @@
+from typing import Any, Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Step3VisionEncoderConfig(PretrainedConfig):
+    """Configuration container for the Step3 vision encoder.
+
+    Every named constructor argument is stored unchanged as an instance
+    attribute of the same name (see the signature for the defaults). Any
+    additional keyword arguments are forwarded to
+    ``PretrainedConfig.__init__``.
+    """
+
+    model_type = "step3_vision_encoder"
+
+    def __init__(
+        self,
+        hidden_size=1792,
+        intermediate_size=3072,
+        output_hidden_size=4096,
+        num_hidden_layers=63,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=728,
+        patch_size=14,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        # The model-specific fields are assigned first; the base class is
+        # initialised last and only receives the leftover ``kwargs``.
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.output_hidden_size = output_hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        super().__init__(**kwargs)
+
+
+class Step3TextConfig(PretrainedConfig):
+    """Configuration container for the Step3 text model.
+
+    Every named constructor argument is stored unchanged as an instance
+    attribute of the same name (see the signature for the defaults). Any
+    additional keyword arguments are forwarded to
+    ``PretrainedConfig.__init__``.
+
+    The default ``moe_layers_enum`` is the consecutive integers 4 through 59.
+    ``max_seq_len`` and ``max_position_embedding`` are two separate fields
+    that both default to 65536.
+    """
+
+    model_type = "step3_text"
+    # NOTE(review): ``PretrainedConfig.__init__`` may assign an instance-level
+    # ``architectures`` taken from kwargs, which would shadow this class-level
+    # value on instances -- verify against the installed transformers version.
+    architectures = ["Step3TextForCausalLM"]
+
+    def __init__(
+        self,
+        hidden_size: int = 7168,
+        intermediate_size: int = 18432,
+        num_attention_heads: int = 64,
+        # ``num_attention_groups`` is the attribute name this change adds to
+        # the KV-head lookup list in model_config.py's get_total_num_kv_heads.
+        num_attention_groups: int = 1,
+        num_hidden_layers: int = 61,
+        max_seq_len: int = 65536,
+        vocab_size: int = 128815,
+        rms_norm_eps: float = 1e-5,
+        moe_intermediate_size: int = 5120,
+        moe_num_experts: int = 48,
+        moe_top_k: int = 3,
+        rope_theta: float = 500000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embedding: int = 65536,
+        share_expert_dim: int = 5120,
+        share_q_dim: int = 2048,
+        head_dim: int = 256,
+        norm_expert_weight: bool = False,
+        # NOTE(review): ``tuple[int]`` denotes a tuple of exactly one int, but
+        # the default below holds 56 values; ``tuple[int, ...]`` is presumably
+        # what was intended -- confirm before changing the annotation.
+        moe_layers_enum: tuple[int] = (
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            31,
+            32,
+            33,
+            34,
+            35,
+            36,
+            37,
+            38,
+            39,
+            40,
+            41,
+            42,
+            43,
+            44,
+            45,
+            46,
+            47,
+            48,
+            49,
+            50,
+            51,
+            52,
+            53,
+            54,
+            55,
+            56,
+            57,
+            58,
+            59,
+        ),
+        **kwargs,
+    ) -> None:
+        # The model-specific fields are assigned first; the base class is
+        # initialised last and only receives the leftover ``kwargs``.
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.max_position_embedding = max_position_embedding
+        self.share_expert_dim = share_expert_dim
+        self.share_q_dim = share_q_dim
+        self.head_dim = head_dim
+        self.norm_expert_weight = norm_expert_weight
+        self.moe_layers_enum = moe_layers_enum
+
+        super().__init__(**kwargs)
+
+
+class Step3VLConfig(PretrainedConfig):
+    """Top-level Step3 vision-language configuration.
+
+    Holds a ``Step3VisionEncoderConfig`` and a ``Step3TextConfig`` together
+    with a few projector / image-token settings.
+
+    Args:
+        vision_config: ``None`` builds a default ``Step3VisionEncoderConfig``;
+            a ``dict`` is expanded as keyword arguments to that class; any
+            other value is stored as given.
+        text_config: ``None`` builds a default ``Step3TextConfig``; a ``dict``
+            is expanded as keyword arguments to that class; any other value
+            is stored as given.
+        understand_projector_stride: Stored unchanged on the instance.
+        projector_bias: Stored unchanged on the instance.
+        image_token_id: Stored unchanged on the instance.
+        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
+    """
+
+    model_type = "step3_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
+        text_config: Optional[Union[dict, Step3TextConfig]] = None,
+        understand_projector_stride: int = 1,
+        projector_bias: bool = True,
+        image_token_id: int = 128001,
+        **kwargs,
+    ) -> None:
+        # Normalise the vision sub-config: default instance, dict -> instance,
+        # or pass an already-built object through untouched.
+        if vision_config is None:
+            vision_config = Step3VisionEncoderConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = Step3VisionEncoderConfig(**vision_config)
+        self.vision_config = vision_config
+
+        # Same normalisation for the text sub-config.
+        if text_config is None:
+            text_config = Step3TextConfig()
+        elif isinstance(text_config, dict):
+            text_config = Step3TextConfig(**text_config)
+        self.text_config = text_config
+
+        self.understand_projector_stride = understand_projector_stride
+        self.projector_bias = projector_bias
+        # The top-level hidden size mirrors the text model's hidden size.
+        self.hidden_size = text_config.hidden_size
+        self.image_token_id = image_token_id
+
+        super().__init__(**kwargs)
@@ -994,6 +994,23 @@ def generate_chat_conv(
     )
 )
 
+# Register the "step3-vl" conversation template. Both the turn separator and
+# the stop string are "<|EOT|>", and the assistant role string ends with an
+# opening "<think>\n".
+register_conv_template(
+    Conversation(
+        name="step3-vl",
+        system_message="<｜begin▁of▁sentence｜>You are a helpful assistant",
+        system_template="{system_message}\n",
+        roles=(
+            "<|BOT|>user\n",
+            "<|BOT|>assistant\n<think>\n",
+        ),
+        sep="<|EOT|>",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str="<|EOT|>",
+        image_token="<im_patch>",
+        # NOTE(review): ``add_bos`` is left disabled here; the system message
+        # above already starts with a begin-of-sentence marker -- confirm.
+        # add_bos=True,
+    )
+)
+
 
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
@@ -1103,3 +1120,9 @@ def match_vila(model_path: str):
 def match_mimo_vl(model_path: str):
     if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
         return "mimo-vl"
+
+
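+# NOTE(review): this matcher is commented out, so no function in this change
+# maps a "step3" model path to the "step3-vl" template; confirm that leaving
+# it unregistered is intentional.
+# @register_conv_template_matching_function
+# def match_step3(model_path: str):
+#     if re.search(r"step3", model_path, re.IGNORECASE):
+#         return "step3-vl"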
@@ -17,6 +17,7 @@
 from sglang.srt.function_call.pythonic_detector import PythonicDetector
 from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
 from sglang.srt.function_call.qwen25_detector import Qwen25Detector
+from sglang.srt.function_call.step3_detector import Step3Detector
 
 logger = logging.getLogger(__name__)
 
@@ -39,6 +40,7 @@ class FunctionCallParser:
         "kimi_k2": KimiK2Detector,
         "qwen3_coder": Qwen3CoderDetector,
         "glm45": Glm4MoeDetector,
+        "step3": Step3Detector,
     }
 
     def __init__(self, tools: List[Tool], tool_call_parser: str):
Original file line number	Diff line number	Diff line change
`@@ -335,6 +335,8 @@ def get_total_num_kv_heads(self) -> int:`
`335`	`335`	`"num_key_value_heads",`
`336`	`336`	`# For ChatGLM:`
`337`	`337`	`"multi_query_group_num",`
	`338`	`+ # For Step3`
	`339`	`+ "num_attention_groups",`
`338`	`340`	`]`
`339`	`341`	`for attr in attributes:`
`340`	`342`	`num_kv_heads = getattr(self.hf_text_config, attr, None)`
`@@ -644,6 +646,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal`
`644`	`646`	`"InternS1ForConditionalGeneration",`
`645`	`647`	`"Phi4MMForCausalLM",`
`646`	`648`	`"VILAForConditionalGeneration",`
	`649`	`+ "Step3VLForConditionalGeneration",`
`647`	`650`	`]`
`648`	`651`
`649`	`652`