Skip to content

Commit 51c3816

Browse files
CatherineSue, nnnobody-code, ispobock, Qiaolin-Yu, JustinTong0323
authored
model: support Step3V (sgl-project#8583)
Signed-off-by: Xinyuan Tong <justinning0323@outlook.com> Co-authored-by: nnnobody-code <nnnobody@foxmail.com> Co-authored-by: ispobock <ispobaoke@gmail.com> Co-authored-by: Qiaolin-Yu <qy254@cornell.edu> Co-authored-by: Qiaolin-Yu <liin1211@outlook.com> Co-authored-by: Xinyuan Tong <justinning0323@outlook.com> Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
1 parent 09f1a24 commit 51c3816

File tree

16 files changed

+2340
-23
lines changed

16 files changed

+2340
-23
lines changed

docs/backend/server_arguments.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
148148
| `--file-storage-path` | The path of the file storage in backend. | sglang_storage |
149149
| `--enable-cache-report` | Return number of cached tokens in usage.prompt_tokens_details for each openai request. | False |
150150
| `--reasoning-parser` | Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}. | None |
151-
| `--tool-call-parser` | Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and 'kimi_k2'. | None |
151+
| `--tool-call-parser` | Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'. | None |
152152

153153
## Data parallelism
154154

python/sglang/srt/configs/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
from sglang.srt.configs.janus_pro import MultiModalityConfig
66
from sglang.srt.configs.kimi_vl import KimiVLConfig
77
from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
8+
from sglang.srt.configs.step3_vl import (
9+
Step3TextConfig,
10+
Step3VisionEncoderConfig,
11+
Step3VLConfig,
12+
)
813

914
__all__ = [
1015
"ExaoneConfig",
@@ -14,4 +19,7 @@
1419
"MultiModalityConfig",
1520
"KimiVLConfig",
1621
"MoonViTConfig",
22+
"Step3VLConfig",
23+
"Step3TextConfig",
24+
"Step3VisionEncoderConfig",
1725
]

python/sglang/srt/configs/model_config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,8 @@ def get_total_num_kv_heads(self) -> int:
335335
"num_key_value_heads",
336336
# For ChatGLM:
337337
"multi_query_group_num",
338+
# For Step3
339+
"num_attention_groups",
338340
]
339341
for attr in attributes:
340342
num_kv_heads = getattr(self.hf_text_config, attr, None)
@@ -644,6 +646,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
644646
"InternS1ForConditionalGeneration",
645647
"Phi4MMForCausalLM",
646648
"VILAForConditionalGeneration",
649+
"Step3VLForConditionalGeneration",
647650
]
648651

649652

python/sglang/srt/configs/step3_vl.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
from typing import Any, Optional, Union
2+
3+
from transformers.configuration_utils import PretrainedConfig
4+
5+
6+
class Step3VisionEncoderConfig(PretrainedConfig):
    """Configuration holder for the Step3 vision encoder.

    Every named constructor argument is stored on the instance under the
    same attribute name; any extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "step3_vision_encoder"

    def __init__(
        self,
        hidden_size=1792,
        intermediate_size=3072,
        output_hidden_size=4096,
        num_hidden_layers=63,
        num_attention_heads=16,
        num_channels=3,
        image_size=728,
        patch_size=14,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        **kwargs,
    ):
        # Encoder hyperparameters, listed in the order they are assigned.
        encoder_fields = (
            ("hidden_size", hidden_size),
            ("intermediate_size", intermediate_size),
            ("output_hidden_size", output_hidden_size),
            ("num_hidden_layers", num_hidden_layers),
            ("num_attention_heads", num_attention_heads),
            ("num_channels", num_channels),
            ("patch_size", patch_size),
            ("image_size", image_size),
            ("layer_norm_eps", layer_norm_eps),
            ("hidden_act", hidden_act),
        )
        for attr_name, attr_value in encoder_fields:
            setattr(self, attr_name, attr_value)

        # The base-class initializer runs last and consumes the leftover kwargs.
        super().__init__(**kwargs)
34+
35+
36+
class Step3TextConfig(PretrainedConfig):
    """Configuration holder for the Step3 text (language) model.

    Every named constructor argument is stored on the instance under the
    same attribute name; any extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    ``moe_layers_enum`` is a variable-length tuple of layer indices and
    defaults to the consecutive indices 4 through 59.
    """

    model_type = "step3_text"
    architectures = ["Step3TextForCausalLM"]

    def __init__(
        self,
        hidden_size: int = 7168,
        intermediate_size: int = 18432,
        num_attention_heads: int = 64,
        num_attention_groups: int = 1,
        num_hidden_layers: int = 61,
        max_seq_len: int = 65536,
        vocab_size: int = 128815,
        rms_norm_eps: float = 1e-5,
        moe_intermediate_size: int = 5120,
        moe_num_experts: int = 48,
        moe_top_k: int = 3,
        rope_theta: float = 500000,
        rope_scaling: Optional[dict[str, Any]] = None,
        max_position_embedding: int = 65536,
        share_expert_dim: int = 5120,
        share_q_dim: int = 2048,
        head_dim: int = 256,
        norm_expert_weight: bool = False,
        # Same value as the literal tuple (4, 5, ..., 59); ``tuple[int, ...]``
        # is the annotation for a homogeneous tuple of arbitrary length.
        moe_layers_enum: tuple[int, ...] = tuple(range(4, 60)),
        **kwargs,
    ) -> None:
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_hidden_layers = num_hidden_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.max_position_embedding = max_position_embedding
        self.share_expert_dim = share_expert_dim
        self.share_q_dim = share_q_dim
        self.head_dim = head_dim
        self.norm_expert_weight = norm_expert_weight
        self.moe_layers_enum = moe_layers_enum

        # The base-class initializer runs last and consumes the leftover kwargs.
        super().__init__(**kwargs)
141+
142+
143+
class Step3VLConfig(PretrainedConfig):
    """Top-level Step3-VL configuration combining vision and text configs.

    ``vision_config`` and ``text_config`` may each be given as a config
    object, a plain ``dict`` of constructor keyword arguments, or ``None``
    (in which case the sub-config is built with its defaults).
    ``hidden_size`` is copied from the resolved text config.
    """

    model_type = "step3_vl"

    def __init__(
        self,
        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
        text_config: Optional[Union[dict, Step3TextConfig]] = None,
        understand_projector_stride: int = 1,
        projector_bias: bool = True,
        image_token_id: int = 128001,
        **kwargs,
    ) -> None:
        # ``None`` is treated as an empty kwargs dict, i.e. an all-defaults
        # sub-config; an already-built config object is used as-is.
        if vision_config is None or isinstance(vision_config, dict):
            vision_config = Step3VisionEncoderConfig(**(vision_config or {}))
        self.vision_config = vision_config

        if text_config is None or isinstance(text_config, dict):
            text_config = Step3TextConfig(**(text_config or {}))
        self.text_config = text_config

        self.understand_projector_stride = understand_projector_stride
        self.projector_bias = projector_bias
        # Mirror the text model's hidden size on the top-level config.
        self.hidden_size = text_config.hidden_size
        self.image_token_id = image_token_id

        # The base-class initializer runs last and consumes the leftover kwargs.
        super().__init__(**kwargs)

python/sglang/srt/conversation.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -994,6 +994,23 @@ def generate_chat_conv(
994994
)
995995
)
996996

997+
register_conv_template(
998+
Conversation(
999+
name="step3-vl",
1000+
system_message="<|begin▁of▁sentence|>You are a helpful assistant",
1001+
system_template="{system_message}\n",
1002+
roles=(
1003+
"<|BOT|>user\n",
1004+
"<|BOT|>assistant\n<think>\n",
1005+
),
1006+
sep="<|EOT|>",
1007+
sep_style=SeparatorStyle.NO_COLON_SINGLE,
1008+
stop_str="<|EOT|>",
1009+
image_token="<im_patch>",
1010+
# add_bos=True,
1011+
)
1012+
)
1013+
9971014

9981015
@register_conv_template_matching_function
9991016
def match_internvl(model_path: str):
@@ -1103,3 +1120,9 @@ def match_vila(model_path: str):
11031120
def match_mimo_vl(model_path: str):
11041121
if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
11051122
return "mimo-vl"
1123+
1124+
1125+
# @register_conv_template_matching_function
1126+
# def match_step3(model_path: str):
1127+
# if re.search(r"step3", model_path, re.IGNORECASE):
1128+
# return "step3-vl"

python/sglang/srt/function_call/function_call_parser.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from sglang.srt.function_call.pythonic_detector import PythonicDetector
1818
from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
1919
from sglang.srt.function_call.qwen25_detector import Qwen25Detector
20+
from sglang.srt.function_call.step3_detector import Step3Detector
2021

2122
logger = logging.getLogger(__name__)
2223

@@ -39,6 +40,7 @@ class FunctionCallParser:
3940
"kimi_k2": KimiK2Detector,
4041
"qwen3_coder": Qwen3CoderDetector,
4142
"glm45": Glm4MoeDetector,
43+
"step3": Step3Detector,
4244
}
4345

4446
def __init__(self, tools: List[Tool], tool_call_parser: str):

0 commit comments

Comments
 (0)