Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions python/sglang/srt/layers/attention/flashinfer_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,11 @@ def __init__(
self.num_wrappers = 1
self.dispatch_reason = None

# Qwen2 models require higher flashinfer workspace size
if "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures:
# Qwen2/Qwen3 models require higher flashinfer workspace size
if (
"Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures
or "Qwen3ForCausalLM" in model_runner.model_config.hf_config.architectures
):
global_config.flashinfer_workspace_size = 512 * 1024 * 1024

# Allocate buffers
Expand Down
5 changes: 4 additions & 1 deletion python/sglang/srt/models/qwen2.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ def __init__(
config: Qwen2Config,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer,
) -> None:
super().__init__()
self.config = config
Expand All @@ -250,9 +251,11 @@ def __init__(
quant_config=quant_config,
prefix=add_prefix("embed_tokens", prefix),
)
# Use the provided decoder layer type or default to Qwen2DecoderLayer
decoder_layer_type = decoder_layer_type or Qwen2DecoderLayer
self.layers = make_layers(
config.num_hidden_layers,
lambda idx, prefix: Qwen2DecoderLayer(
lambda idx, prefix: decoder_layer_type(
layer_id=idx,
config=config,
quant_config=quant_config,
Expand Down
24 changes: 13 additions & 11 deletions python/sglang/srt/models/qwen2_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.utils import add_prefix
from sglang.srt.utils import add_prefix, make_layers

expert_distribution_recorder = ExpertDistributionRecorder()

Expand Down Expand Up @@ -334,6 +334,7 @@ def __init__(
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
decoder_layer_type: type[nn.Module] = Qwen2MoeDecoderLayer,
) -> None:
super().__init__()
self.padding_idx = config.pad_token_id
Expand All @@ -344,16 +345,17 @@ def __init__(
config.hidden_size,
prefix=add_prefix("embed_tokens", prefix),
)
self.layers = nn.ModuleList(
[
Qwen2MoeDecoderLayer(
config,
layer_id,
quant_config=quant_config,
prefix=add_prefix(f"layers.{layer_id}", prefix),
)
for layer_id in range(config.num_hidden_layers)
]
# Use the provided decoder layer type or default to Qwen2MoeDecoderLayer
decoder_layer_type = decoder_layer_type or Qwen2MoeDecoderLayer
self.layers = make_layers(
config.num_hidden_layers,
lambda idx, prefix: decoder_layer_type(
layer_id=idx,
config=config,
quant_config=quant_config,
prefix=prefix,
),
prefix=add_prefix("layers", prefix),
)
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

Expand Down
Loading
Loading