
Commit 270e373

support qwen3
1 parent 42af0b4 commit 270e373

15 files changed: +517 -135 lines changed

fastdeploy/config.py

Lines changed: 2 additions & 0 deletions
@@ -666,6 +666,7 @@ class LoadChoices(str, Enum):
     DEFAULT = "default"
     # only support qwen3-bf16 now
     DEFAULT_V1 = "default_v1"
+    INFLIGHT_QUANT = "inflight_quant"
 
 
 class LoadConfig:
@@ -685,6 +686,7 @@ def __init__(
         args,
     ):
         self.load_choices: Union[str, LoadChoices] = LoadChoices.DEFAULT.value
+        self.is_inflight_quant = False
         self.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1
         self.dynamic_load_weight: bool = False
         self.load_strategy: Optional[Literal["ipc", "ipc_snapshot"]] = None
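
A minimal sketch (not part of this commit) of how a loader selector might react to the new enum value and flag; pick_loader is an invented name, and the commit itself only adds the INFLIGHT_QUANT member and the is_inflight_quant default.

# Hypothetical sketch, assuming only the names added above.
from fastdeploy.config import LoadChoices, LoadConfig

def pick_loader(load_config: LoadConfig) -> str:
    if load_config.load_choices == LoadChoices.INFLIGHT_QUANT.value:
        # presumably: bf16 weights on disk, quantized while they are loaded
        load_config.is_inflight_quant = True
        return "inflight_quant"
    if load_config.load_choices == LoadChoices.DEFAULT_V1.value:
        return "default_v1"  # per the comment above, only qwen3-bf16 for now
    return "default"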

fastdeploy/model_executor/layers/linear.py

Lines changed: 99 additions & 39 deletions
@@ -23,8 +23,10 @@
 from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
 from fastdeploy.model_executor.models.utils import (
-    default_weight_loader,
+    default_load_weights_into_param,
+    default_weights_processor,
     set_weight_attrs,
+    slice_fn,
 )
 from fastdeploy.platforms import current_platform
 
@@ -37,21 +39,32 @@ class UnquantizedLinearMethod(QuantMethodBase):
     def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         """
         extra_weight_attrs is a dictionary that may include parameters like:
+        - split_axis: axis along which to split the tensor in a distributed environment
         - output_dim: determines whether the split is applied along the output dimension (rows) or input dimension (columns)
-        - weight_loader: a callable or method responsible for loading the weight data
+        - weights_processor: a callable or method responsible for processing weight data
+        - load_weights_into_param: loads the given weight tensor into the specified model parameter
         """
         layer.weight = layer.create_parameter(
             shape=layer.weight_shape,
             dtype=layer.weight_dtype,
             is_bias=False,
             default_initializer=paddle.nn.initializer.Constant(0),
         )
+        split_axis = extra_weight_attrs.get("split_axis")
+        if hasattr(layer, "nranks") and layer.nranks > 0:
+            _set_var_distributed(layer.weight, split_axis=split_axis)
         set_weight_attrs(
             layer.weight,
-            {"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config))},
+            {
+                **extra_weight_attrs,
+                "weights_processor": extra_weight_attrs.get(
+                    "weights_processor", default_weights_processor(layer.fd_config)
+                ),
+                "load_weights_into_param": extra_weight_attrs.get(
+                    "load_weights_into_param", default_load_weights_into_param()
+                ),
+            },
         )
-        if hasattr(layer, "nranks") and layer.nranks > 1:
-            set_weight_attrs(layer.weight, {"output_dim": extra_weight_attrs.get("output_dim")})
 
     def process_loaded_weights(self, layer, weights) -> None:
         # mlp.gate.weight is precision-sensitive, so we cast it to float32 for computation
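
The weights_processor and load_weights_into_param callables attached above end up as attributes on the weight parameter itself, so the checkpoint-loading side can read them back with getattr. A rough sketch of that consumer side, assuming only the attribute names from this hunk (the function and variable names are illustrative, not FastDeploy's actual loader):

# Illustrative only: driving the attributes set via set_weight_attrs.
def load_one_checkpoint_tensor(param, checkpoint_tensor, shard_id=None):
    weights_processor = getattr(param, "weights_processor")
    load_weights_into_param = getattr(param, "load_weights_into_param")
    # weights_processor shards/casts the raw tensor and, in this commit, is written
    # as a generator (see the yield in the weights_processor methods further down).
    for processed in weights_processor(param, checkpoint_tensor, shard_id):
        # load_weights_into_param picks the destination slice of a fused parameter
        # (e.g. gate/up or q/k/v) and copies the processed tensor into it.
        load_weights_into_param(param, processed, shard_id)
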
@@ -158,6 +171,7 @@ def __init__(
                 is_bias=True,
             )
 
+        self.is_quantized = fd_config.model_config.is_quantized
         # smooth quant
         self.linear_shift = None
         self.linear_smooth = None
@@ -270,9 +284,17 @@ def __init__(
         assert self.quant_method is not None
         self.quant_method.create_weights(
             self,
-            weight_loader=(
-                self.weight_loader if hasattr(self, "weight_loader") else default_weight_loader(self.fd_config)
+            weights_processor=(
+                self.weights_processor
+                if hasattr(self, "weights_processor")
+                else default_weights_processor(self.fd_config)
             ),
+            load_weights_into_param=(
+                self.load_weights_into_param
+                if hasattr(self, "load_weights_into_param")
+                else default_load_weights_into_param()
+            ),
+            inflight_quant=fd_config.quant_config and not skip_quant,
         )
 
 
@@ -327,17 +349,23 @@ def __init__(
         self.quant_method.create_weights(
             self,
             output_dim=True,
-            weight_loader=(
-                self.weight_loader if hasattr(self, "weight_loader") else default_weight_loader(self.fd_config)
+            weights_processor=(
+                self.weights_processor
+                if hasattr(self, "weights_processor")
+                else default_weights_processor(self.fd_config)
+            ),
+            load_weights_into_param=(
+                self.load_weights_into_param
+                if hasattr(self, "load_weights_into_param")
+                else default_load_weights_into_param()
             ),
+            inflight_quant=fd_config.quant_config and not skip_quant,
         )
+
         if self.nranks > 0:
-            _set_var_distributed(self.weight, split_axis=1)
             if self.with_bias:
                 # col parallel
                 _set_var_distributed(self.bias, split_axis=1)
-                if self.nranks > 1:
-                    set_weight_attrs(self.bias, {"output_dim": True})
 
 
 class MergedColumnParallelLinear(ColumnParallelLinear):
@@ -390,31 +418,33 @@ def __init__(
             skip_quant=skip_quant,
         )
 
-    def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
+    def load_weights_into_param(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
+        assert loaded_shard_id in ["gate", "up"]
+        output_dim = getattr(param, "output_dim", None)
+        if loaded_shard_id == "gate":
+            param = slice_fn(param, output_dim, start=0, end=self.output_size // 2)
+        elif loaded_shard_id == "up":
+            param = slice_fn(param, output_dim, start=self.output_size // 2, end=self.output_size)
+        assert param.shape == loaded_weight.shape, (
+            f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
+        )
+        param.copy_(loaded_weight, False)
+
+    def weights_processor(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
         # 1.fused gate_up in disk
         # 2.split gate up
         assert loaded_shard_id in ["gate", "up"]
         output_dim = getattr(param, "output_dim", None)
         # Tensor parallelism splits the weight along the output_dim
-        if output_dim is not None:
+        if output_dim is not None and self.nranks > 1:
             dim = -1
             size = loaded_weight.get_shape()[dim]
             block_size = size // self.nranks
             shard_offset = self.local_rank * block_size
             shard_size = (self.local_rank + 1) * block_size
-            loaded_weight = loaded_weight[..., shard_offset:shard_size]
-
+            loaded_weight = slice_fn(loaded_weight, output_dim, shard_offset, shard_size)
         loaded_weight = get_tensor(loaded_weight)
-
-        if loaded_shard_id == "gate":
-            param = param[:, : self.output_size // 2]
-        elif loaded_shard_id == "up":
-            param = param[:, self.output_size // 2 :]
-
-        assert param.shape == loaded_weight.shape, (
-            f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
-        )
-        param.copy_(loaded_weight, False)
+        yield loaded_weight
 
     def load_state_dict(self, state_dict: dict):
         """
@@ -484,33 +514,44 @@ def __init__(self, fd_config, prefix, with_bias=False, add_bias=True):
             add_bias=add_bias,
         )
 
-    def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
+    def weights_processor(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
         # 1.fused qkv in disk
         # 2.split q k v
         assert loaded_shard_id in ["q", "k", "v"]
         output_dim = getattr(param, "output_dim", None)
         # Tensor parallelism splits the weight along the output_dim
-        if output_dim is not None:
+        if output_dim is not None and self.nranks > 1:
             dim = -1
             size = loaded_weight.get_shape()[dim]
             block_size = size // self.nranks
             shard_offset = self.local_rank * block_size
             shard_size = (self.local_rank + 1) * block_size
-            loaded_weight = loaded_weight[..., shard_offset:shard_size]
+            loaded_weight = slice_fn(loaded_weight, output_dim, shard_offset, shard_size)
 
         loaded_weight = get_tensor(loaded_weight)
+        yield loaded_weight
 
+    def load_weights_into_param(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
+        assert loaded_shard_id in ["q", "k", "v"]
+        output_dim = getattr(param, "output_dim", None)
         if loaded_shard_id == "q":
-            param = param[:, : self.num_heads_per_rank * self.head_dim]
+            param = slice_fn(param, output_dim, 0, self.num_heads_per_rank * self.head_dim)
+
         elif loaded_shard_id == "k":
-            param = param[
-                :,
-                self.num_heads_per_rank
-                * self.head_dim : (self.num_heads_per_rank + self.kv_num_heads_per_rank)
-                * self.head_dim,
-            ]
+            param = slice_fn(
+                param,
+                output_dim,
+                self.num_heads_per_rank * self.head_dim,
+                (self.num_heads_per_rank + self.kv_num_heads_per_rank) * self.head_dim,
+            )
+
         elif loaded_shard_id == "v":
-            param = param[:, (self.num_heads_per_rank + self.kv_num_heads_per_rank) * self.head_dim :]
+            param = slice_fn(
+                param,
+                output_dim,
+                (self.num_heads_per_rank + self.kv_num_heads_per_rank) * self.head_dim,
+                (self.num_heads_per_rank + 2 * self.kv_num_heads_per_rank) * self.head_dim,
+            )
 
         assert param.shape == loaded_weight.shape, (
             f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
@@ -653,9 +694,17 @@ def __init__(
             self,
             split_axis=0,
             output_dim=False,
-            weight_loader=(
-                self.weight_loader if hasattr(self, "weight_loader") else default_weight_loader(self.fd_config)
+            weights_processor=(
+                self.weights_processor
+                if hasattr(self, "weights_processor")
+                else default_weights_processor(self.fd_config)
+            ),
+            load_weights_into_param=(
+                self.load_weights_into_param
+                if hasattr(self, "load_weights_into_param")
+                else default_load_weights_into_param()
             ),
+            inflight_quant=fd_config.quant_config and not skip_quant,
         )
         if self.nranks > 0:
             _set_var_distributed(self.weight, split_axis=0)
@@ -670,6 +719,17 @@ def __init__(
                 },
             )
 
+        if self.nranks > 0:
+            if self.with_bias:
+                # col parallel
+                _set_var_distributed(self.bias, split_axis=0)
+                set_weight_attrs(
+                    self.bias,
+                    {
+                        "output_dim": False,
+                    },
+                )
+
         self.reduce_results = reduce_results
 
     def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
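
RowParallelLinear marks its weight with split_axis=0 and output_dim=False, i.e. each rank keeps a block of input rows, which is why the partial matmul results must be summed afterwards (the tensor_model_parallel_all_reduce imported at the top, guarded by self.reduce_results). A small self-contained check of that identity with made-up shapes:

# Why row-parallel sharding needs a sum across ranks (numpy stands in for paddle).
import numpy as np

x = np.random.rand(3, 8)    # activations, hidden size 8
w = np.random.rand(8, 4)    # full weight laid out as [in_features, out_features]

nranks = 2
w_parts = np.split(w, nranks, axis=0)    # each rank owns a block of weight rows
x_parts = np.split(x, nranks, axis=1)    # and the matching slice of the activation

partials = [x_parts[r] @ w_parts[r] for r in range(nranks)]
assert np.allclose(sum(partials), x @ w)  # the all-reduce recovers the full product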

fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py

Lines changed: 2 additions & 0 deletions
@@ -185,9 +185,11 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         if current_platform.is_cuda():
             self.up_gate_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2]
             self.down_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size, layer.hidden_size]
+            extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
         else:
             self.up_gate_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size * 2, layer.hidden_size]
             self.down_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size]
+            extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
 
         layer.up_gate_proj_weight = layer.create_parameter(
             shape=self.up_gate_proj_weight_shape,
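
SHARD_ID_TO_SHARDED_DIM records, per shard id, which axis of an expert weight the tensor-parallel split runs along; the mapping differs because the CUDA and non-CUDA layouts above transpose the expert matrices. A hedged sketch of how a loader might consume it; shard_expert_weight is an invented helper, not the MoE backend's actual code:

# Illustrative only: pick the split axis for one expert's weight from the mapping
# ({"gate": 1, "down": 0, "up": 1} on CUDA), then keep this rank's block.
def shard_expert_weight(weight, shard_id, shard_dim_map, rank, nranks):
    dim = shard_dim_map[shard_id]              # axis to split for this shard id
    block = weight.shape[dim] // nranks        # equal block per tensor-parallel rank
    index = [slice(None)] * len(weight.shape)
    index[dim] = slice(rank * block, (rank + 1) * block)
    return weight[tuple(index)]                # works for e.g. a numpy array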
