PaddlePaddle · gzy19990617 · Aug 18, 2025 · Aug 18, 2025
diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
@@ -63,7 +63,7 @@ def process_loaded_weights(self, layer, weights) -> None:
     def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor:
 
         linear_out = paddle.matmul(x, layer.weight)
-        if layer.with_bias:
+        if layer.with_bias and layer.add_bias:
             linear_out = paddle.add(linear_out, layer.bias)
         return linear_out
 
@@ -650,7 +650,7 @@ def __init__(
         input_size: int = None,
         output_size: int = None,
         with_bias: bool = False,
-        add_bias: bool = False,
+        add_bias: bool = True,
         reduce_results: bool = True,
         skip_quant: bool = False,
     ):
@@ -717,6 +717,8 @@ def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
             out = self.quant_method.apply(self, x)
         else:
             out = paddle.matmul(x, self.weight)
+            if self.with_bias and self.add_bias:
+                out = paddle.add(out, self.bias)
 
         if self.reduce_results and self.nranks > 1:
             tensor_model_parallel_all_reduce(out)

diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -133,6 +133,6 @@ def apply(self, layer, x):
             (layer.weight, layer.weight_scale),
             linear_out,
         )
-        if layer.with_bias:
+        if layer.with_bias and layer.add_bias:
             linear_out = paddle.add(linear_out, layer.bias)
         return linear_out
diff --git a/fastdeploy/model_executor/layers/quantization/tensor_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/tensor_wise_fp8.py
@@ -131,7 +131,7 @@ def apply(self, layer, x):
             layer.weight,
             transpose_x=False,
             transpose_y=True,
-            bias=None,
+            bias=layer.bias if layer.add_bias and layer.with_bias else None,
             scale=self.total_scale,
             output_dtype="bfloat16",
             activation_type="identity",

diff --git a/fastdeploy/model_executor/layers/quantization/w4afp8.py b/fastdeploy/model_executor/layers/quantization/w4afp8.py
@@ -94,7 +94,7 @@ def apply(self, layer, x):
             layer.weight,
             layer.weight_scale,
             zero_points=None,
-            bias=layer.bias if layer.add_bias else None,
+            bias=layer.bias if layer.add_bias and layer.with_bias else None,
             out_scale=self.quant_config.weight_scale_dict.get(layer.prefix + ".weight_scale")
             / (
                 self.quant_config.act_scale_dict.get(layer.prefix + ".activation_scale")

diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py
@@ -210,7 +210,7 @@ def apply(self, layer, x):
             linear_out = weight_only_linear(
                 x,
                 weight=layer.weight,
-                bias=layer.bias if layer.add_bias else None,
+                bias=layer.bias if layer.add_bias and layer.with_bias else None,
                 weight_scale=layer.weight_scale,
                 weight_dtype=("int8" if self.quant_config.name() == "wint8" else "int4"),
                 arch=80,
@@ -219,7 +219,7 @@ def apply(self, layer, x):
             linear_out = weight_only_linear(
                 x,
                 weight=layer.weight,
-                bias=layer.bias if layer.add_bias else None,
+                bias=layer.bias if layer.add_bias and layer.with_bias else None,
                 weight_scale=layer.weight_scale,
                 weight_dtype=("int8" if self.quant_config.name() == "wint8" else "int4"),
                 arch=self.quant_config.weight_only_linear_arch,

diff --git a/fastdeploy/model_executor/layers/quantization/wfp8afp8.py b/fastdeploy/model_executor/layers/quantization/wfp8afp8.py
@@ -118,7 +118,7 @@ def apply(self, layer, x):
                 a_scales,
                 layer.weight_scale,
                 out_type,
-                layer.bias,
+                layer.bias if layer.add_bias and layer.with_bias else None,
             )
         else:
             raise NotImplementedError