
Commit eba8f41

[GCU] Update to the latest version
1 parent: ce1f353 · commit eba8f41

3 files changed: +9 −7 lines changed

fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py

Lines changed: 3 additions & 1 deletion

@@ -76,6 +76,8 @@ def __init__(
         kv_num_heads: int,
         num_heads: int,
         head_dim: int,
+        encoder_block_shape_q: int = -1,
+        decoder_block_shape_q: int = -1,
     ):
         """
         GCUFlashAttnBackend __init__
@@ -94,7 +96,7 @@ def __init__(
         self.head_dim = head_dim
         self.scaling = 1.0 / (self.head_dim**0.5)
         self.num_layers = fd_config.model_config.num_hidden_layers
-        self.position_ids_base = paddle.arange(self.max_seq_len)
+        self.position_ids_base = np.arange(self.max_seq_len)
 
         # TODO(zhengjun): Need to adapt the allocation logic and
         # temporarily allocate according to fixed size
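
Note on the change above: paddle.arange returns a paddle.Tensor allocated by the framework, while np.arange returns a host-side numpy.ndarray. A minimal standalone sketch of the difference follows (max_seq_len, the slicing, and the to_tensor conversion are illustrative assumptions, not code from the backend):

import numpy as np
import paddle

max_seq_len = 8  # illustrative value

# Before the change: a framework tensor on paddle's default place.
tensor_base = paddle.arange(max_seq_len)   # paddle.Tensor
# After the change: a plain host-side index array.
array_base = np.arange(max_seq_len)        # numpy.ndarray

# A numpy base can be sliced cheaply on the host and materialized as a
# tensor only when a kernel actually needs it.
seq_len = 5
position_ids = paddle.to_tensor(array_base[:seq_len], dtype="int64")
print(type(tensor_base).__name__, type(array_base).__name__, position_ids.shape)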

fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py

Lines changed: 3 additions & 1 deletion

@@ -74,6 +74,8 @@ def __init__(
         kv_num_heads: int,
         num_heads: int,
         head_dim: int,
+        encoder_block_shape_q: int = -1,
+        decoder_block_shape_q: int = -1,
     ):
         """
         GCUMemEfficientAttnBackend __init__
@@ -92,7 +94,7 @@ def __init__(
         self.head_dim = head_dim
         self.scaling = 1.0 / (self.head_dim**0.5)
         self.num_layers = fd_config.model_config.num_hidden_layers
-        self.position_ids_base = paddle.arange(self.max_seq_len)
+        self.position_ids_base = np.arange(self.max_seq_len)
 
         # TODO(zhengjun): Need to adapt the allocation logic and
         # temporarily allocate according to fixed size
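
Both GCU backends gain the same pair of defaulted parameters. A minimal sketch of the pattern follows (DummyAttnBackend and the call sites are hypothetical; only the parameter names and the scaling formula come from the diffs above). Trailing parameters with defaults keep pre-existing constructor calls working while newer callers can pass the block-shape hints:

class DummyAttnBackend:  # hypothetical stand-in for the GCU attention backends
    def __init__(
        self,
        kv_num_heads: int,
        num_heads: int,
        head_dim: int,
        encoder_block_shape_q: int = -1,  # new, defaulted
        decoder_block_shape_q: int = -1,  # new, defaulted
    ):
        self.kv_num_heads = kv_num_heads
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.scaling = 1.0 / (head_dim**0.5)
        self.encoder_block_shape_q = encoder_block_shape_q
        self.decoder_block_shape_q = decoder_block_shape_q

# Call sites that predate the new parameters keep working:
old_style = DummyAttnBackend(kv_num_heads=8, num_heads=32, head_dim=128)
# Newer call sites can pass the block-shape hints explicitly:
new_style = DummyAttnBackend(8, 32, 128, encoder_block_shape_q=64, decoder_block_shape_q=16)
print(old_style.scaling, new_style.encoder_block_shape_q)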

fastdeploy/worker/gcu_model_runner.py

Lines changed: 3 additions & 5 deletions

@@ -297,7 +297,7 @@ def get_attr_from_request(request, attr, default_value=None):
 
         if self.speculative_method in ["mtp"]:
             self.proposer.insert_prefill_inputs(req_dicts)
-        self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]
+        self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer
 
     def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decode_len: int):
         """Set dummy prefill inputs to share_inputs"""
@@ -675,7 +675,7 @@ def initialize_attn_backend(self) -> None:
         )
         self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
         self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
-        self.share_inputs["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32").pin_memory()
+        self.share_inputs["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32")
         self.share_inputs["max_len_tensor_cpu"] = paddle.full([8], 0, dtype="int32").cpu()
 
         # Get the attention backend
@@ -1062,9 +1062,7 @@ class at the server level, which is too granular for ModelRunner.
 
         self._update_chunked_prefill(model_forward_batch)
         self._add_cache(model_forward_batch)
-        self.seq_lens_this_time_buffer[:num_running_requests].copy_(
-            self.share_inputs["seq_lens_this_time"][:num_running_requests], False
-        )
+        self.seq_lens_this_time_buffer.copy_(self.share_inputs["seq_lens_this_time"], False)
         return None
 
     def _add_cache(self, model_forward_batch) -> None:
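
Note on the gcu_model_runner.py changes above: the buffer copy is now done on the whole tensor instead of a [:num_running_requests] slice, and the .pin_memory() call (which in paddle returns a copy of the tensor in page-locked host memory) is dropped, so decoder_num_blocks_cpu is left wherever paddle.full places it by default. A minimal sketch of the simplified in-place copy follows (the shapes and values are illustrative; only the copy_(..., False) call pattern comes from the diff):

import paddle

# Illustrative stand-ins for share_inputs["seq_lens_this_time"] and
# seq_lens_this_time_buffer.
seq_lens_this_time = paddle.to_tensor([3, 1, 7, 2], dtype="int32")
seq_lens_this_time_buffer = paddle.zeros([4], dtype="int32")

# In-place copy of the whole tensor; the second argument (False) asks for a
# non-blocking copy, matching the call in the diff above.
seq_lens_this_time_buffer.copy_(seq_lens_this_time, False)
print(seq_lens_this_time_buffer.numpy())  # [3 1 7 2]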
