
Commit eba8f41

[GCU] Update to the latest version
1 parent: ce1f353 · commit eba8f41

3 files changed: +9 −7 lines changed

fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py

Lines changed: 3 additions & 1 deletion

@@ -76,6 +76,8 @@ def __init__(
         kv_num_heads: int,
         num_heads: int,
         head_dim: int,
+        encoder_block_shape_q: int = -1,
+        decoder_block_shape_q: int = -1,
     ):
         """
         GCUFlashAttnBackend __init__
@@ -94,7 +96,7 @@ def __init__(
         self.head_dim = head_dim
         self.scaling = 1.0 / (self.head_dim**0.5)
         self.num_layers = fd_config.model_config.num_hidden_layers
-        self.position_ids_base = paddle.arange(self.max_seq_len)
+        self.position_ids_base = np.arange(self.max_seq_len)
 
         # TODO(zhengjun): Need to adapt the allocation logic and
         # temporarily allocate according to fixed size
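
Note on the change above: paddle.arange returns a paddle.Tensor allocated by the framework, while np.arange returns a host-side numpy.ndarray. A minimal standalone sketch of the difference follows (max_seq_len, the slicing, and the to_tensor conversion are illustrative assumptions, not code from the backend):

import numpy as np
import paddle

max_seq_len = 8  # illustrative value

# Before the change: a framework tensor on paddle's default place.
tensor_base = paddle.arange(max_seq_len)   # paddle.Tensor
# After the change: a plain host-side index array.
array_base = np.arange(max_seq_len)        # numpy.ndarray

# A numpy base can be sliced cheaply on the host and materialized as a
# tensor only when a kernel actually needs it.
seq_len = 5
position_ids = paddle.to_tensor(array_base[:seq_len], dtype="int64")
print(type(tensor_base).__name__, type(array_base).__name__, position_ids.shape)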

fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py

Lines changed: 3 additions & 1 deletion

@@ -74,6 +74,8 @@ def __init__(
         kv_num_heads: int,
         num_heads: int,
         head_dim: int,
+        encoder_block_shape_q: int = -1,
+        decoder_block_shape_q: int = -1,
     ):
         """
         GCUMemEfficientAttnBackend __init__
@@ -92,7 +94,7 @@ def __init__(
         self.head_dim = head_dim
         self.scaling = 1.0 / (self.head_dim**0.5)
         self.num_layers = fd_config.model_config.num_hidden_layers
-        self.position_ids_base = paddle.arange(self.max_seq_len)
+        self.position_ids_base = np.arange(self.max_seq_len)
 
         # TODO(zhengjun): Need to adapt the allocation logic and
         # temporarily allocate according to fixed size
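
Both GCU backends gain the same pair of defaulted parameters. A minimal sketch of the pattern follows (DummyAttnBackend and the call sites are hypothetical; only the parameter names and the scaling formula come from the diffs above). Trailing parameters with defaults keep pre-existing constructor calls working while newer callers can pass the block-shape hints:

class DummyAttnBackend:  # hypothetical stand-in for the GCU attention backends
    def __init__(
        self,
        kv_num_heads: int,
        num_heads: int,
        head_dim: int,
        encoder_block_shape_q: int = -1,  # new, defaulted
        decoder_block_shape_q: int = -1,  # new, defaulted
    ):
        self.kv_num_heads = kv_num_heads
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.scaling = 1.0 / (head_dim**0.5)
        self.encoder_block_shape_q = encoder_block_shape_q
        self.decoder_block_shape_q = decoder_block_shape_q

# Call sites that predate the new parameters keep working:
old_style = DummyAttnBackend(kv_num_heads=8, num_heads=32, head_dim=128)
# Newer call sites can pass the block-shape hints explicitly:
new_style = DummyAttnBackend(8, 32, 128, encoder_block_shape_q=64, decoder_block_shape_q=16)
print(old_style.scaling, new_style.encoder_block_shape_q)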

fastdeploy/worker/gcu_model_runner.py

Lines changed: 3 additions & 5 deletions

@@ -297,7 +297,7 @@ def get_attr_from_request(request, attr, default_value=None):
 
         if self.speculative_method in ["mtp"]:
             self.proposer.insert_prefill_inputs(req_dicts)
-        self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]
+        self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer
 
     def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decode_len: int):
         """Set dummy prefill inputs to share_inputs"""
@@ -675,7 +675,7 @@ def initialize_attn_backend(self) -> None:
         )
         self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
         self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
-        self.share_inputs["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32").pin_memory()
+        self.share_inputs["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32")
         self.share_inputs["max_len_tensor_cpu"] = paddle.full([8], 0, dtype="int32").cpu()
 
         # Get the attention backend
@@ -1062,9 +1062,7 @@ class at the server level, which is too granular for ModelRunner.
 
         self._update_chunked_prefill(model_forward_batch)
         self._add_cache(model_forward_batch)
-        self.seq_lens_this_time_buffer[:num_running_requests].copy_(
-            self.share_inputs["seq_lens_this_time"][:num_running_requests], False
-        )
+        self.seq_lens_this_time_buffer.copy_(self.share_inputs["seq_lens_this_time"], False)
         return None
 
     def _add_cache(self, model_forward_batch) -> None:
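
Note on the gcu_model_runner.py changes above: the buffer copy is now done on the whole tensor instead of a [:num_running_requests] slice, and the .pin_memory() call (which in paddle returns a copy of the tensor in page-locked host memory) is dropped, so decoder_num_blocks_cpu is left wherever paddle.full places it by default. A minimal sketch of the simplified in-place copy follows (the shapes and values are illustrative; only the copy_(..., False) call pattern comes from the diff):

import paddle

# Illustrative stand-ins for share_inputs["seq_lens_this_time"] and
# seq_lens_this_time_buffer.
seq_lens_this_time = paddle.to_tensor([3, 1, 7, 2], dtype="int32")
seq_lens_this_time_buffer = paddle.zeros([4], dtype="int32")

# In-place copy of the whole tensor; the second argument (False) asks for a
# non-blocking copy, matching the call in the diff above.
seq_lens_this_time_buffer.copy_(seq_lens_this_time, False)
print(seq_lens_this_time_buffer.numpy())  # [3 1 7 2]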
