diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
index bb2e6944ea..6714270059 100644
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -530,7 +530,7 @@ paddle::Tensor FusedHadamardQuantFp8Func(
 int64_t init_custom_all_reduce(const std::vector<int64_t>& fake_ipc_ptrs,
                                paddle::Tensor& rank_data, int64_t rank,
                                bool full_nvlink);
-void all_reduce(int64_t _fa, paddle::Tensor& inp, paddle::Tensor& out,
+void all_reduce(paddle::Tensor& inp, paddle::Tensor& out, int64_t _fa,
                 int64_t reg_buffer, int64_t reg_buffer_sz_bytes);
 
 void dispose(int64_t _fa);
diff --git a/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cu b/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cu
index 7c6d4cec79..0de2127734 100644
--- a/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cu
+++ b/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cu
@@ -49,7 +49,7 @@ fptr_t init_custom_all_reduce(const std::vector<fptr_t>& fake_ipc_ptrs,
  * Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first
  * copied into _reg_buffer.
  */
-void all_reduce(fptr_t _fa, paddle::Tensor& inp, paddle::Tensor& out,
+void all_reduce(paddle::Tensor& inp, paddle::Tensor& out, fptr_t _fa,
                 fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) {
   auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
   auto stream = inp.stream();
@@ -163,3 +163,12 @@ fptr_t open_mem_handle(paddle::Tensor& mem_handle) {
 void free_shared_buffer(fptr_t buffer) {
   CUDACHECK(cudaFree(reinterpret_cast<void*>(buffer)));
 }
+
+
+PD_BUILD_STATIC_OP(all_reduce)
+    .Inputs({"inp",
+             "out"})
+    .Outputs({"new_out"})
+    .Attrs({"_fa: int64_t", "_reg_buffer: int64_t", "reg_buffer_sz_bytes: int64_t"})
+    .SetInplaceMap({{"out", "new_out"}})
+    .SetKernelFn(PD_KERNEL(all_reduce));
diff --git a/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py b/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py
index 4f98b29c44..9a38b728e8 100644
--- a/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py
+++ b/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py
@@ -158,9 +158,9 @@ def all_reduce(
         if out is None:
             out = paddle.empty_like(inp)
         if registered:
-            all_reduce(self._ptr, inp, out, 0, 0)
+            all_reduce(inp, out, self._ptr, 0, 0)
         else:
-            all_reduce(self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size)
+            all_reduce(inp, out, self._ptr, self.buffer_ptrs[self.rank], self.max_size)
         return out
 
     def start_capture(self):
diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
index 68a469e795..2cf961f21b 100644
--- a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
+++ b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
@@ -89,6 +89,9 @@ class MLAAttentionMetadata(AttentionMetadata):
     kv_signal_metadata: Optional[paddle.Tensor] = None
     kv_signal_data_list: List[Optional[paddle.Tensor]] = field(default_factory=list)
 
+    max_enc_len_this_time: Optional[paddle.Tensor] = None
+    max_dec_len_this_time: Optional[paddle.Tensor] = None
+
 
 class MLAAttentionBackend(AttentionBackend):
     """