From 4950687370992a59c00e1a2bd003005840f787dd Mon Sep 17 00:00:00 2001 From: lizexu123 <2694294196@qq.com> Date: Wed, 30 Jul 2025 11:24:21 +0000 Subject: [PATCH 01/22] rebuild_padding --- custom_ops/cpu_ops/get_padding_offset.cc | 5 +-- custom_ops/cpu_ops/rebuild_padding.cc | 36 +++++++++--------- custom_ops/gpu_ops/get_padding_offset.cu | 5 +-- custom_ops/gpu_ops/rebuild_padding.cu | 37 ++++++++++--------- .../speculate_get_padding_offset.cu | 6 +-- .../model_executor/pre_and_post_process.py | 3 -- fastdeploy/worker/gpu_model_runner.py | 13 +++---- 7 files changed, 46 insertions(+), 59 deletions(-) diff --git a/custom_ops/cpu_ops/get_padding_offset.cc b/custom_ops/cpu_ops/get_padding_offset.cc index 8fe73bc8e4..02ee71a263 100644 --- a/custom_ops/cpu_ops/get_padding_offset.cc +++ b/custom_ops/cpu_ops/get_padding_offset.cc @@ -84,7 +84,6 @@ std::vector GetPaddingOffset(const paddle::Tensor &input_ids, seq_length, bsz); return {x_remove_padding, - cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k}; @@ -97,7 +96,7 @@ std::vector> GetPaddingOffsetInferShape( const std::vector &seq_len_shape) { int64_t bsz = seq_len_shape[0]; int64_t seq_len = input_ids_shape[1]; - return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}}; + return {{-1}, {-1}, {bsz + 1}, {bsz + 1}}; } std::vector GetPaddingOffsetInferDtype( @@ -106,7 +105,6 @@ std::vector GetPaddingOffsetInferDtype( const paddle::DataType &token_num_dtype, const paddle::DataType &seq_len_dtype) { return {input_ids_dtype, - seq_len_dtype, seq_len_dtype, seq_len_dtype, seq_len_dtype}; @@ -115,7 +113,6 @@ std::vector GetPaddingOffsetInferDtype( PD_BUILD_STATIC_OP(get_padding_offset_cpu) .Inputs({"input_ids", "cum_offsets", "token_num", "seq_len"}) .Outputs({"x_remove_padding", - "cum_offsets_out", "padding_offset", "cu_seqlens_q", "cu_seqlens_k"}) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 8ce533d041..315a04f30b 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -22,7 +22,7 @@ template void RebuildPaddingCPUImpl(T *output_data, const T *input_data, - const int *cum_offsets_data, + const int *cu_seqlens_q, const int *seq_len_this_time_data, const int *seq_lens_decoder_data, const int *seq_lens_encoder_data, @@ -44,7 +44,7 @@ void RebuildPaddingCPUImpl(T *output_data, seq_id = seq_lens_encoder_data[bi] - 1; } const int ori_token_idx = - bi * max_input_length - cum_offsets_data[bi] + seq_id; + cu_seqlens_q[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; output_data[i] = input_data[src_offset]; @@ -54,7 +54,7 @@ void RebuildPaddingCPUImpl(T *output_data, template void RebuildAppendPaddingCPUImpl(T *output_data, const T *input_data, - const int *cum_offsets_data, + const int *cu_seqlens_q, const int *seq_len_this_time_data, const int *seq_lens_decoder_data, const int *seq_lens_encoder_data, @@ -76,7 +76,8 @@ void RebuildAppendPaddingCPUImpl(T *output_data, if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - int input_token_id = ori_token_id - cum_offsets_data[bi] + seq_id; + + int input_token_id = cu_seqlens_q[bi] + seq_id; int bias_idx = i % dim_embed; int src_offset = input_token_id * dim_embed + bias_idx; output_data[i] = input_data[src_offset]; @@ -85,14 +86,13 @@ void RebuildAppendPaddingCPUImpl(T *output_data, std::vector RebuildPaddingCPU( const paddle::Tensor &tmp_out, - const paddle::Tensor &cum_offsets, + const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &seq_len_this_time, const 
paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, const paddle::optional &output_padding_offset, int max_input_length) { auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true); - auto cum_offsets_cpu = cum_offsets.copy_to(paddle::CPUPlace(), true); auto seq_len_this_time_cpu = seq_len_this_time.copy_to(paddle::CPUPlace(), true); auto seq_lens_decoder_cpu = @@ -107,7 +107,7 @@ std::vector RebuildPaddingCPU( int token_num = tmp_out_cpu.shape()[0]; int dim_embed = tmp_out_cpu.shape()[1]; - int bsz = cum_offsets_cpu.shape()[0]; + int bsz = cu_seqlens_q.shape()[0] - 1; paddle::Tensor out; if (output_padding_offset_cpu) { @@ -128,7 +128,7 @@ std::vector RebuildPaddingCPU( {bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace()); } - const int *cum_offsets_data = cum_offsets_cpu.data(); + const int *cu_seqlens_q_data = cu_seqlens_q.data(); const int *seq_len_this_time_data = seq_len_this_time_cpu.data(); const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data(); const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data(); @@ -141,7 +141,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildAppendPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -154,7 +154,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -167,7 +167,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -186,7 +186,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -198,7 +198,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -211,7 +211,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -230,7 +230,7 @@ std::vector RebuildPaddingCPU( std::vector> RebuildPaddingInferShape( const std::vector &tmp_out_shape, - const std::vector &cum_offsets_shape, + const std::vector &cu_seqlens_q_shape, const std::vector &seq_len_this_time_shape, const std::vector &seq_lens_decoder_shape, const std::vector &seq_lens_encoder_shape, @@ -239,14 +239,14 @@ std::vector> RebuildPaddingInferShape( if (output_padding_offset_shape) { return {{-1, dim_embed}}; } else { - int64_t bsz = cum_offsets_shape[0]; + int64_t bsz = cu_seqlens_q_shape[0]-1; return {{bsz, dim_embed}}; } } std::vector RebuildPaddingInferDtype( const paddle::DataType &tmp_out_dtype, - const paddle::DataType &cum_offsets_dtype, + const paddle::DataType &cu_seqlens_q_dtype, const paddle::DataType &seq_len_this_time_dtype, const paddle::DataType &seq_lens_decoder_dtype, const paddle::DataType &seq_lens_encoder_dtype, @@ -256,7 +256,7 @@ std::vector RebuildPaddingInferDtype( PD_BUILD_STATIC_OP(rebuild_padding_cpu) .Inputs({"tmp_out", - "cum_offsets", + "cu_seqlens_q", "seq_len_this_time", "seq_lens_decoder", 
"seq_lens_encoder", diff --git a/custom_ops/gpu_ops/get_padding_offset.cu b/custom_ops/gpu_ops/get_padding_offset.cu index 8fae9b88c3..f505e1c326 100644 --- a/custom_ops/gpu_ops/get_padding_offset.cu +++ b/custom_ops/gpu_ops/get_padding_offset.cu @@ -101,7 +101,6 @@ std::vector GetPaddingOffset(const paddle::Tensor &input_ids, cum_offsets_out.data(), seq_length); return {x_remove_padding, - cum_offsets_out, batch_id_per_token, cu_seqlens_q, cu_seqlens_k}; // , enc_token_num, dec_token_num}; @@ -114,7 +113,7 @@ std::vector> GetPaddingOffsetInferShape( const std::vector &seq_len_shape) { int64_t bsz = seq_len_shape[0]; int64_t seq_len = input_ids_shape[1]; - return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}}; + return {{-1}, {-1}, {bsz + 1}, {bsz + 1}}; } std::vector GetPaddingOffsetInferDtype( @@ -123,7 +122,6 @@ std::vector GetPaddingOffsetInferDtype( const paddle::DataType &token_num_dtype, const paddle::DataType &seq_len_dtype) { return {input_ids_dtype, - seq_len_dtype, seq_len_dtype, seq_len_dtype, seq_len_dtype}; @@ -132,7 +130,6 @@ std::vector GetPaddingOffsetInferDtype( PD_BUILD_STATIC_OP(get_padding_offset) .Inputs({"input_ids", "token_num", "cum_offsets", "seq_len"}) .Outputs({"x_remove_padding", - "cum_offsets_out", "batch_id_per_token", "cu_seqlens_q", "cu_seqlens_k"}) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 3d69e9e459..38a1fc551c 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -17,7 +17,7 @@ template __global__ void RebuildPaddingKernel(T *output_data, const T *input_data, - const int *cum_offsets, + const int *cu_seqlens_q, const int *seq_len_this_time, const int *seq_len_decoder, const int *seq_len_encoder, @@ -36,8 +36,9 @@ __global__ void RebuildPaddingKernel(T *output_data, if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; // if encoder, get last token; just decoder, get first token. if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; + const int ori_token_idx = - bi * max_input_length - cum_offsets[bi] + seq_id; + cu_seqlens_q[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; Load(&input_data[src_offset], &src_vec); Store(src_vec, &output_data[i]); @@ -47,7 +48,7 @@ __global__ void RebuildPaddingKernel(T *output_data, template __global__ void RebuildAppendPaddingKernel(T *output_data, const T *input_data, - const int *cum_offset, + const int *cu_seqlens_q, const int *seq_len_this_time, const int *seq_len_decoder, const int *seq_len_encoder, @@ -69,7 +70,7 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, // if encoder, get last token; just decoder, get first token. 
if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; - const int input_token_id = ori_token_id - cum_offset[bi] + seq_id; + const int input_token_id = cu_seqlens_q[bi]+ seq_id; const int bias_idx = i % dim_embed; Load(&input_data[input_token_id * dim_embed + bias_idx], @@ -81,7 +82,7 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cum_offsets, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, @@ -100,7 +101,7 @@ std::vector rebuild_padding( std::vector tmp_out_shape = tmp_out.shape(); const int token_num = tmp_out_shape[0]; const int dim_embed = tmp_out_shape[1]; - const int bsz = cum_offsets.shape()[0]; + const int bsz = cu_seqlens_q.shape()[0] - 1; paddle::Tensor out; if (output_padding_offset) { @@ -133,7 +134,7 @@ std::vector rebuild_padding( <<>>( reinterpret_cast(out.data()), reinterpret_cast(tmp_out.data()), - cum_offsets.data(), + cu_seqlens_q.data(), seq_len_this_time.data(), seq_lens_decoder.data(), seq_lens_encoder.data(), @@ -147,7 +148,7 @@ std::vector rebuild_padding( reinterpret_cast(out.data()), reinterpret_cast( const_cast(tmp_out.data())), - cum_offsets.data(), + cu_seqlens_q.data(), seq_len_this_time.data(), seq_lens_decoder.data(), seq_lens_encoder.data(), @@ -160,7 +161,7 @@ std::vector rebuild_padding( paddle::Tensor RebuildPaddingFunc( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cum_offsets, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, @@ -170,7 +171,7 @@ paddle::Tensor RebuildPaddingFunc( case paddle::DataType::BFLOAT16: { return rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -180,7 +181,7 @@ paddle::Tensor RebuildPaddingFunc( case paddle::DataType::FLOAT16: { return rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -190,7 +191,7 @@ paddle::Tensor RebuildPaddingFunc( case paddle::DataType::FLOAT32: { return rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -208,14 +209,14 @@ paddle::Tensor RebuildPaddingFunc( std::vector RebuildPadding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cum_offsets, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, const paddle::optional &output_padding_offset, int max_input_length) { return {RebuildPaddingFunc(tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -225,7 +226,7 @@ std::vector RebuildPadding( std::vector> RebuildPaddingInferShape( const std::vector &tmp_out_shape, - const std::vector &cum_offsets_shape, + const std::vector &cu_seqlens_q_shape, const std::vector &seq_len_this_time_shape, const std::vector &seq_lens_decoder_shape, const std::vector &seq_lens_encoder_shape, @@ -235,14 +236,14 @@ std::vector> RebuildPaddingInferShape( if (output_padding_offset_shape) { return {{-1, dim_embed}}; } else { - int64_t bsz = cum_offsets_shape[0]; + int64_t bsz = cu_seqlens_q_shape[0] 
- 1; return {{bsz, dim_embed}}; } } std::vector RebuildPaddingInferDtype( const paddle::DataType &tmp_out_dtype, - const paddle::DataType &cum_offsets_dtype, + const paddle::DataType &cu_seqlens_q_dtype, const paddle::DataType &seq_len_this_time_dtype, const paddle::DataType &seq_lens_decoder_dtype, const paddle::DataType &seq_lens_encoder_dtype, @@ -252,7 +253,7 @@ std::vector RebuildPaddingInferDtype( PD_BUILD_STATIC_OP(rebuild_padding) .Inputs({"tmp_out", - "cum_offsets", + "cu_seqlens_q", "seq_len_this_time", "seq_lens_decoder", "seq_lens_encoder", diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu index 96186d761f..e37dacbf34 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu @@ -106,7 +106,6 @@ std::vector SpeculateGetPaddingOffset( seq_length, max_draft_tokens); return {x_remove_padding, - cum_offsets_out, batch_id_per_token, cu_seqlens_q, cu_seqlens_k}; // , enc_token_num, dec_token_num}; @@ -121,7 +120,7 @@ std::vector> SpeculateGetPaddingOffsetInferShape( const std::vector& seq_lens_encoder_shape) { int64_t bsz = seq_len_shape[0]; int64_t seq_len = input_ids_shape[1]; - return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}}; + return {{-1}, {-1}, {bsz + 1}, {bsz + 1}}; } std::vector SpeculateGetPaddingOffsetInferDtype( @@ -132,7 +131,6 @@ std::vector SpeculateGetPaddingOffsetInferDtype( const paddle::DataType& seq_len_dtype, const paddle::DataType& seq_lens_encoder_dtype) { return {input_ids_dtype, - seq_len_dtype, seq_len_dtype, seq_len_dtype, seq_len_dtype}; @@ -141,12 +139,10 @@ std::vector SpeculateGetPaddingOffsetInferDtype( PD_BUILD_STATIC_OP(speculate_get_padding_offset) .Inputs({"input_ids", "draft_tokens", - "cum_offsets", "token_num", "seq_len", "seq_lens_encoder"}) .Outputs({"x_remove_padding", - "cum_offsets_out", "batch_id_per_token", "cu_seqlens_q", "cu_seqlens_k"}) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 7d0a2aef78..190f442fbf 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -104,7 +104,6 @@ def pre_process( if speculative_decoding: ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, @@ -134,14 +133,12 @@ def pre_process( else: ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time) return ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index ca73961408..8b54bb3dcd 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -580,7 +580,6 @@ def _init_share_inputs(self, max_num_seqs: int): 0, dtype="int64", ) - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["batch_id_per_token"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -695,7 +694,6 @@ def _prepare_inputs(self) -> None: # Remove padding ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, @@ 
-711,7 +709,6 @@ def _prepare_inputs(self) -> None: ) self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -910,7 +907,7 @@ def _dummy_run( expected_decode_len=expected_decode_len, ) if self.speculative_method in ["mtp"]: - self.proposer.dummy_prefill_inputs( + self.proposer.dummy_prefipre_processll_inputs( num_tokens=num_tokens, batch_size=batch_size, expected_decode_len=expected_decode_len, @@ -940,7 +937,7 @@ def _dummy_run( hidden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], @@ -1199,9 +1196,11 @@ class at the server level, which is too granular for ModelRunner. ids_remove_padding=self.share_inputs["ids_remove_padding"], forward_meta=self.forward_meta, ) - hidden_states = rebuild_padding( + from fastdeploy.model_executor.ops.cpu import rebuild_padding_cpu + + hidden_states = rebuild_padding_cpu( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], From f1db5268f15b7236aa6a8226c43191946193bffe Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 14:13:12 +0800 Subject: [PATCH 02/22] update rebuild_padding --- custom_ops/cpu_ops/rebuild_padding.cc | 65 ++++++++++--------- .../model_executor/pre_and_post_process.py | 14 ++-- fastdeploy/worker/gcu_model_runner.py | 7 +- fastdeploy/worker/gpu_model_runner.py | 6 +- fastdeploy/worker/iluvatar_model_runner.py | 5 +- 5 files changed, 49 insertions(+), 48 deletions(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 315a04f30b..5f8e033f56 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ template void RebuildPaddingCPUImpl(T *output_data, const T *input_data, - const int *cu_seqlens_q, + const int *cu_seqlens_q_data, const int *seq_len_this_time_data, const int *seq_lens_decoder_data, const int *seq_lens_encoder_data, @@ -40,11 +40,13 @@ void RebuildPaddingCPUImpl(T *output_data, if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) { continue; } + + // if encoder, get last token; just decoder, get first token. 
if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - const int ori_token_idx = - cu_seqlens_q[bi] + seq_id; + + const int ori_token_idx = cu_seqlens_q_data[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; output_data[i] = input_data[src_offset]; @@ -54,7 +56,7 @@ void RebuildPaddingCPUImpl(T *output_data, template void RebuildAppendPaddingCPUImpl(T *output_data, const T *input_data, - const int *cu_seqlens_q, + const int *cu_seqlens_q_data, const int *seq_len_this_time_data, const int *seq_lens_decoder_data, const int *seq_lens_encoder_data, @@ -63,36 +65,42 @@ void RebuildAppendPaddingCPUImpl(T *output_data, const int dim_embed, const int64_t output_elem_nums) { for (int i = 0; i < output_elem_nums; ++i) { - int out_token_id = i / dim_embed; - int ori_token_id = + const int out_token_id = i / dim_embed; + const int ori_token_id = out_token_id + output_padding_offset_data[out_token_id]; - int bi = ori_token_id / max_input_length; - if (seq_len_this_time_data[bi] == 0 || - (seq_lens_decoder_data[bi] == 0 && - seq_lens_encoder_data[bi] == 0)) { + const int bi = ori_token_id / max_input_length; + int seq_id = 0; + + if (seq_len_this_time_data[bi] == 0) { continue; } - int seq_id = 0; + if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) { + continue; + } + + // if encoder, get last token; just decoder, get first token. if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - int input_token_id = cu_seqlens_q[bi] + seq_id; - int bias_idx = i % dim_embed; - int src_offset = input_token_id * dim_embed + bias_idx; + const int input_token_id = cu_seqlens_q_data[bi] + seq_id; + const int bias_idx = i % dim_embed; + const int src_offset = input_token_id * dim_embed + bias_idx; + output_data[i] = input_data[src_offset]; } } std::vector RebuildPaddingCPU( const paddle::Tensor &tmp_out, - const paddle::Tensor &cu_seqlens_q, + const paddle::Tensor &cu_seqlens_q, // 改名:从cum_offsets改为cu_seqlens_q const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, const paddle::optional &output_padding_offset, int max_input_length) { auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true); + auto cu_seqlens_q_cpu = cu_seqlens_q.copy_to(paddle::CPUPlace(), true); // 改名 auto seq_len_this_time_cpu = seq_len_this_time.copy_to(paddle::CPUPlace(), true); auto seq_lens_decoder_cpu = @@ -107,7 +115,7 @@ std::vector RebuildPaddingCPU( int token_num = tmp_out_cpu.shape()[0]; int dim_embed = tmp_out_cpu.shape()[1]; - int bsz = cu_seqlens_q.shape()[0] - 1; + int bsz = cu_seqlens_q_cpu.shape()[0] - 1; // 改为bsz+1的长度,所以bsz = shape[0] - 1 paddle::Tensor out; if (output_padding_offset_cpu) { @@ -128,7 +136,7 @@ std::vector RebuildPaddingCPU( {bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace()); } - const int *cu_seqlens_q_data = cu_seqlens_q.data(); + const int *cu_seqlens_q_data = cu_seqlens_q_cpu.data(); // 改名 const int *seq_len_this_time_data = seq_len_this_time_cpu.data(); const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data(); const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data(); @@ -141,7 +149,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildAppendPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -154,7 +162,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), 
tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -167,7 +175,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -186,7 +194,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -198,7 +206,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -207,11 +215,10 @@ std::vector RebuildPaddingCPU( elem_nums); break; case paddle::DataType::BFLOAT16: - RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -230,7 +237,7 @@ std::vector RebuildPaddingCPU( std::vector> RebuildPaddingInferShape( const std::vector &tmp_out_shape, - const std::vector &cu_seqlens_q_shape, + const std::vector &cu_seqlens_q_shape, // 改名:从cum_offsets_shape改为cu_seqlens_q_shape const std::vector &seq_len_this_time_shape, const std::vector &seq_lens_decoder_shape, const std::vector &seq_lens_encoder_shape, @@ -239,14 +246,14 @@ std::vector> RebuildPaddingInferShape( if (output_padding_offset_shape) { return {{-1, dim_embed}}; } else { - int64_t bsz = cu_seqlens_q_shape[0]-1; + int64_t bsz = cu_seqlens_q_shape[0] - 1; // 改为bsz+1的长度,所以bsz = shape[0] - 1 return {{bsz, dim_embed}}; } } std::vector RebuildPaddingInferDtype( const paddle::DataType &tmp_out_dtype, - const paddle::DataType &cu_seqlens_q_dtype, + const paddle::DataType &cu_seqlens_q_dtype, // 改名:从cum_offsets_dtype改为cu_seqlens_q_dtype const paddle::DataType &seq_len_this_time_dtype, const paddle::DataType &seq_lens_decoder_dtype, const paddle::DataType &seq_lens_encoder_dtype, @@ -256,7 +263,7 @@ std::vector RebuildPaddingInferDtype( PD_BUILD_STATIC_OP(rebuild_padding_cpu) .Inputs({"tmp_out", - "cu_seqlens_q", + "cu_seqlens_q", // 改名:从cum_offsets改为cu_seqlens_q "seq_len_this_time", "seq_lens_decoder", "seq_lens_encoder", diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 190f442fbf..28a4cab486 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -484,7 +484,7 @@ def step_cuda( def rebuild_padding( tmp_out: paddle.Tensor, - cum_offsets: paddle.Tensor, + cu_seqlens_q: paddle.Tensor, seq_len_this_time: paddle.Tensor, seq_lens_decoder: paddle.Tensor, seq_lens_encoder: paddle.Tensor, @@ -500,7 +500,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -512,7 +512,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -524,19 +524,19 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, output_padding_offset, max_input_length, ) - elif current_platform.is_gcu(): + # elif current_platform.is_gcu(): from 
fastdeploy.model_executor.ops.gcu import rebuild_padding hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -548,7 +548,7 @@ def rebuild_padding( hidden_states = rebuild_padding_cpu( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 63bf6e7b34..7fe06e0fb4 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -398,7 +398,7 @@ def _init_share_inputs(self, max_num_seqs: int): 0, dtype="int64", ) - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["batch_id_per_token"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -495,7 +495,6 @@ def _prepare_inputs(self) -> None: ) self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -698,7 +697,7 @@ def _dummy_run( hidden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], @@ -910,7 +909,7 @@ class at the server level, which is too granular for ModelRunner. hidden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 8b54bb3dcd..40331074f0 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -907,7 +907,7 @@ def _dummy_run( expected_decode_len=expected_decode_len, ) if self.speculative_method in ["mtp"]: - self.proposer.dummy_prefipre_processll_inputs( + self.proposer.dummy_prefill_inputs( num_tokens=num_tokens, batch_size=batch_size, expected_decode_len=expected_decode_len, @@ -1196,9 +1196,7 @@ class at the server level, which is too granular for ModelRunner. 
ids_remove_padding=self.share_inputs["ids_remove_padding"], forward_meta=self.forward_meta, ) - from fastdeploy.model_executor.ops.cpu import rebuild_padding_cpu - - hidden_states = rebuild_padding_cpu( + hidden_states = rebuild_padding( model_output, self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index f3ef3823c0..f4fcb2fc03 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -363,7 +363,6 @@ def _init_share_inputs(self, max_num_seqs: int): 0, dtype="int64", ) - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["padding_offset"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -468,7 +467,6 @@ def _prepare_inputs(self) -> None: ) self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["padding_offset"].copy_(padding_offset, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -531,7 +529,6 @@ def initialize_forward_meta(self): seq_lens_encoder=self.share_inputs["seq_lens_encoder"], seq_lens_decoder=self.share_inputs["seq_lens_decoder"], seq_lens_this_time=self.share_inputs["seq_lens_this_time"], - cum_offsets=self.share_inputs["cum_offsets"], padding_offset=self.share_inputs["padding_offset"], cu_seqlens_q=self.share_inputs["cu_seqlens_q"], cu_seqlens_k=self.share_inputs["cu_seqlens_k"], @@ -656,7 +653,7 @@ def _dummy_run( hiddden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], From 4f4a7b86509b7b52277f1c25128e463df4b5c33b Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 16:13:48 +0800 Subject: [PATCH 03/22] add rebuild_padding test --- test/layers/test_rebuild_padding.py | 212 ++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 test/layers/test_rebuild_padding.py diff --git a/test/layers/test_rebuild_padding.py b/test/layers/test_rebuild_padding.py new file mode 100644 index 0000000000..af3b670e34 --- /dev/null +++ b/test/layers/test_rebuild_padding.py @@ -0,0 +1,212 @@ +import time +import unittest +from typing import Tuple + +import numpy as np +import paddle + + +class TestCuSeqlensQPerformance(unittest.TestCase): + + def setUp(self): + paddle.device.set_device("gpu:0") + + # Test configurations:(batch_size, max_seq_len, dim_embed, avg_seq_len_ratio) + self.test_configs = [ + # Small scale tests + (4, 512, 2048, 0.8), + (8, 512, 4096, 0.7), + # Medium scale tests + (16, 1024, 4096, 0.6), + (32, 1024, 4096, 0.8), + # Large scale tests + (64, 2048, 4096, 0.5), + (128, 1024, 8192, 0.7), + (256, 512, 4096, 0.9), + (16, 4096, 4096, 0.6), + (32, 2048, 8192, 0.8), + ] + + self.warmup_runs = 10 + self.benchmark_runs = 50 + + def generate_realistic_test_data( + self, batch_size: int, max_seq_len: int, dim_embed: int, avg_ratio: float + ) -> dict: + """Generate test data closer to real-world scenarios""" + + avg_seq_len = int(max_seq_len * avg_ratio) + std_seq_len = avg_seq_len 
// 4 + + seq_lens = np.random.normal(avg_seq_len, std_seq_len, batch_size) + seq_lens = np.clip(seq_lens, max_seq_len // 10, max_seq_len).astype(np.int32) + + total_tokens = np.sum(seq_lens) + + tmp_out = paddle.randn([total_tokens, dim_embed], dtype=paddle.float16) + tmp_out = tmp_out.cuda() + + cu_seqlens_q_np = np.zeros(batch_size + 1, dtype=np.int32) + for i in range(batch_size): + cu_seqlens_q_np[i + 1] = cu_seqlens_q_np[i] + seq_lens[i] + + cu_seqlens_q = paddle.to_tensor(cu_seqlens_q_np, dtype=paddle.int32).cuda() + + seq_len_this_time = paddle.to_tensor(seq_lens, dtype=paddle.int32).cuda() + seq_len_decoder = paddle.to_tensor(seq_lens, dtype=paddle.int32).cuda() + seq_len_encoder = paddle.zeros([batch_size], dtype=paddle.int32).cuda() + + return { + "tmp_out": tmp_out, + "cu_seqlens_q": cu_seqlens_q, + "seq_len_this_time": seq_len_this_time, + "seq_len_decoder": seq_len_decoder, + "seq_len_encoder": seq_len_encoder, + "max_input_length": max_seq_len, + "actual_tokens": total_tokens, + "seq_lens": seq_lens, + } + + def benchmark_cu_seqlens_performance(self, data_dict: dict) -> Tuple[float, float, paddle.Tensor]: + """Test performance of cu_seqlens_q version""" + + def rebuild_padding_cu_seqlens( + tmp_out, cu_seqlens_q, seq_len_this_time, seq_len_decoder, seq_len_encoder, max_input_length + ): + + from fastdeploy.model_executor.pre_and_post_process import rebuild_padding + + hidden_states = rebuild_padding( + tmp_out, cu_seqlens_q, seq_len_this_time, seq_len_decoder, seq_len_encoder, None, max_input_length + ) + return hidden_states + + for _ in range(self.warmup_runs): + result = rebuild_padding_cu_seqlens( + data_dict["tmp_out"], + data_dict["cu_seqlens_q"], + data_dict["seq_len_this_time"], + data_dict["seq_len_decoder"], + data_dict["seq_len_encoder"], + data_dict["max_input_length"], + ) + paddle.device.cuda.synchronize() + + paddle.device.cuda.synchronize() + start_time = time.perf_counter() + + for _ in range(self.benchmark_runs): + result = rebuild_padding_cu_seqlens( + data_dict["tmp_out"], + data_dict["cu_seqlens_q"], + data_dict["seq_len_this_time"], + data_dict["seq_len_decoder"], + data_dict["seq_len_encoder"], + data_dict["max_input_length"], + ) + + paddle.device.cuda.synchronize() + end_time = time.perf_counter() + + avg_time = (end_time - start_time) / self.benchmark_runs * 1000 # ms + + # throughput(tokens/ms) + throughput = data_dict["actual_tokens"] / avg_time + + return avg_time, throughput, result + + def test_performance_scaling(self): + """Test performance unfer different scales""" + print("\n" + "=" * 90) + print("CU_SEQLENS_Q Performance Scaling Test") + print("=" * 90) + print( + f"{'Config':<20} {'Batch':<6} {'SeqLen':<7} {'Tokens':<8} {'Time(ms)':<10} {'Throughput':<12} {'Memory(MB)'}" + ) + print("-" * 90) + + results = [] + + for i, (batch_size, max_seq_len, dim_embed, avg_ratio) in enumerate(self.test_configs): + config_name = f"Config_{i+1}" + + try: + data_dict = self.generate_realistic_test_data(batch_size, max_seq_len, dim_embed, avg_ratio) + + paddle.device.cuda.empty_cache() + mem_before = paddle.device.cuda.memory_allocated() / 1024 / 1024 # MB + + avg_time, throughput, result = self.benchmark_cu_seqlens_performance(data_dict) + + mem_after = paddle.device.cuda.memory_allocated() / 1024 / 1024 # MB + mem_usage = mem_after - mem_before + + results.append( + { + "config": config_name, + "batch_size": batch_size, + "max_seq_len": max_seq_len, + "dim_embed": dim_embed, + "actual_tokens": data_dict["actual_tokens"], + "avg_time": avg_time, + 
"throughput": throughput, + "memory_mb": mem_usage, + "result_shape": result.shape, + } + ) + + print( + f"{config_name:<20} {batch_size:<6} {max_seq_len:<7} " + f"{data_dict['actual_tokens']:<8} {avg_time:<10.3f} " + f"{throughput:<12.1f} {mem_usage:<8.1f}" + ) + + expected_shape = [batch_size, dim_embed] + self.assertEqual(list(result.shape), expected_shape, f"Output shape mismatch for {config_name}") + + except Exception as e: + print( + f"{config_name:<20} {'ERROR':<6} {'ERROR':<7} {'ERROR':<8} " + f"{'ERROR':<10} {'ERROR':<12} {'ERROR':<8} - {str(e)[:30]}..." + ) + + print("-" * 90) + return results + + +def main(): + """Run all performance tests""" + print("Starting CU_SEQLENS_Q Performance Benchmark...") + print(f"GPU: {paddle.device.cuda.get_device_name()}") + print(f"GPU Memory: {paddle.device.cuda.get_device_properties().total_memory / 1024**3:.1f} GB") + + test_instance = TestCuSeqlensQPerformance() + test_instance.setUp() + + try: + scaling_results = test_instance.test_performance_scaling() + + print("\n" + "=" * 50) + print("Performance Summary") + print("=" * 50) + + if scaling_results: + best_throughput = max(scaling_results, key=lambda x: x["throughput"]) + print(f"Best throughput: {best_throughput['throughput']:.1f} tokens/ms") + print( + f" Config: {best_throughput['config']} " + f"(batch={best_throughput['batch_size']}, " + f"seq_len={best_throughput['max_seq_len']})" + ) + + print("=" * 50) + + except Exception as e: + print(f"Test failed with error: {e}") + import traceback + + traceback.print_exc() + + +if __name__ == "__main__": + main() From d62c1645eb6653a014c16e163d2d98ed79f138ea Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 16:23:34 +0800 Subject: [PATCH 04/22] fix --- test/layers/test_rebuild_padding.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/layers/test_rebuild_padding.py b/test/layers/test_rebuild_padding.py index af3b670e34..2860959e15 100644 --- a/test/layers/test_rebuild_padding.py +++ b/test/layers/test_rebuild_padding.py @@ -23,8 +23,6 @@ def setUp(self): (64, 2048, 4096, 0.5), (128, 1024, 8192, 0.7), (256, 512, 4096, 0.9), - (16, 4096, 4096, 0.6), - (32, 2048, 8192, 0.8), ] self.warmup_runs = 10 From dcb809d77895126ef8d6bd2fb1a83ebdf90761af Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 22:31:45 +0800 Subject: [PATCH 05/22] fix test_rebuild_padding.py --- test/layers/test_rebuild_padding.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/layers/test_rebuild_padding.py b/test/layers/test_rebuild_padding.py index 2860959e15..6dbbe27c5e 100644 --- a/test/layers/test_rebuild_padding.py +++ b/test/layers/test_rebuild_padding.py @@ -1,3 +1,19 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + import time import unittest from typing import Tuple From 8d6e60457d4765a1f5c2ba5c3b715d1d6c6e497c Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 22:38:27 +0800 Subject: [PATCH 06/22] Remove the Chinese comments and translate this sentence into English. --- custom_ops/cpu_ops/rebuild_padding.cc | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 5f8e033f56..cd77cbc840 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -93,14 +93,14 @@ void RebuildAppendPaddingCPUImpl(T *output_data, std::vector RebuildPaddingCPU( const paddle::Tensor &tmp_out, - const paddle::Tensor &cu_seqlens_q, // 改名:从cum_offsets改为cu_seqlens_q + const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, const paddle::optional &output_padding_offset, int max_input_length) { auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true); - auto cu_seqlens_q_cpu = cu_seqlens_q.copy_to(paddle::CPUPlace(), true); // 改名 + auto cu_seqlens_q_cpu = cu_seqlens_q.copy_to(paddle::CPUPlace(), true); auto seq_len_this_time_cpu = seq_len_this_time.copy_to(paddle::CPUPlace(), true); auto seq_lens_decoder_cpu = @@ -115,7 +115,7 @@ std::vector RebuildPaddingCPU( int token_num = tmp_out_cpu.shape()[0]; int dim_embed = tmp_out_cpu.shape()[1]; - int bsz = cu_seqlens_q_cpu.shape()[0] - 1; // 改为bsz+1的长度,所以bsz = shape[0] - 1 + int bsz = cu_seqlens_q_cpu.shape()[0] - 1; paddle::Tensor out; if (output_padding_offset_cpu) { @@ -136,7 +136,7 @@ std::vector RebuildPaddingCPU( {bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace()); } - const int *cu_seqlens_q_data = cu_seqlens_q_cpu.data(); // 改名 + const int *cu_seqlens_q_data = cu_seqlens_q_cpu.data(); const int *seq_len_this_time_data = seq_len_this_time_cpu.data(); const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data(); const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data(); @@ -149,7 +149,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildAppendPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -162,7 +162,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -175,7 +175,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -194,7 +194,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -206,7 +206,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -218,7 +218,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, 
seq_lens_encoder_data, @@ -237,7 +237,7 @@ std::vector RebuildPaddingCPU( std::vector> RebuildPaddingInferShape( const std::vector &tmp_out_shape, - const std::vector &cu_seqlens_q_shape, // 改名:从cum_offsets_shape改为cu_seqlens_q_shape + const std::vector &cu_seqlens_q_shape, const std::vector &seq_len_this_time_shape, const std::vector &seq_lens_decoder_shape, const std::vector &seq_lens_encoder_shape, @@ -246,14 +246,14 @@ std::vector> RebuildPaddingInferShape( if (output_padding_offset_shape) { return {{-1, dim_embed}}; } else { - int64_t bsz = cu_seqlens_q_shape[0] - 1; // 改为bsz+1的长度,所以bsz = shape[0] - 1 + int64_t bsz = cu_seqlens_q_shape[0] - 1; return {{bsz, dim_embed}}; } } std::vector RebuildPaddingInferDtype( const paddle::DataType &tmp_out_dtype, - const paddle::DataType &cu_seqlens_q_dtype, // 改名:从cum_offsets_dtype改为cu_seqlens_q_dtype + const paddle::DataType &cu_seqlens_q_dtype, const paddle::DataType &seq_len_this_time_dtype, const paddle::DataType &seq_lens_decoder_dtype, const paddle::DataType &seq_lens_encoder_dtype, @@ -263,7 +263,7 @@ std::vector RebuildPaddingInferDtype( PD_BUILD_STATIC_OP(rebuild_padding_cpu) .Inputs({"tmp_out", - "cu_seqlens_q", // 改名:从cum_offsets改为cu_seqlens_q + "cu_seqlens_q", "seq_len_this_time", "seq_lens_decoder", "seq_lens_encoder", From 9407c174bc2a0b760c08f077ca66ade4fb637bf1 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 23:03:01 +0800 Subject: [PATCH 07/22] fix comments: --- custom_ops/cpu_ops/rebuild_padding.cc | 24 ++++++++++++------------ custom_ops/gpu_ops/rebuild_padding.cu | 2 +- test/layers/test_rebuild_padding.py | 10 +++------- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index cd77cbc840..cc7f7c779c 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -65,27 +65,27 @@ void RebuildAppendPaddingCPUImpl(T *output_data, const int dim_embed, const int64_t output_elem_nums) { for (int i = 0; i < output_elem_nums; ++i) { - const int out_token_id = i / dim_embed; - const int ori_token_id = + int out_token_id = i / dim_embed; + int ori_token_id = out_token_id + output_padding_offset_data[out_token_id]; - const int bi = ori_token_id / max_input_length; + int bi = ori_token_id / max_input_length; int seq_id = 0; - if (seq_len_this_time_data[bi] == 0) { - continue; - } - if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) { - continue; - } + if (seq_len_this_time_data[bi] == 0 || + (seq_lens_decoder_data[bi] == 0 && + seq_lens_encoder_data[bi] == 0)) { + continue; + } + // if encoder, get last token; just decoder, get first token. 
if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - const int input_token_id = cu_seqlens_q_data[bi] + seq_id; - const int bias_idx = i % dim_embed; - const int src_offset = input_token_id * dim_embed + bias_idx; + int input_token_id = cu_seqlens_q_data[bi] + seq_id; + int bias_idx = i % dim_embed; + int src_offset = input_token_id * dim_embed + bias_idx; output_data[i] = input_data[src_offset]; } diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 38a1fc551c..7632136667 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -161,7 +161,7 @@ std::vector rebuild_padding( paddle::Tensor RebuildPaddingFunc( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cu_seqlens_q, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, diff --git a/test/layers/test_rebuild_padding.py b/test/layers/test_rebuild_padding.py index 6dbbe27c5e..a66bde915b 100644 --- a/test/layers/test_rebuild_padding.py +++ b/test/layers/test_rebuild_padding.py @@ -179,9 +179,8 @@ def test_performance_scaling(self): self.assertEqual(list(result.shape), expected_shape, f"Output shape mismatch for {config_name}") except Exception as e: - print( - f"{config_name:<20} {'ERROR':<6} {'ERROR':<7} {'ERROR':<8} " - f"{'ERROR':<10} {'ERROR':<12} {'ERROR':<8} - {str(e)[:30]}..." + raise RuntimeError( + f"Failed to test configuration {config_name} (batch={batch_size}, seq_len={max_seq_len}): {str(e)}" ) print("-" * 90) @@ -216,10 +215,7 @@ def main(): print("=" * 50) except Exception as e: - print(f"Test failed with error: {e}") - import traceback - - traceback.print_exc() + raise RuntimeError(f"Performance benchmark failed: {str(e)}") if __name__ == "__main__": From 97a73a771ced47b992da5b9256c4f3c7b10aa53f Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 23:04:45 +0800 Subject: [PATCH 08/22] fix rebuild_padding.cc --- custom_ops/cpu_ops/rebuild_padding.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index cc7f7c779c..3425072089 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -69,20 +69,16 @@ void RebuildAppendPaddingCPUImpl(T *output_data, int ori_token_id = out_token_id + output_padding_offset_data[out_token_id]; int bi = ori_token_id / max_input_length; - int seq_id = 0; - - if (seq_len_this_time_data[bi] == 0 || + if (seq_len_this_time_data[bi] == 0 || (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0)) { continue; } - - + int seq_id = 0; // if encoder, get last token; just decoder, get first token. 
if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - int input_token_id = cu_seqlens_q_data[bi] + seq_id; int bias_idx = i % dim_embed; int src_offset = input_token_id * dim_embed + bias_idx; From 556624481b8e73704746c22f5867572fd7297570 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 23:05:29 +0800 Subject: [PATCH 09/22] fix rebuild_padding.cc --- custom_ops/cpu_ops/rebuild_padding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 3425072089..19cce80500 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -75,7 +75,7 @@ void RebuildAppendPaddingCPUImpl(T *output_data, continue; } int seq_id = 0; - // if encoder, get last token; just decoder, get first token. + if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } From 767d1d133f46f1233cac4998ce7e252557dd90c6 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 23:06:20 +0800 Subject: [PATCH 10/22] fix comments --- custom_ops/cpu_ops/rebuild_padding.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 19cce80500..adbf95e5fd 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -41,7 +41,6 @@ void RebuildPaddingCPUImpl(T *output_data, continue; } - // if encoder, get last token; just decoder, get first token. if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } From ed2da59232aeb7ad108574381b10352f83b9f07a Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 5 Aug 2025 11:21:04 +0800 Subject: [PATCH 11/22] Adapt to MTP. 
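
Note: this commit (and the series as a whole) relies on a simple identity between
the old and new offsets. cum_offsets[bi] accumulates the padding stripped before
request bi, while cu_seqlens_q[bi] accumulates the real tokens kept before it, so
bi * max_input_length - cum_offsets[bi] equals cu_seqlens_q[bi], which is why the
kernels can index with cu_seqlens_q[bi] + seq_id directly. A minimal sketch of that
equivalence follows; the concrete seq_lens and max_input_length values are made up
for illustration and are not taken from the diffs:

    # Hypothetical per-request lengths and padded length, for illustration only.
    seq_lens = [3, 1, 5]
    max_input_length = 8

    # Exclusive prefix sums: cum_offsets over removed padding tokens,
    # cu_seqlens_q over kept (real) tokens.
    cum_offsets = [0]
    cu_seqlens_q = [0]
    for n in seq_lens:
        cum_offsets.append(cum_offsets[-1] + (max_input_length - n))
        cu_seqlens_q.append(cu_seqlens_q[-1] + n)

    for bi in range(len(seq_lens)):
        old_start = bi * max_input_length - cum_offsets[bi]  # indexing before this series
        new_start = cu_seqlens_q[bi]                         # indexing after this series
        assert old_start == new_start
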
--- fastdeploy/spec_decode/mtp.py | 5 +---- fastdeploy/worker/gpu_model_runner.py | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index 39f0fce427..9d74b83050 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -272,7 +272,6 @@ def _init_model_inputs(self): self.model_inputs["not_need_stop"] = paddle.to_tensor([False], dtype="bool", place="cpu") self.model_inputs["pre_ids"] = paddle.clone(self.main_model_inputs["pre_ids"]) self.model_inputs["ids_remove_padding"] = paddle.clone(self.main_model_inputs["ids_remove_padding"]) - self.model_inputs["cum_offsets"] = paddle.clone(self.main_model_inputs["cum_offsets"]) self.model_inputs["batch_id_per_token"] = paddle.clone(self.main_model_inputs["batch_id_per_token"]) self.model_inputs["cu_seqlens_q"] = paddle.clone(self.main_model_inputs["cu_seqlens_q"]) self.model_inputs["cu_seqlens_k"] = paddle.clone(self.main_model_inputs["cu_seqlens_k"]) @@ -524,7 +523,6 @@ def _propose(self, target_hidden_states): # Remove padding ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, @@ -540,7 +538,6 @@ def _propose(self, target_hidden_states): ) # Initialize forward meta data self.model_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.model_inputs["cum_offsets"].copy_(cum_offsets, False) self.model_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.model_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.model_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -575,7 +572,7 @@ def _propose(self, target_hidden_states): hidden_states = rebuild_padding( model_output, - self.model_inputs["cum_offsets"], + self.model_inputs["cu_seqlens_q"], self.model_inputs["seq_lens_this_time"], self.model_inputs["seq_lens_decoder"], self.model_inputs["seq_lens_encoder"], diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 416d5cf680..49e0f417d9 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1383,6 +1383,7 @@ class at the server level, which is too granular for ModelRunner. # 7. Updata 'infer_seed' and step_cuda() self.share_inputs["infer_seed"].add_(self.infer_seed_increment) self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED + if not envs.ENABLE_V1_KVCACHE_SCHEDULER: step_cuda( self.share_inputs, From 43fc43e03cbf9aca67cff7f299ea34847a06d20c Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 6 Aug 2025 20:20:31 +0800 Subject: [PATCH 12/22] fix mtp RebuildAppendPaddingKernel --- custom_ops/gpu_ops/rebuild_padding.cu | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 7632136667..a1d0b897dc 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -34,7 +34,6 @@ __global__ void RebuildPaddingKernel(T *output_data, int seq_id = 0; if (seq_len_this_time[bi] == 0) continue; if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; - // if encoder, get last token; just decoder, get first token. 
if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; const int ori_token_idx = @@ -55,22 +54,25 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, const int *output_padding_offset, const int max_input_length, const int dim_embed, - const int64_t output_elem_nums) { + const int64_t output_elem_nums, + const int bsz) { AlignedVector src_vec; const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; for (int64_t i = global_idx * VecSize; i < output_elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int out_token_id = i / dim_embed; - const int ori_token_id = - out_token_id + output_padding_offset[out_token_id]; + const int ori_token_id = out_token_id + output_padding_offset[out_token_id]; + const int bi = ori_token_id / max_input_length; + int seq_id = 0; if (seq_len_this_time[bi] == 0) continue; if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; - // if encoder, get last token; just decoder, get first token. + if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; - const int input_token_id = cu_seqlens_q[bi]+ seq_id; + const int cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi]; + const int input_token_id = ori_token_id - cum_offset_bi + seq_id; const int bias_idx = i % dim_embed; Load(&input_data[input_token_id * dim_embed + bias_idx], @@ -79,6 +81,8 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, } } + + template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] @@ -141,7 +145,8 @@ std::vector rebuild_padding( output_padding_offset.get_ptr()->data(), max_input_length, dim_embed, - elem_nums); + elem_nums, + bsz); } else { RebuildPaddingKernel <<>>( From 569f7beaeb96fd274e8d63b1f510878a7d161c3d Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Sun, 10 Aug 2025 17:31:35 +0800 Subject: [PATCH 13/22] gaoxingneng --- custom_ops/gpu_ops/rebuild_padding.cu | 44 +++++++++++++++++++-------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index a1d0b897dc..ce3b73ed52 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -14,6 +14,8 @@ #include "helper.h" // NOLINT +#define BLOCK_SIZE 128 + template __global__ void RebuildPaddingKernel(T *output_data, const T *input_data, @@ -27,17 +29,26 @@ __global__ void RebuildPaddingKernel(T *output_data, using LoadT = AlignedVector; LoadT src_vec; const int global_idx = blockDim.x * blockIdx.x + threadIdx.x; + + // Precompute the shared memory offset for the sequence lengths + __shared__ int shared_cu_seqlens_q[BLOCK_SIZE]; + + if (threadIdx.x < BLOCK_SIZE && threadIdx.x < blockDim.x) { + shared_cu_seqlens_q[threadIdx.x] = cu_seqlens_q[threadIdx.x]; + } + __syncthreads(); + for (int i = global_idx * VecSize; i < elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int bi = i / dim_embed; const int bias_idx = i % dim_embed; - int seq_id = 0; - if (seq_len_this_time[bi] == 0) continue; - if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; - if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; - const int ori_token_idx = - cu_seqlens_q[bi] + seq_id; + // Check if the sequence length is valid + if (seq_len_this_time[bi] == 0 || (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0)) continue; + + int seq_id = seq_len_encoder[bi] > 0 ? 
seq_len_encoder[bi] - 1 : 0; + + const int ori_token_idx = shared_cu_seqlens_q[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; Load(&input_data[src_offset], &src_vec); Store(src_vec, &output_data[i]); @@ -58,6 +69,14 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, const int bsz) { AlignedVector src_vec; const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; + + // Precompute shared memory for cu_seqlens_q + __shared__ int shared_cu_seqlens_q[BLOCK_SIZE]; + if (threadIdx.x < BLOCK_SIZE && threadIdx.x < blockDim.x) { + shared_cu_seqlens_q[threadIdx.x] = cu_seqlens_q[threadIdx.x]; + } + __syncthreads(); + for (int64_t i = global_idx * VecSize; i < output_elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int out_token_id = i / dim_embed; @@ -65,24 +84,23 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, const int bi = ori_token_id / max_input_length; - int seq_id = 0; - if (seq_len_this_time[bi] == 0) continue; - if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; + // Skip the invalid sequences + if (seq_len_this_time[bi] == 0 || (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0)) continue; - if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; + int seq_id = seq_len_encoder[bi] > 0 ? seq_len_encoder[bi] - 1 : 0; - const int cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi]; + const int cum_offset_bi = bi * max_input_length - shared_cu_seqlens_q[bi]; const int input_token_id = ori_token_id - cum_offset_bi + seq_id; const int bias_idx = i % dim_embed; - Load(&input_data[input_token_id * dim_embed + bias_idx], - &src_vec); + Load(&input_data[input_token_id * dim_embed + bias_idx], &src_vec); Store(src_vec, &output_data[i]); } } + template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] From c6cd8f78fb87d9bf6aa4240e30c8666beb9a623d Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Sun, 10 Aug 2025 17:34:33 +0800 Subject: [PATCH 14/22] pre-commit --- fastdeploy/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index ebaf94a8e9..bf2611aa8b 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -941,7 +941,6 @@ def initialize_kv_cache(self, profile: bool = False) -> None: cache_kvs_list.append(value_cache) self.share_inputs["caches"] = cache_kvs_list - else: for i in range(self.model_config.num_hidden_layers): cache_kvs[f"key_caches_{i}"] = paddle.full( From 2ccb4a7021d7cbfbaa1574459770b8c63c763c95 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 11 Aug 2025 14:08:43 +0800 Subject: [PATCH 15/22] merge develop --- custom_ops/gpu_ops/rebuild_padding.cu | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index ce3b73ed52..5fcf7e740a 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -29,26 +29,17 @@ __global__ void RebuildPaddingKernel(T *output_data, using LoadT = AlignedVector; LoadT src_vec; const int global_idx = blockDim.x * blockIdx.x + threadIdx.x; - - // Precompute the shared memory offset for the sequence lengths - __shared__ int shared_cu_seqlens_q[BLOCK_SIZE]; - - if (threadIdx.x < BLOCK_SIZE && threadIdx.x < blockDim.x) { - shared_cu_seqlens_q[threadIdx.x] = cu_seqlens_q[threadIdx.x]; - } - 
__syncthreads(); - for (int i = global_idx * VecSize; i < elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int bi = i / dim_embed; const int bias_idx = i % dim_embed; + int seq_id = 0; + if (seq_len_this_time[bi] == 0) continue; + if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; + if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; - // Check if the sequence length is valid - if (seq_len_this_time[bi] == 0 || (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0)) continue; - - int seq_id = seq_len_encoder[bi] > 0 ? seq_len_encoder[bi] - 1 : 0; - - const int ori_token_idx = shared_cu_seqlens_q[bi] + seq_id; + const int ori_token_idx = + cu_seqlens_q[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; Load(&input_data[src_offset], &src_vec); Store(src_vec, &output_data[i]); @@ -98,9 +89,6 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, } } - - - template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] From 586f8dfa87d5c6d3c1ad58c0ed00f89a82f3e454 Mon Sep 17 00:00:00 2001 From: lizexu123 <2694294196@qq.com> Date: Tue, 12 Aug 2025 11:48:32 +0000 Subject: [PATCH 16/22] fix mtp --- custom_ops/gpu_ops/rebuild_padding.cu | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 5fcf7e740a..74da1d89fc 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -60,14 +60,6 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, const int bsz) { AlignedVector src_vec; const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; - - // Precompute shared memory for cu_seqlens_q - __shared__ int shared_cu_seqlens_q[BLOCK_SIZE]; - if (threadIdx.x < BLOCK_SIZE && threadIdx.x < blockDim.x) { - shared_cu_seqlens_q[threadIdx.x] = cu_seqlens_q[threadIdx.x]; - } - __syncthreads(); - for (int64_t i = global_idx * VecSize; i < output_elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int out_token_id = i / dim_embed; @@ -75,20 +67,22 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, const int bi = ori_token_id / max_input_length; - // Skip the invalid sequences - if (seq_len_this_time[bi] == 0 || (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0)) continue; - - int seq_id = seq_len_encoder[bi] > 0 ? 
seq_len_encoder[bi] - 1 : 0; + int seq_id = 0; + if (seq_len_this_time[bi] == 0) continue; + if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; - const int cum_offset_bi = bi * max_input_length - shared_cu_seqlens_q[bi]; + if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; + const int cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi]; const int input_token_id = ori_token_id - cum_offset_bi + seq_id; const int bias_idx = i % dim_embed; - Load(&input_data[input_token_id * dim_embed + bias_idx], &src_vec); + Load(&input_data[input_token_id * dim_embed + bias_idx], + &src_vec); Store(src_vec, &output_data[i]); } } + template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] From 1ccf9f11cf96fcd305103a1ef71aa88da22eef6c Mon Sep 17 00:00:00 2001 From: lizexu123 <2694294196@qq.com> Date: Tue, 12 Aug 2025 11:50:51 +0000 Subject: [PATCH 17/22] delete BLCOK_SIZE --- custom_ops/gpu_ops/rebuild_padding.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 74da1d89fc..772fefa1ac 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -14,7 +14,6 @@ #include "helper.h" // NOLINT -#define BLOCK_SIZE 128 template __global__ void RebuildPaddingKernel(T *output_data, From 6a9f3eec81f6ef726ada6ccf365603b363c3c75b Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Thu, 14 Aug 2025 19:35:07 +0800 Subject: [PATCH 18/22] fix --- test/layers/test_rebuild_padding.py | 222 ---------------------------- 1 file changed, 222 deletions(-) delete mode 100644 test/layers/test_rebuild_padding.py diff --git a/test/layers/test_rebuild_padding.py b/test/layers/test_rebuild_padding.py deleted file mode 100644 index a66bde915b..0000000000 --- a/test/layers/test_rebuild_padding.py +++ /dev/null @@ -1,222 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" - -import time -import unittest -from typing import Tuple - -import numpy as np -import paddle - - -class TestCuSeqlensQPerformance(unittest.TestCase): - - def setUp(self): - paddle.device.set_device("gpu:0") - - # Test configurations:(batch_size, max_seq_len, dim_embed, avg_seq_len_ratio) - self.test_configs = [ - # Small scale tests - (4, 512, 2048, 0.8), - (8, 512, 4096, 0.7), - # Medium scale tests - (16, 1024, 4096, 0.6), - (32, 1024, 4096, 0.8), - # Large scale tests - (64, 2048, 4096, 0.5), - (128, 1024, 8192, 0.7), - (256, 512, 4096, 0.9), - ] - - self.warmup_runs = 10 - self.benchmark_runs = 50 - - def generate_realistic_test_data( - self, batch_size: int, max_seq_len: int, dim_embed: int, avg_ratio: float - ) -> dict: - """Generate test data closer to real-world scenarios""" - - avg_seq_len = int(max_seq_len * avg_ratio) - std_seq_len = avg_seq_len // 4 - - seq_lens = np.random.normal(avg_seq_len, std_seq_len, batch_size) - seq_lens = np.clip(seq_lens, max_seq_len // 10, max_seq_len).astype(np.int32) - - total_tokens = np.sum(seq_lens) - - tmp_out = paddle.randn([total_tokens, dim_embed], dtype=paddle.float16) - tmp_out = tmp_out.cuda() - - cu_seqlens_q_np = np.zeros(batch_size + 1, dtype=np.int32) - for i in range(batch_size): - cu_seqlens_q_np[i + 1] = cu_seqlens_q_np[i] + seq_lens[i] - - cu_seqlens_q = paddle.to_tensor(cu_seqlens_q_np, dtype=paddle.int32).cuda() - - seq_len_this_time = paddle.to_tensor(seq_lens, dtype=paddle.int32).cuda() - seq_len_decoder = paddle.to_tensor(seq_lens, dtype=paddle.int32).cuda() - seq_len_encoder = paddle.zeros([batch_size], dtype=paddle.int32).cuda() - - return { - "tmp_out": tmp_out, - "cu_seqlens_q": cu_seqlens_q, - "seq_len_this_time": seq_len_this_time, - "seq_len_decoder": seq_len_decoder, - "seq_len_encoder": seq_len_encoder, - "max_input_length": max_seq_len, - "actual_tokens": total_tokens, - "seq_lens": seq_lens, - } - - def benchmark_cu_seqlens_performance(self, data_dict: dict) -> Tuple[float, float, paddle.Tensor]: - """Test performance of cu_seqlens_q version""" - - def rebuild_padding_cu_seqlens( - tmp_out, cu_seqlens_q, seq_len_this_time, seq_len_decoder, seq_len_encoder, max_input_length - ): - - from fastdeploy.model_executor.pre_and_post_process import rebuild_padding - - hidden_states = rebuild_padding( - tmp_out, cu_seqlens_q, seq_len_this_time, seq_len_decoder, seq_len_encoder, None, max_input_length - ) - return hidden_states - - for _ in range(self.warmup_runs): - result = rebuild_padding_cu_seqlens( - data_dict["tmp_out"], - data_dict["cu_seqlens_q"], - data_dict["seq_len_this_time"], - data_dict["seq_len_decoder"], - data_dict["seq_len_encoder"], - data_dict["max_input_length"], - ) - paddle.device.cuda.synchronize() - - paddle.device.cuda.synchronize() - start_time = time.perf_counter() - - for _ in range(self.benchmark_runs): - result = rebuild_padding_cu_seqlens( - data_dict["tmp_out"], - data_dict["cu_seqlens_q"], - data_dict["seq_len_this_time"], - data_dict["seq_len_decoder"], - data_dict["seq_len_encoder"], - data_dict["max_input_length"], - ) - - paddle.device.cuda.synchronize() - end_time = time.perf_counter() - - avg_time = (end_time - start_time) / self.benchmark_runs * 1000 # ms - - # throughput(tokens/ms) - throughput = data_dict["actual_tokens"] / avg_time - - return avg_time, throughput, result - - def test_performance_scaling(self): - """Test performance unfer different scales""" - print("\n" + "=" * 90) - print("CU_SEQLENS_Q Performance Scaling Test") - print("=" * 90) - print( - f"{'Config':<20} 
{'Batch':<6} {'SeqLen':<7} {'Tokens':<8} {'Time(ms)':<10} {'Throughput':<12} {'Memory(MB)'}" - ) - print("-" * 90) - - results = [] - - for i, (batch_size, max_seq_len, dim_embed, avg_ratio) in enumerate(self.test_configs): - config_name = f"Config_{i+1}" - - try: - data_dict = self.generate_realistic_test_data(batch_size, max_seq_len, dim_embed, avg_ratio) - - paddle.device.cuda.empty_cache() - mem_before = paddle.device.cuda.memory_allocated() / 1024 / 1024 # MB - - avg_time, throughput, result = self.benchmark_cu_seqlens_performance(data_dict) - - mem_after = paddle.device.cuda.memory_allocated() / 1024 / 1024 # MB - mem_usage = mem_after - mem_before - - results.append( - { - "config": config_name, - "batch_size": batch_size, - "max_seq_len": max_seq_len, - "dim_embed": dim_embed, - "actual_tokens": data_dict["actual_tokens"], - "avg_time": avg_time, - "throughput": throughput, - "memory_mb": mem_usage, - "result_shape": result.shape, - } - ) - - print( - f"{config_name:<20} {batch_size:<6} {max_seq_len:<7} " - f"{data_dict['actual_tokens']:<8} {avg_time:<10.3f} " - f"{throughput:<12.1f} {mem_usage:<8.1f}" - ) - - expected_shape = [batch_size, dim_embed] - self.assertEqual(list(result.shape), expected_shape, f"Output shape mismatch for {config_name}") - - except Exception as e: - raise RuntimeError( - f"Failed to test configuration {config_name} (batch={batch_size}, seq_len={max_seq_len}): {str(e)}" - ) - - print("-" * 90) - return results - - -def main(): - """Run all performance tests""" - print("Starting CU_SEQLENS_Q Performance Benchmark...") - print(f"GPU: {paddle.device.cuda.get_device_name()}") - print(f"GPU Memory: {paddle.device.cuda.get_device_properties().total_memory / 1024**3:.1f} GB") - - test_instance = TestCuSeqlensQPerformance() - test_instance.setUp() - - try: - scaling_results = test_instance.test_performance_scaling() - - print("\n" + "=" * 50) - print("Performance Summary") - print("=" * 50) - - if scaling_results: - best_throughput = max(scaling_results, key=lambda x: x["throughput"]) - print(f"Best throughput: {best_throughput['throughput']:.1f} tokens/ms") - print( - f" Config: {best_throughput['config']} " - f"(batch={best_throughput['batch_size']}, " - f"seq_len={best_throughput['max_seq_len']})" - ) - - print("=" * 50) - - except Exception as e: - raise RuntimeError(f"Performance benchmark failed: {str(e)}") - - -if __name__ == "__main__": - main() From 9f6bf4c8dfa25b35e34ae26697ceccde809c42d0 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Thu, 14 Aug 2025 21:15:49 +0800 Subject: [PATCH 19/22] merge develop --- custom_ops/cpu_ops/rebuild_padding.cc | 1 + custom_ops/gpu_ops/rebuild_padding.cu | 1 - fastdeploy/model_executor/pre_and_post_process.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index adbf95e5fd..2dfc9f17e2 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -19,6 +19,7 @@ #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) #endif + template void RebuildPaddingCPUImpl(T *output_data, const T *input_data, diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 772fefa1ac..93c1bb38c2 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -14,7 +14,6 @@ #include "helper.h" // NOLINT - template __global__ void RebuildPaddingKernel(T *output_data, const T *input_data, diff --git 
a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index e3dfcbf3c5..24f596b185 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -545,7 +545,7 @@ def rebuild_padding( output_padding_offset, max_input_length, ) - # elif current_platform.is_gcu(): + elif current_platform.is_gcu(): from fastdeploy.model_executor.ops.gcu import rebuild_padding hidden_states = rebuild_padding( From 3baf2b5c9982b77a8c28a05337d4fd2846a0e394 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Thu, 14 Aug 2025 21:16:20 +0800 Subject: [PATCH 20/22] fix pre-commit --- fastdeploy/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 58ed5b0b79..ecc79a680e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -577,7 +577,6 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod self.share_inputs["min_dec_len"][idx : idx + 1] = max_dec_len self.share_inputs["stop_flags"][idx : idx + 1] = False self.share_inputs["temperature"][idx : idx + 1] = 1 - self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length From dcf8f56ab5a522007b42aa6abcd789e3cf7da9ac Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Thu, 14 Aug 2025 21:38:04 +0800 Subject: [PATCH 21/22] fix pre-commit --- fastdeploy/model_executor/pre_and_post_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index f81e89988e..30b87d65b1 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -596,7 +596,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, From 9593694ade8692a2197c0f2d498913775a2afb1f Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 18 Aug 2025 19:07:34 +0800 Subject: [PATCH 22/22] fix --- fastdeploy/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 1d317b71be..2dfe1021c6 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -679,7 +679,6 @@ def _init_share_inputs(self, max_num_seqs: int): 0, dtype="int64", ) - self.share_inputs["batch_id_per_token"] = paddle.full( [max_num_seqs * self.parallel_config.max_model_len, 1], 0, dtype="int32" )
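
Illustrative sketch (not part of the patches above): the recurring change in this series replaces the per-request `cum_offsets[bi]` lookup with `cu_seqlens_q[bi]`, which works because `bi * max_input_length - cum_offsets[bi] == cu_seqlens_q[bi]` when `cu_seqlens_q` is the exclusive prefix sum of the per-request token counts. The NumPy sketch below checks that identity and mirrors the gather performed by RebuildPaddingKernel on the non-speculative path. The toy `max_input_length`, the example sequence lengths, and the reconstructed definition of the removed `cum_offsets` tensor (cumulative padding before each request) are assumptions for illustration only, not code from the series.

import numpy as np

# Toy setup; values are arbitrary.
max_input_length = 8
seq_lens_this_time = np.array([3, 1, 5, 2], dtype=np.int32)
bsz = seq_lens_this_time.shape[0]

# cu_seqlens_q: exclusive prefix sum over per-request token counts, shape [bsz + 1].
cu_seqlens_q = np.zeros(bsz + 1, dtype=np.int32)
cu_seqlens_q[1:] = np.cumsum(seq_lens_this_time)

# Assumed definition of the removed cum_offsets tensor: cumulative number of
# padding slots before request bi in the padded [bsz, max_input_length] layout.
pad_lens = max_input_length - seq_lens_this_time
cum_offsets = np.zeros(bsz, dtype=np.int32)
cum_offsets[1:] = np.cumsum(pad_lens)[:-1]

for bi in range(bsz):
    # Old indexing (cum_offsets) and new indexing (cu_seqlens_q) pick the same row.
    assert bi * max_input_length - cum_offsets[bi] == cu_seqlens_q[bi]


def rebuild_padding_ref(tmp_out, cu_seqlens_q, seq_len_this_time,
                        seq_lens_decoder, seq_lens_encoder):
    """NumPy mirror of RebuildPaddingKernel's gather (non-speculative path)."""
    bsz = seq_len_this_time.shape[0]
    out = np.zeros((bsz, tmp_out.shape[1]), dtype=tmp_out.dtype)
    for bi in range(bsz):
        if seq_len_this_time[bi] == 0:
            continue
        if seq_lens_decoder[bi] == 0 and seq_lens_encoder[bi] == 0:
            continue
        # Prefill request: take the last prompt token; decode request: the first token.
        seq_id = seq_lens_encoder[bi] - 1 if seq_lens_encoder[bi] > 0 else 0
        out[bi] = tmp_out[cu_seqlens_q[bi] + seq_id]
    return out


# Tiny usage example: treat every request as a prefill request.
dim_embed = 4
tmp_out = np.random.rand(int(cu_seqlens_q[-1]), dim_embed).astype(np.float32)
hidden_states = rebuild_padding_ref(
    tmp_out,
    cu_seqlens_q,
    seq_lens_this_time,
    np.zeros(bsz, dtype=np.int32),   # seq_lens_decoder
    seq_lens_this_time.copy(),       # seq_lens_encoder
)
assert hidden_states.shape == (bsz, dim_embed)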
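
The speculative-decoding path (RebuildAppendPaddingKernel, used by the MTP proposer) gathers one row per output token via output_padding_offset instead of one row per request; the same substitution applies there through the expression cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi] used in the patch. Below is a hedged NumPy mirror of that index math only; output_padding_offset is treated as an opaque per-output-token mapping supplied by the caller, and the function name is for illustration, not an API from the series.

import numpy as np


def rebuild_append_padding_ref(tmp_out, cu_seqlens_q, seq_len_this_time,
                               seq_lens_decoder, seq_lens_encoder,
                               output_padding_offset, max_input_length):
    """NumPy mirror of RebuildAppendPaddingKernel's index math.

    output_padding_offset[out_token_id] is assumed to map a compacted output
    token id back to its position in the padded layout.
    """
    output_token_num = output_padding_offset.shape[0]
    out = np.zeros((output_token_num, tmp_out.shape[1]), dtype=tmp_out.dtype)
    for out_token_id in range(output_token_num):
        ori_token_id = out_token_id + int(output_padding_offset[out_token_id])
        bi = ori_token_id // max_input_length
        if seq_len_this_time[bi] == 0:
            continue
        if seq_lens_decoder[bi] == 0 and seq_lens_encoder[bi] == 0:
            continue
        seq_id = seq_lens_encoder[bi] - 1 if seq_lens_encoder[bi] > 0 else 0
        # cum_offset_bi is exactly the expression the patch substitutes for the
        # removed cum_offsets[bi].
        cum_offset_bi = bi * max_input_length - int(cu_seqlens_q[bi])
        input_token_id = ori_token_id - cum_offset_bi + seq_id
        out[out_token_id] = tmp_out[input_token_id]
    return out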