From 4950687370992a59c00e1a2bd003005840f787dd Mon Sep 17 00:00:00 2001 From: lizexu123 <2694294196@qq.com> Date: Wed, 30 Jul 2025 11:24:21 +0000 Subject: [PATCH 01/22] rebuild_padding --- custom_ops/cpu_ops/get_padding_offset.cc | 5 +-- custom_ops/cpu_ops/rebuild_padding.cc | 36 +++++++++--------- custom_ops/gpu_ops/get_padding_offset.cu | 5 +-- custom_ops/gpu_ops/rebuild_padding.cu | 37 ++++++++++--------- .../speculate_get_padding_offset.cu | 6 +-- .../model_executor/pre_and_post_process.py | 3 -- fastdeploy/worker/gpu_model_runner.py | 13 +++---- 7 files changed, 46 insertions(+), 59 deletions(-) diff --git a/custom_ops/cpu_ops/get_padding_offset.cc b/custom_ops/cpu_ops/get_padding_offset.cc index 8fe73bc8e4..02ee71a263 100644 --- a/custom_ops/cpu_ops/get_padding_offset.cc +++ b/custom_ops/cpu_ops/get_padding_offset.cc @@ -84,7 +84,6 @@ std::vector GetPaddingOffset(const paddle::Tensor &input_ids, seq_length, bsz); return {x_remove_padding, - cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k}; @@ -97,7 +96,7 @@ std::vector> GetPaddingOffsetInferShape( const std::vector &seq_len_shape) { int64_t bsz = seq_len_shape[0]; int64_t seq_len = input_ids_shape[1]; - return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}}; + return {{-1}, {-1}, {bsz + 1}, {bsz + 1}}; } std::vector GetPaddingOffsetInferDtype( @@ -106,7 +105,6 @@ std::vector GetPaddingOffsetInferDtype( const paddle::DataType &token_num_dtype, const paddle::DataType &seq_len_dtype) { return {input_ids_dtype, - seq_len_dtype, seq_len_dtype, seq_len_dtype, seq_len_dtype}; @@ -115,7 +113,6 @@ std::vector GetPaddingOffsetInferDtype( PD_BUILD_STATIC_OP(get_padding_offset_cpu) .Inputs({"input_ids", "cum_offsets", "token_num", "seq_len"}) .Outputs({"x_remove_padding", - "cum_offsets_out", "padding_offset", "cu_seqlens_q", "cu_seqlens_k"}) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 8ce533d041..315a04f30b 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -22,7 +22,7 @@ template void RebuildPaddingCPUImpl(T *output_data, const T *input_data, - const int *cum_offsets_data, + const int *cu_seqlens_q, const int *seq_len_this_time_data, const int *seq_lens_decoder_data, const int *seq_lens_encoder_data, @@ -44,7 +44,7 @@ void RebuildPaddingCPUImpl(T *output_data, seq_id = seq_lens_encoder_data[bi] - 1; } const int ori_token_idx = - bi * max_input_length - cum_offsets_data[bi] + seq_id; + cu_seqlens_q[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; output_data[i] = input_data[src_offset]; @@ -54,7 +54,7 @@ void RebuildPaddingCPUImpl(T *output_data, template void RebuildAppendPaddingCPUImpl(T *output_data, const T *input_data, - const int *cum_offsets_data, + const int *cu_seqlens_q, const int *seq_len_this_time_data, const int *seq_lens_decoder_data, const int *seq_lens_encoder_data, @@ -76,7 +76,8 @@ void RebuildAppendPaddingCPUImpl(T *output_data, if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - int input_token_id = ori_token_id - cum_offsets_data[bi] + seq_id; + + int input_token_id = cu_seqlens_q[bi] + seq_id; int bias_idx = i % dim_embed; int src_offset = input_token_id * dim_embed + bias_idx; output_data[i] = input_data[src_offset]; @@ -85,14 +86,13 @@ void RebuildAppendPaddingCPUImpl(T *output_data, std::vector RebuildPaddingCPU( const paddle::Tensor &tmp_out, - const paddle::Tensor &cum_offsets, + const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &seq_len_this_time, const 
paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, const paddle::optional &output_padding_offset, int max_input_length) { auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true); - auto cum_offsets_cpu = cum_offsets.copy_to(paddle::CPUPlace(), true); auto seq_len_this_time_cpu = seq_len_this_time.copy_to(paddle::CPUPlace(), true); auto seq_lens_decoder_cpu = @@ -107,7 +107,7 @@ std::vector RebuildPaddingCPU( int token_num = tmp_out_cpu.shape()[0]; int dim_embed = tmp_out_cpu.shape()[1]; - int bsz = cum_offsets_cpu.shape()[0]; + int bsz = cu_seqlens_q.shape()[0] - 1; paddle::Tensor out; if (output_padding_offset_cpu) { @@ -128,7 +128,7 @@ std::vector RebuildPaddingCPU( {bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace()); } - const int *cum_offsets_data = cum_offsets_cpu.data(); + const int *cu_seqlens_q_data = cu_seqlens_q.data(); const int *seq_len_this_time_data = seq_len_this_time_cpu.data(); const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data(); const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data(); @@ -141,7 +141,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildAppendPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -154,7 +154,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -167,7 +167,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -186,7 +186,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -198,7 +198,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -211,7 +211,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -230,7 +230,7 @@ std::vector RebuildPaddingCPU( std::vector> RebuildPaddingInferShape( const std::vector &tmp_out_shape, - const std::vector &cum_offsets_shape, + const std::vector &cu_seqlens_q_shape, const std::vector &seq_len_this_time_shape, const std::vector &seq_lens_decoder_shape, const std::vector &seq_lens_encoder_shape, @@ -239,14 +239,14 @@ std::vector> RebuildPaddingInferShape( if (output_padding_offset_shape) { return {{-1, dim_embed}}; } else { - int64_t bsz = cum_offsets_shape[0]; + int64_t bsz = cu_seqlens_q_shape[0]-1; return {{bsz, dim_embed}}; } } std::vector RebuildPaddingInferDtype( const paddle::DataType &tmp_out_dtype, - const paddle::DataType &cum_offsets_dtype, + const paddle::DataType &cu_seqlens_q_dtype, const paddle::DataType &seq_len_this_time_dtype, const paddle::DataType &seq_lens_decoder_dtype, const paddle::DataType &seq_lens_encoder_dtype, @@ -256,7 +256,7 @@ std::vector RebuildPaddingInferDtype( PD_BUILD_STATIC_OP(rebuild_padding_cpu) .Inputs({"tmp_out", - "cum_offsets", + "cu_seqlens_q", "seq_len_this_time", "seq_lens_decoder", 
"seq_lens_encoder", diff --git a/custom_ops/gpu_ops/get_padding_offset.cu b/custom_ops/gpu_ops/get_padding_offset.cu index 8fae9b88c3..f505e1c326 100644 --- a/custom_ops/gpu_ops/get_padding_offset.cu +++ b/custom_ops/gpu_ops/get_padding_offset.cu @@ -101,7 +101,6 @@ std::vector GetPaddingOffset(const paddle::Tensor &input_ids, cum_offsets_out.data(), seq_length); return {x_remove_padding, - cum_offsets_out, batch_id_per_token, cu_seqlens_q, cu_seqlens_k}; // , enc_token_num, dec_token_num}; @@ -114,7 +113,7 @@ std::vector> GetPaddingOffsetInferShape( const std::vector &seq_len_shape) { int64_t bsz = seq_len_shape[0]; int64_t seq_len = input_ids_shape[1]; - return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}}; + return {{-1}, {-1}, {bsz + 1}, {bsz + 1}}; } std::vector GetPaddingOffsetInferDtype( @@ -123,7 +122,6 @@ std::vector GetPaddingOffsetInferDtype( const paddle::DataType &token_num_dtype, const paddle::DataType &seq_len_dtype) { return {input_ids_dtype, - seq_len_dtype, seq_len_dtype, seq_len_dtype, seq_len_dtype}; @@ -132,7 +130,6 @@ std::vector GetPaddingOffsetInferDtype( PD_BUILD_STATIC_OP(get_padding_offset) .Inputs({"input_ids", "token_num", "cum_offsets", "seq_len"}) .Outputs({"x_remove_padding", - "cum_offsets_out", "batch_id_per_token", "cu_seqlens_q", "cu_seqlens_k"}) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 3d69e9e459..38a1fc551c 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -17,7 +17,7 @@ template __global__ void RebuildPaddingKernel(T *output_data, const T *input_data, - const int *cum_offsets, + const int *cu_seqlens_q, const int *seq_len_this_time, const int *seq_len_decoder, const int *seq_len_encoder, @@ -36,8 +36,9 @@ __global__ void RebuildPaddingKernel(T *output_data, if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; // if encoder, get last token; just decoder, get first token. if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; + const int ori_token_idx = - bi * max_input_length - cum_offsets[bi] + seq_id; + cu_seqlens_q[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; Load(&input_data[src_offset], &src_vec); Store(src_vec, &output_data[i]); @@ -47,7 +48,7 @@ __global__ void RebuildPaddingKernel(T *output_data, template __global__ void RebuildAppendPaddingKernel(T *output_data, const T *input_data, - const int *cum_offset, + const int *cu_seqlens_q, const int *seq_len_this_time, const int *seq_len_decoder, const int *seq_len_encoder, @@ -69,7 +70,7 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, // if encoder, get last token; just decoder, get first token. 
if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; - const int input_token_id = ori_token_id - cum_offset[bi] + seq_id; + const int input_token_id = cu_seqlens_q[bi]+ seq_id; const int bias_idx = i % dim_embed; Load(&input_data[input_token_id * dim_embed + bias_idx], @@ -81,7 +82,7 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cum_offsets, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, @@ -100,7 +101,7 @@ std::vector rebuild_padding( std::vector tmp_out_shape = tmp_out.shape(); const int token_num = tmp_out_shape[0]; const int dim_embed = tmp_out_shape[1]; - const int bsz = cum_offsets.shape()[0]; + const int bsz = cu_seqlens_q.shape()[0] - 1; paddle::Tensor out; if (output_padding_offset) { @@ -133,7 +134,7 @@ std::vector rebuild_padding( <<>>( reinterpret_cast(out.data()), reinterpret_cast(tmp_out.data()), - cum_offsets.data(), + cu_seqlens_q.data(), seq_len_this_time.data(), seq_lens_decoder.data(), seq_lens_encoder.data(), @@ -147,7 +148,7 @@ std::vector rebuild_padding( reinterpret_cast(out.data()), reinterpret_cast( const_cast(tmp_out.data())), - cum_offsets.data(), + cu_seqlens_q.data(), seq_len_this_time.data(), seq_lens_decoder.data(), seq_lens_encoder.data(), @@ -160,7 +161,7 @@ std::vector rebuild_padding( paddle::Tensor RebuildPaddingFunc( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cum_offsets, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, @@ -170,7 +171,7 @@ paddle::Tensor RebuildPaddingFunc( case paddle::DataType::BFLOAT16: { return rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -180,7 +181,7 @@ paddle::Tensor RebuildPaddingFunc( case paddle::DataType::FLOAT16: { return rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -190,7 +191,7 @@ paddle::Tensor RebuildPaddingFunc( case paddle::DataType::FLOAT32: { return rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -208,14 +209,14 @@ paddle::Tensor RebuildPaddingFunc( std::vector RebuildPadding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cum_offsets, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, const paddle::optional &output_padding_offset, int max_input_length) { return {RebuildPaddingFunc(tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -225,7 +226,7 @@ std::vector RebuildPadding( std::vector> RebuildPaddingInferShape( const std::vector &tmp_out_shape, - const std::vector &cum_offsets_shape, + const std::vector &cu_seqlens_q_shape, const std::vector &seq_len_this_time_shape, const std::vector &seq_lens_decoder_shape, const std::vector &seq_lens_encoder_shape, @@ -235,14 +236,14 @@ std::vector> RebuildPaddingInferShape( if (output_padding_offset_shape) { return {{-1, dim_embed}}; } else { - int64_t bsz = cum_offsets_shape[0]; + int64_t bsz = cu_seqlens_q_shape[0] 
- 1; return {{bsz, dim_embed}}; } } std::vector RebuildPaddingInferDtype( const paddle::DataType &tmp_out_dtype, - const paddle::DataType &cum_offsets_dtype, + const paddle::DataType &cu_seqlens_q_dtype, const paddle::DataType &seq_len_this_time_dtype, const paddle::DataType &seq_lens_decoder_dtype, const paddle::DataType &seq_lens_encoder_dtype, @@ -252,7 +253,7 @@ std::vector RebuildPaddingInferDtype( PD_BUILD_STATIC_OP(rebuild_padding) .Inputs({"tmp_out", - "cum_offsets", + "cu_seqlens_q", "seq_len_this_time", "seq_lens_decoder", "seq_lens_encoder", diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu index 96186d761f..e37dacbf34 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu @@ -106,7 +106,6 @@ std::vector SpeculateGetPaddingOffset( seq_length, max_draft_tokens); return {x_remove_padding, - cum_offsets_out, batch_id_per_token, cu_seqlens_q, cu_seqlens_k}; // , enc_token_num, dec_token_num}; @@ -121,7 +120,7 @@ std::vector> SpeculateGetPaddingOffsetInferShape( const std::vector& seq_lens_encoder_shape) { int64_t bsz = seq_len_shape[0]; int64_t seq_len = input_ids_shape[1]; - return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}}; + return {{-1}, {-1}, {bsz + 1}, {bsz + 1}}; } std::vector SpeculateGetPaddingOffsetInferDtype( @@ -132,7 +131,6 @@ std::vector SpeculateGetPaddingOffsetInferDtype( const paddle::DataType& seq_len_dtype, const paddle::DataType& seq_lens_encoder_dtype) { return {input_ids_dtype, - seq_len_dtype, seq_len_dtype, seq_len_dtype, seq_len_dtype}; @@ -141,12 +139,10 @@ std::vector SpeculateGetPaddingOffsetInferDtype( PD_BUILD_STATIC_OP(speculate_get_padding_offset) .Inputs({"input_ids", "draft_tokens", - "cum_offsets", "token_num", "seq_len", "seq_lens_encoder"}) .Outputs({"x_remove_padding", - "cum_offsets_out", "batch_id_per_token", "cu_seqlens_q", "cu_seqlens_k"}) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 7d0a2aef78..190f442fbf 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -104,7 +104,6 @@ def pre_process( if speculative_decoding: ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, @@ -134,14 +133,12 @@ def pre_process( else: ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time) return ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index ca73961408..8b54bb3dcd 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -580,7 +580,6 @@ def _init_share_inputs(self, max_num_seqs: int): 0, dtype="int64", ) - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["batch_id_per_token"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -695,7 +694,6 @@ def _prepare_inputs(self) -> None: # Remove padding ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, @@ 
-711,7 +709,6 @@ def _prepare_inputs(self) -> None: ) self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -910,7 +907,7 @@ def _dummy_run( expected_decode_len=expected_decode_len, ) if self.speculative_method in ["mtp"]: - self.proposer.dummy_prefill_inputs( + self.proposer.dummy_prefipre_processll_inputs( num_tokens=num_tokens, batch_size=batch_size, expected_decode_len=expected_decode_len, @@ -940,7 +937,7 @@ def _dummy_run( hidden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], @@ -1199,9 +1196,11 @@ class at the server level, which is too granular for ModelRunner. ids_remove_padding=self.share_inputs["ids_remove_padding"], forward_meta=self.forward_meta, ) - hidden_states = rebuild_padding( + from fastdeploy.model_executor.ops.cpu import rebuild_padding_cpu + + hidden_states = rebuild_padding_cpu( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], From f1db5268f15b7236aa6a8226c43191946193bffe Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 14:13:12 +0800 Subject: [PATCH 02/22] update rebuild_padding --- custom_ops/cpu_ops/rebuild_padding.cc | 65 ++++++++++--------- .../model_executor/pre_and_post_process.py | 14 ++-- fastdeploy/worker/gcu_model_runner.py | 7 +- fastdeploy/worker/gpu_model_runner.py | 6 +- fastdeploy/worker/iluvatar_model_runner.py | 5 +- 5 files changed, 49 insertions(+), 48 deletions(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 315a04f30b..5f8e033f56 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ template void RebuildPaddingCPUImpl(T *output_data, const T *input_data, - const int *cu_seqlens_q, + const int *cu_seqlens_q_data, const int *seq_len_this_time_data, const int *seq_lens_decoder_data, const int *seq_lens_encoder_data, @@ -40,11 +40,13 @@ void RebuildPaddingCPUImpl(T *output_data, if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) { continue; } + + // if encoder, get last token; just decoder, get first token. 
if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - const int ori_token_idx = - cu_seqlens_q[bi] + seq_id; + + const int ori_token_idx = cu_seqlens_q_data[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; output_data[i] = input_data[src_offset]; @@ -54,7 +56,7 @@ void RebuildPaddingCPUImpl(T *output_data, template void RebuildAppendPaddingCPUImpl(T *output_data, const T *input_data, - const int *cu_seqlens_q, + const int *cu_seqlens_q_data, const int *seq_len_this_time_data, const int *seq_lens_decoder_data, const int *seq_lens_encoder_data, @@ -63,36 +65,42 @@ void RebuildAppendPaddingCPUImpl(T *output_data, const int dim_embed, const int64_t output_elem_nums) { for (int i = 0; i < output_elem_nums; ++i) { - int out_token_id = i / dim_embed; - int ori_token_id = + const int out_token_id = i / dim_embed; + const int ori_token_id = out_token_id + output_padding_offset_data[out_token_id]; - int bi = ori_token_id / max_input_length; - if (seq_len_this_time_data[bi] == 0 || - (seq_lens_decoder_data[bi] == 0 && - seq_lens_encoder_data[bi] == 0)) { + const int bi = ori_token_id / max_input_length; + int seq_id = 0; + + if (seq_len_this_time_data[bi] == 0) { continue; } - int seq_id = 0; + if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) { + continue; + } + + // if encoder, get last token; just decoder, get first token. if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - int input_token_id = cu_seqlens_q[bi] + seq_id; - int bias_idx = i % dim_embed; - int src_offset = input_token_id * dim_embed + bias_idx; + const int input_token_id = cu_seqlens_q_data[bi] + seq_id; + const int bias_idx = i % dim_embed; + const int src_offset = input_token_id * dim_embed + bias_idx; + output_data[i] = input_data[src_offset]; } } std::vector RebuildPaddingCPU( const paddle::Tensor &tmp_out, - const paddle::Tensor &cu_seqlens_q, + const paddle::Tensor &cu_seqlens_q, // 改名:从cum_offsets改为cu_seqlens_q const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, const paddle::optional &output_padding_offset, int max_input_length) { auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true); + auto cu_seqlens_q_cpu = cu_seqlens_q.copy_to(paddle::CPUPlace(), true); // 改名 auto seq_len_this_time_cpu = seq_len_this_time.copy_to(paddle::CPUPlace(), true); auto seq_lens_decoder_cpu = @@ -107,7 +115,7 @@ std::vector RebuildPaddingCPU( int token_num = tmp_out_cpu.shape()[0]; int dim_embed = tmp_out_cpu.shape()[1]; - int bsz = cu_seqlens_q.shape()[0] - 1; + int bsz = cu_seqlens_q_cpu.shape()[0] - 1; // 改为bsz+1的长度,所以bsz = shape[0] - 1 paddle::Tensor out; if (output_padding_offset_cpu) { @@ -128,7 +136,7 @@ std::vector RebuildPaddingCPU( {bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace()); } - const int *cu_seqlens_q_data = cu_seqlens_q.data(); + const int *cu_seqlens_q_data = cu_seqlens_q_cpu.data(); // 改名 const int *seq_len_this_time_data = seq_len_this_time_cpu.data(); const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data(); const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data(); @@ -141,7 +149,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildAppendPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -154,7 +162,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), 
tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -167,7 +175,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -186,7 +194,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -198,7 +206,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -207,11 +215,10 @@ std::vector RebuildPaddingCPU( elem_nums); break; case paddle::DataType::BFLOAT16: - RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, + cu_seqlens_q_data, // 改名 seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -230,7 +237,7 @@ std::vector RebuildPaddingCPU( std::vector> RebuildPaddingInferShape( const std::vector &tmp_out_shape, - const std::vector &cu_seqlens_q_shape, + const std::vector &cu_seqlens_q_shape, // 改名:从cum_offsets_shape改为cu_seqlens_q_shape const std::vector &seq_len_this_time_shape, const std::vector &seq_lens_decoder_shape, const std::vector &seq_lens_encoder_shape, @@ -239,14 +246,14 @@ std::vector> RebuildPaddingInferShape( if (output_padding_offset_shape) { return {{-1, dim_embed}}; } else { - int64_t bsz = cu_seqlens_q_shape[0]-1; + int64_t bsz = cu_seqlens_q_shape[0] - 1; // 改为bsz+1的长度,所以bsz = shape[0] - 1 return {{bsz, dim_embed}}; } } std::vector RebuildPaddingInferDtype( const paddle::DataType &tmp_out_dtype, - const paddle::DataType &cu_seqlens_q_dtype, + const paddle::DataType &cu_seqlens_q_dtype, // 改名:从cum_offsets_dtype改为cu_seqlens_q_dtype const paddle::DataType &seq_len_this_time_dtype, const paddle::DataType &seq_lens_decoder_dtype, const paddle::DataType &seq_lens_encoder_dtype, @@ -256,7 +263,7 @@ std::vector RebuildPaddingInferDtype( PD_BUILD_STATIC_OP(rebuild_padding_cpu) .Inputs({"tmp_out", - "cu_seqlens_q", + "cu_seqlens_q", // 改名:从cum_offsets改为cu_seqlens_q "seq_len_this_time", "seq_lens_decoder", "seq_lens_encoder", diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 190f442fbf..28a4cab486 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -484,7 +484,7 @@ def step_cuda( def rebuild_padding( tmp_out: paddle.Tensor, - cum_offsets: paddle.Tensor, + cu_seqlens_q: paddle.Tensor, seq_len_this_time: paddle.Tensor, seq_lens_decoder: paddle.Tensor, seq_lens_encoder: paddle.Tensor, @@ -500,7 +500,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -512,7 +512,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -524,19 +524,19 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, output_padding_offset, max_input_length, ) - elif current_platform.is_gcu(): + # elif current_platform.is_gcu(): from 
fastdeploy.model_executor.ops.gcu import rebuild_padding hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -548,7 +548,7 @@ def rebuild_padding( hidden_states = rebuild_padding_cpu( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 63bf6e7b34..7fe06e0fb4 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -398,7 +398,7 @@ def _init_share_inputs(self, max_num_seqs: int): 0, dtype="int64", ) - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["batch_id_per_token"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -495,7 +495,6 @@ def _prepare_inputs(self) -> None: ) self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -698,7 +697,7 @@ def _dummy_run( hidden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], @@ -910,7 +909,7 @@ class at the server level, which is too granular for ModelRunner. hidden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 8b54bb3dcd..40331074f0 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -907,7 +907,7 @@ def _dummy_run( expected_decode_len=expected_decode_len, ) if self.speculative_method in ["mtp"]: - self.proposer.dummy_prefipre_processll_inputs( + self.proposer.dummy_prefill_inputs( num_tokens=num_tokens, batch_size=batch_size, expected_decode_len=expected_decode_len, @@ -1196,9 +1196,7 @@ class at the server level, which is too granular for ModelRunner. 
ids_remove_padding=self.share_inputs["ids_remove_padding"], forward_meta=self.forward_meta, ) - from fastdeploy.model_executor.ops.cpu import rebuild_padding_cpu - - hidden_states = rebuild_padding_cpu( + hidden_states = rebuild_padding( model_output, self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index f3ef3823c0..f4fcb2fc03 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -363,7 +363,6 @@ def _init_share_inputs(self, max_num_seqs: int): 0, dtype="int64", ) - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["padding_offset"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -468,7 +467,6 @@ def _prepare_inputs(self) -> None: ) self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["padding_offset"].copy_(padding_offset, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -531,7 +529,6 @@ def initialize_forward_meta(self): seq_lens_encoder=self.share_inputs["seq_lens_encoder"], seq_lens_decoder=self.share_inputs["seq_lens_decoder"], seq_lens_this_time=self.share_inputs["seq_lens_this_time"], - cum_offsets=self.share_inputs["cum_offsets"], padding_offset=self.share_inputs["padding_offset"], cu_seqlens_q=self.share_inputs["cu_seqlens_q"], cu_seqlens_k=self.share_inputs["cu_seqlens_k"], @@ -656,7 +653,7 @@ def _dummy_run( hiddden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], From 4f4a7b86509b7b52277f1c25128e463df4b5c33b Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 16:13:48 +0800 Subject: [PATCH 03/22] add rebuild_padding test --- test/layers/test_rebuild_padding.py | 212 ++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 test/layers/test_rebuild_padding.py diff --git a/test/layers/test_rebuild_padding.py b/test/layers/test_rebuild_padding.py new file mode 100644 index 0000000000..af3b670e34 --- /dev/null +++ b/test/layers/test_rebuild_padding.py @@ -0,0 +1,212 @@ +import time +import unittest +from typing import Tuple + +import numpy as np +import paddle + + +class TestCuSeqlensQPerformance(unittest.TestCase): + + def setUp(self): + paddle.device.set_device("gpu:0") + + # Test configurations:(batch_size, max_seq_len, dim_embed, avg_seq_len_ratio) + self.test_configs = [ + # Small scale tests + (4, 512, 2048, 0.8), + (8, 512, 4096, 0.7), + # Medium scale tests + (16, 1024, 4096, 0.6), + (32, 1024, 4096, 0.8), + # Large scale tests + (64, 2048, 4096, 0.5), + (128, 1024, 8192, 0.7), + (256, 512, 4096, 0.9), + (16, 4096, 4096, 0.6), + (32, 2048, 8192, 0.8), + ] + + self.warmup_runs = 10 + self.benchmark_runs = 50 + + def generate_realistic_test_data( + self, batch_size: int, max_seq_len: int, dim_embed: int, avg_ratio: float + ) -> dict: + """Generate test data closer to real-world scenarios""" + + avg_seq_len = int(max_seq_len * avg_ratio) + std_seq_len = avg_seq_len 
// 4 + + seq_lens = np.random.normal(avg_seq_len, std_seq_len, batch_size) + seq_lens = np.clip(seq_lens, max_seq_len // 10, max_seq_len).astype(np.int32) + + total_tokens = np.sum(seq_lens) + + tmp_out = paddle.randn([total_tokens, dim_embed], dtype=paddle.float16) + tmp_out = tmp_out.cuda() + + cu_seqlens_q_np = np.zeros(batch_size + 1, dtype=np.int32) + for i in range(batch_size): + cu_seqlens_q_np[i + 1] = cu_seqlens_q_np[i] + seq_lens[i] + + cu_seqlens_q = paddle.to_tensor(cu_seqlens_q_np, dtype=paddle.int32).cuda() + + seq_len_this_time = paddle.to_tensor(seq_lens, dtype=paddle.int32).cuda() + seq_len_decoder = paddle.to_tensor(seq_lens, dtype=paddle.int32).cuda() + seq_len_encoder = paddle.zeros([batch_size], dtype=paddle.int32).cuda() + + return { + "tmp_out": tmp_out, + "cu_seqlens_q": cu_seqlens_q, + "seq_len_this_time": seq_len_this_time, + "seq_len_decoder": seq_len_decoder, + "seq_len_encoder": seq_len_encoder, + "max_input_length": max_seq_len, + "actual_tokens": total_tokens, + "seq_lens": seq_lens, + } + + def benchmark_cu_seqlens_performance(self, data_dict: dict) -> Tuple[float, float, paddle.Tensor]: + """Test performance of cu_seqlens_q version""" + + def rebuild_padding_cu_seqlens( + tmp_out, cu_seqlens_q, seq_len_this_time, seq_len_decoder, seq_len_encoder, max_input_length + ): + + from fastdeploy.model_executor.pre_and_post_process import rebuild_padding + + hidden_states = rebuild_padding( + tmp_out, cu_seqlens_q, seq_len_this_time, seq_len_decoder, seq_len_encoder, None, max_input_length + ) + return hidden_states + + for _ in range(self.warmup_runs): + result = rebuild_padding_cu_seqlens( + data_dict["tmp_out"], + data_dict["cu_seqlens_q"], + data_dict["seq_len_this_time"], + data_dict["seq_len_decoder"], + data_dict["seq_len_encoder"], + data_dict["max_input_length"], + ) + paddle.device.cuda.synchronize() + + paddle.device.cuda.synchronize() + start_time = time.perf_counter() + + for _ in range(self.benchmark_runs): + result = rebuild_padding_cu_seqlens( + data_dict["tmp_out"], + data_dict["cu_seqlens_q"], + data_dict["seq_len_this_time"], + data_dict["seq_len_decoder"], + data_dict["seq_len_encoder"], + data_dict["max_input_length"], + ) + + paddle.device.cuda.synchronize() + end_time = time.perf_counter() + + avg_time = (end_time - start_time) / self.benchmark_runs * 1000 # ms + + # throughput(tokens/ms) + throughput = data_dict["actual_tokens"] / avg_time + + return avg_time, throughput, result + + def test_performance_scaling(self): + """Test performance unfer different scales""" + print("\n" + "=" * 90) + print("CU_SEQLENS_Q Performance Scaling Test") + print("=" * 90) + print( + f"{'Config':<20} {'Batch':<6} {'SeqLen':<7} {'Tokens':<8} {'Time(ms)':<10} {'Throughput':<12} {'Memory(MB)'}" + ) + print("-" * 90) + + results = [] + + for i, (batch_size, max_seq_len, dim_embed, avg_ratio) in enumerate(self.test_configs): + config_name = f"Config_{i+1}" + + try: + data_dict = self.generate_realistic_test_data(batch_size, max_seq_len, dim_embed, avg_ratio) + + paddle.device.cuda.empty_cache() + mem_before = paddle.device.cuda.memory_allocated() / 1024 / 1024 # MB + + avg_time, throughput, result = self.benchmark_cu_seqlens_performance(data_dict) + + mem_after = paddle.device.cuda.memory_allocated() / 1024 / 1024 # MB + mem_usage = mem_after - mem_before + + results.append( + { + "config": config_name, + "batch_size": batch_size, + "max_seq_len": max_seq_len, + "dim_embed": dim_embed, + "actual_tokens": data_dict["actual_tokens"], + "avg_time": avg_time, + 
"throughput": throughput, + "memory_mb": mem_usage, + "result_shape": result.shape, + } + ) + + print( + f"{config_name:<20} {batch_size:<6} {max_seq_len:<7} " + f"{data_dict['actual_tokens']:<8} {avg_time:<10.3f} " + f"{throughput:<12.1f} {mem_usage:<8.1f}" + ) + + expected_shape = [batch_size, dim_embed] + self.assertEqual(list(result.shape), expected_shape, f"Output shape mismatch for {config_name}") + + except Exception as e: + print( + f"{config_name:<20} {'ERROR':<6} {'ERROR':<7} {'ERROR':<8} " + f"{'ERROR':<10} {'ERROR':<12} {'ERROR':<8} - {str(e)[:30]}..." + ) + + print("-" * 90) + return results + + +def main(): + """Run all performance tests""" + print("Starting CU_SEQLENS_Q Performance Benchmark...") + print(f"GPU: {paddle.device.cuda.get_device_name()}") + print(f"GPU Memory: {paddle.device.cuda.get_device_properties().total_memory / 1024**3:.1f} GB") + + test_instance = TestCuSeqlensQPerformance() + test_instance.setUp() + + try: + scaling_results = test_instance.test_performance_scaling() + + print("\n" + "=" * 50) + print("Performance Summary") + print("=" * 50) + + if scaling_results: + best_throughput = max(scaling_results, key=lambda x: x["throughput"]) + print(f"Best throughput: {best_throughput['throughput']:.1f} tokens/ms") + print( + f" Config: {best_throughput['config']} " + f"(batch={best_throughput['batch_size']}, " + f"seq_len={best_throughput['max_seq_len']})" + ) + + print("=" * 50) + + except Exception as e: + print(f"Test failed with error: {e}") + import traceback + + traceback.print_exc() + + +if __name__ == "__main__": + main() From d62c1645eb6653a014c16e163d2d98ed79f138ea Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 16:23:34 +0800 Subject: [PATCH 04/22] fix --- test/layers/test_rebuild_padding.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/layers/test_rebuild_padding.py b/test/layers/test_rebuild_padding.py index af3b670e34..2860959e15 100644 --- a/test/layers/test_rebuild_padding.py +++ b/test/layers/test_rebuild_padding.py @@ -23,8 +23,6 @@ def setUp(self): (64, 2048, 4096, 0.5), (128, 1024, 8192, 0.7), (256, 512, 4096, 0.9), - (16, 4096, 4096, 0.6), - (32, 2048, 8192, 0.8), ] self.warmup_runs = 10 From dcb809d77895126ef8d6bd2fb1a83ebdf90761af Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 22:31:45 +0800 Subject: [PATCH 05/22] fix test_rebuild_padding.py --- test/layers/test_rebuild_padding.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/layers/test_rebuild_padding.py b/test/layers/test_rebuild_padding.py index 2860959e15..6dbbe27c5e 100644 --- a/test/layers/test_rebuild_padding.py +++ b/test/layers/test_rebuild_padding.py @@ -1,3 +1,19 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + import time import unittest from typing import Tuple From 8d6e60457d4765a1f5c2ba5c3b715d1d6c6e497c Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 22:38:27 +0800 Subject: [PATCH 06/22] Remove the Chinese comments and translate this sentence into English. --- custom_ops/cpu_ops/rebuild_padding.cc | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 5f8e033f56..cd77cbc840 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -93,14 +93,14 @@ void RebuildAppendPaddingCPUImpl(T *output_data, std::vector RebuildPaddingCPU( const paddle::Tensor &tmp_out, - const paddle::Tensor &cu_seqlens_q, // 改名:从cum_offsets改为cu_seqlens_q + const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, const paddle::optional &output_padding_offset, int max_input_length) { auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true); - auto cu_seqlens_q_cpu = cu_seqlens_q.copy_to(paddle::CPUPlace(), true); // 改名 + auto cu_seqlens_q_cpu = cu_seqlens_q.copy_to(paddle::CPUPlace(), true); auto seq_len_this_time_cpu = seq_len_this_time.copy_to(paddle::CPUPlace(), true); auto seq_lens_decoder_cpu = @@ -115,7 +115,7 @@ std::vector RebuildPaddingCPU( int token_num = tmp_out_cpu.shape()[0]; int dim_embed = tmp_out_cpu.shape()[1]; - int bsz = cu_seqlens_q_cpu.shape()[0] - 1; // 改为bsz+1的长度,所以bsz = shape[0] - 1 + int bsz = cu_seqlens_q_cpu.shape()[0] - 1; paddle::Tensor out; if (output_padding_offset_cpu) { @@ -136,7 +136,7 @@ std::vector RebuildPaddingCPU( {bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace()); } - const int *cu_seqlens_q_data = cu_seqlens_q_cpu.data(); // 改名 + const int *cu_seqlens_q_data = cu_seqlens_q_cpu.data(); const int *seq_len_this_time_data = seq_len_this_time_cpu.data(); const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data(); const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data(); @@ -149,7 +149,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildAppendPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -162,7 +162,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -175,7 +175,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -194,7 +194,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -206,7 +206,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -218,7 +218,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cu_seqlens_q_data, // 改名 + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, 
seq_lens_encoder_data, @@ -237,7 +237,7 @@ std::vector RebuildPaddingCPU( std::vector> RebuildPaddingInferShape( const std::vector &tmp_out_shape, - const std::vector &cu_seqlens_q_shape, // 改名:从cum_offsets_shape改为cu_seqlens_q_shape + const std::vector &cu_seqlens_q_shape, const std::vector &seq_len_this_time_shape, const std::vector &seq_lens_decoder_shape, const std::vector &seq_lens_encoder_shape, @@ -246,14 +246,14 @@ std::vector> RebuildPaddingInferShape( if (output_padding_offset_shape) { return {{-1, dim_embed}}; } else { - int64_t bsz = cu_seqlens_q_shape[0] - 1; // 改为bsz+1的长度,所以bsz = shape[0] - 1 + int64_t bsz = cu_seqlens_q_shape[0] - 1; return {{bsz, dim_embed}}; } } std::vector RebuildPaddingInferDtype( const paddle::DataType &tmp_out_dtype, - const paddle::DataType &cu_seqlens_q_dtype, // 改名:从cum_offsets_dtype改为cu_seqlens_q_dtype + const paddle::DataType &cu_seqlens_q_dtype, const paddle::DataType &seq_len_this_time_dtype, const paddle::DataType &seq_lens_decoder_dtype, const paddle::DataType &seq_lens_encoder_dtype, @@ -263,7 +263,7 @@ std::vector RebuildPaddingInferDtype( PD_BUILD_STATIC_OP(rebuild_padding_cpu) .Inputs({"tmp_out", - "cu_seqlens_q", // 改名:从cum_offsets改为cu_seqlens_q + "cu_seqlens_q", "seq_len_this_time", "seq_lens_decoder", "seq_lens_encoder", From 9407c174bc2a0b760c08f077ca66ade4fb637bf1 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 23:03:01 +0800 Subject: [PATCH 07/22] fix comments: --- custom_ops/cpu_ops/rebuild_padding.cc | 24 ++++++++++++------------ custom_ops/gpu_ops/rebuild_padding.cu | 2 +- test/layers/test_rebuild_padding.py | 10 +++------- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index cd77cbc840..cc7f7c779c 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -65,27 +65,27 @@ void RebuildAppendPaddingCPUImpl(T *output_data, const int dim_embed, const int64_t output_elem_nums) { for (int i = 0; i < output_elem_nums; ++i) { - const int out_token_id = i / dim_embed; - const int ori_token_id = + int out_token_id = i / dim_embed; + int ori_token_id = out_token_id + output_padding_offset_data[out_token_id]; - const int bi = ori_token_id / max_input_length; + int bi = ori_token_id / max_input_length; int seq_id = 0; - if (seq_len_this_time_data[bi] == 0) { - continue; - } - if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) { - continue; - } + if (seq_len_this_time_data[bi] == 0 || + (seq_lens_decoder_data[bi] == 0 && + seq_lens_encoder_data[bi] == 0)) { + continue; + } + // if encoder, get last token; just decoder, get first token. 
if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - const int input_token_id = cu_seqlens_q_data[bi] + seq_id; - const int bias_idx = i % dim_embed; - const int src_offset = input_token_id * dim_embed + bias_idx; + int input_token_id = cu_seqlens_q_data[bi] + seq_id; + int bias_idx = i % dim_embed; + int src_offset = input_token_id * dim_embed + bias_idx; output_data[i] = input_data[src_offset]; } diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 38a1fc551c..7632136667 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -161,7 +161,7 @@ std::vector rebuild_padding( paddle::Tensor RebuildPaddingFunc( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cu_seqlens_q, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, diff --git a/test/layers/test_rebuild_padding.py b/test/layers/test_rebuild_padding.py index 6dbbe27c5e..a66bde915b 100644 --- a/test/layers/test_rebuild_padding.py +++ b/test/layers/test_rebuild_padding.py @@ -179,9 +179,8 @@ def test_performance_scaling(self): self.assertEqual(list(result.shape), expected_shape, f"Output shape mismatch for {config_name}") except Exception as e: - print( - f"{config_name:<20} {'ERROR':<6} {'ERROR':<7} {'ERROR':<8} " - f"{'ERROR':<10} {'ERROR':<12} {'ERROR':<8} - {str(e)[:30]}..." + raise RuntimeError( + f"Failed to test configuration {config_name} (batch={batch_size}, seq_len={max_seq_len}): {str(e)}" ) print("-" * 90) @@ -216,10 +215,7 @@ def main(): print("=" * 50) except Exception as e: - print(f"Test failed with error: {e}") - import traceback - - traceback.print_exc() + raise RuntimeError(f"Performance benchmark failed: {str(e)}") if __name__ == "__main__": From 97a73a771ced47b992da5b9256c4f3c7b10aa53f Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 23:04:45 +0800 Subject: [PATCH 08/22] fix rebuild_padding.cc --- custom_ops/cpu_ops/rebuild_padding.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index cc7f7c779c..3425072089 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -69,20 +69,16 @@ void RebuildAppendPaddingCPUImpl(T *output_data, int ori_token_id = out_token_id + output_padding_offset_data[out_token_id]; int bi = ori_token_id / max_input_length; - int seq_id = 0; - - if (seq_len_this_time_data[bi] == 0 || + if (seq_len_this_time_data[bi] == 0 || (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0)) { continue; } - - + int seq_id = 0; // if encoder, get last token; just decoder, get first token. 
if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - int input_token_id = cu_seqlens_q_data[bi] + seq_id; int bias_idx = i % dim_embed; int src_offset = input_token_id * dim_embed + bias_idx; From 556624481b8e73704746c22f5867572fd7297570 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 23:05:29 +0800 Subject: [PATCH 09/22] fix rebuild_padding.cc --- custom_ops/cpu_ops/rebuild_padding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 3425072089..19cce80500 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -75,7 +75,7 @@ void RebuildAppendPaddingCPUImpl(T *output_data, continue; } int seq_id = 0; - // if encoder, get last token; just decoder, get first token. + if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } From 767d1d133f46f1233cac4998ce7e252557dd90c6 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 4 Aug 2025 23:06:20 +0800 Subject: [PATCH 10/22] fix comments --- custom_ops/cpu_ops/rebuild_padding.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 19cce80500..adbf95e5fd 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -41,7 +41,6 @@ void RebuildPaddingCPUImpl(T *output_data, continue; } - // if encoder, get last token; just decoder, get first token. if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } From ed2da59232aeb7ad108574381b10352f83b9f07a Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 5 Aug 2025 11:21:04 +0800 Subject: [PATCH 11/22] Adapt to MTP. 
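
Note: this commit (and the series as a whole) relies on a simple identity between
the old and new offsets. cum_offsets[bi] accumulates the padding stripped before
request bi, while cu_seqlens_q[bi] accumulates the real tokens kept before it, so
bi * max_input_length - cum_offsets[bi] equals cu_seqlens_q[bi], which is why the
kernels can index with cu_seqlens_q[bi] + seq_id directly. A minimal sketch of that
equivalence follows; the concrete seq_lens and max_input_length values are made up
for illustration and are not taken from the diffs:

    # Hypothetical per-request lengths and padded length, for illustration only.
    seq_lens = [3, 1, 5]
    max_input_length = 8

    # Exclusive prefix sums: cum_offsets over removed padding tokens,
    # cu_seqlens_q over kept (real) tokens.
    cum_offsets = [0]
    cu_seqlens_q = [0]
    for n in seq_lens:
        cum_offsets.append(cum_offsets[-1] + (max_input_length - n))
        cu_seqlens_q.append(cu_seqlens_q[-1] + n)

    for bi in range(len(seq_lens)):
        old_start = bi * max_input_length - cum_offsets[bi]  # indexing before this series
        new_start = cu_seqlens_q[bi]                         # indexing after this series
        assert old_start == new_start
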
--- fastdeploy/spec_decode/mtp.py | 5 +---- fastdeploy/worker/gpu_model_runner.py | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index 39f0fce427..9d74b83050 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -272,7 +272,6 @@ def _init_model_inputs(self): self.model_inputs["not_need_stop"] = paddle.to_tensor([False], dtype="bool", place="cpu") self.model_inputs["pre_ids"] = paddle.clone(self.main_model_inputs["pre_ids"]) self.model_inputs["ids_remove_padding"] = paddle.clone(self.main_model_inputs["ids_remove_padding"]) - self.model_inputs["cum_offsets"] = paddle.clone(self.main_model_inputs["cum_offsets"]) self.model_inputs["batch_id_per_token"] = paddle.clone(self.main_model_inputs["batch_id_per_token"]) self.model_inputs["cu_seqlens_q"] = paddle.clone(self.main_model_inputs["cu_seqlens_q"]) self.model_inputs["cu_seqlens_k"] = paddle.clone(self.main_model_inputs["cu_seqlens_k"]) @@ -524,7 +523,6 @@ def _propose(self, target_hidden_states): # Remove padding ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, @@ -540,7 +538,6 @@ def _propose(self, target_hidden_states): ) # Initialize forward meta data self.model_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.model_inputs["cum_offsets"].copy_(cum_offsets, False) self.model_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.model_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.model_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -575,7 +572,7 @@ def _propose(self, target_hidden_states): hidden_states = rebuild_padding( model_output, - self.model_inputs["cum_offsets"], + self.model_inputs["cu_seqlens_q"], self.model_inputs["seq_lens_this_time"], self.model_inputs["seq_lens_decoder"], self.model_inputs["seq_lens_encoder"], diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 416d5cf680..49e0f417d9 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1383,6 +1383,7 @@ class at the server level, which is too granular for ModelRunner. # 7. Updata 'infer_seed' and step_cuda() self.share_inputs["infer_seed"].add_(self.infer_seed_increment) self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED + if not envs.ENABLE_V1_KVCACHE_SCHEDULER: step_cuda( self.share_inputs, From 43fc43e03cbf9aca67cff7f299ea34847a06d20c Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 6 Aug 2025 20:20:31 +0800 Subject: [PATCH 12/22] fix mtp RebuildAppendPaddingKernel --- custom_ops/gpu_ops/rebuild_padding.cu | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 7632136667..a1d0b897dc 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -34,7 +34,6 @@ __global__ void RebuildPaddingKernel(T *output_data, int seq_id = 0; if (seq_len_this_time[bi] == 0) continue; if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; - // if encoder, get last token; just decoder, get first token. 
if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; const int ori_token_idx = @@ -55,22 +54,25 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, const int *output_padding_offset, const int max_input_length, const int dim_embed, - const int64_t output_elem_nums) { + const int64_t output_elem_nums, + const int bsz) { AlignedVector src_vec; const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; for (int64_t i = global_idx * VecSize; i < output_elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int out_token_id = i / dim_embed; - const int ori_token_id = - out_token_id + output_padding_offset[out_token_id]; + const int ori_token_id = out_token_id + output_padding_offset[out_token_id]; + const int bi = ori_token_id / max_input_length; + int seq_id = 0; if (seq_len_this_time[bi] == 0) continue; if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; - // if encoder, get last token; just decoder, get first token. + if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; - const int input_token_id = cu_seqlens_q[bi]+ seq_id; + const int cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi]; + const int input_token_id = ori_token_id - cum_offset_bi + seq_id; const int bias_idx = i % dim_embed; Load(&input_data[input_token_id * dim_embed + bias_idx], @@ -79,6 +81,8 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, } } + + template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] @@ -141,7 +145,8 @@ std::vector rebuild_padding( output_padding_offset.get_ptr()->data(), max_input_length, dim_embed, - elem_nums); + elem_nums, + bsz); } else { RebuildPaddingKernel <<>>( From 569f7beaeb96fd274e8d63b1f510878a7d161c3d Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Sun, 10 Aug 2025 17:31:35 +0800 Subject: [PATCH 13/22] gaoxingneng --- custom_ops/gpu_ops/rebuild_padding.cu | 44 +++++++++++++++++++-------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index a1d0b897dc..ce3b73ed52 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -14,6 +14,8 @@ #include "helper.h" // NOLINT +#define BLOCK_SIZE 128 + template __global__ void RebuildPaddingKernel(T *output_data, const T *input_data, @@ -27,17 +29,26 @@ __global__ void RebuildPaddingKernel(T *output_data, using LoadT = AlignedVector; LoadT src_vec; const int global_idx = blockDim.x * blockIdx.x + threadIdx.x; + + // Precompute the shared memory offset for the sequence lengths + __shared__ int shared_cu_seqlens_q[BLOCK_SIZE]; + + if (threadIdx.x < BLOCK_SIZE && threadIdx.x < blockDim.x) { + shared_cu_seqlens_q[threadIdx.x] = cu_seqlens_q[threadIdx.x]; + } + __syncthreads(); + for (int i = global_idx * VecSize; i < elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int bi = i / dim_embed; const int bias_idx = i % dim_embed; - int seq_id = 0; - if (seq_len_this_time[bi] == 0) continue; - if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; - if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; - const int ori_token_idx = - cu_seqlens_q[bi] + seq_id; + // Check if the sequence length is valid + if (seq_len_this_time[bi] == 0 || (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0)) continue; + + int seq_id = seq_len_encoder[bi] > 0 ? 
seq_len_encoder[bi] - 1 : 0; + + const int ori_token_idx = shared_cu_seqlens_q[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; Load(&input_data[src_offset], &src_vec); Store(src_vec, &output_data[i]); @@ -58,6 +69,14 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, const int bsz) { AlignedVector src_vec; const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; + + // Precompute shared memory for cu_seqlens_q + __shared__ int shared_cu_seqlens_q[BLOCK_SIZE]; + if (threadIdx.x < BLOCK_SIZE && threadIdx.x < blockDim.x) { + shared_cu_seqlens_q[threadIdx.x] = cu_seqlens_q[threadIdx.x]; + } + __syncthreads(); + for (int64_t i = global_idx * VecSize; i < output_elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int out_token_id = i / dim_embed; @@ -65,24 +84,23 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, const int bi = ori_token_id / max_input_length; - int seq_id = 0; - if (seq_len_this_time[bi] == 0) continue; - if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; + // Skip the invalid sequences + if (seq_len_this_time[bi] == 0 || (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0)) continue; - if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; + int seq_id = seq_len_encoder[bi] > 0 ? seq_len_encoder[bi] - 1 : 0; - const int cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi]; + const int cum_offset_bi = bi * max_input_length - shared_cu_seqlens_q[bi]; const int input_token_id = ori_token_id - cum_offset_bi + seq_id; const int bias_idx = i % dim_embed; - Load(&input_data[input_token_id * dim_embed + bias_idx], - &src_vec); + Load(&input_data[input_token_id * dim_embed + bias_idx], &src_vec); Store(src_vec, &output_data[i]); } } + template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] From c6cd8f78fb87d9bf6aa4240e30c8666beb9a623d Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Sun, 10 Aug 2025 17:34:33 +0800 Subject: [PATCH 14/22] pre-commit --- fastdeploy/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index ebaf94a8e9..bf2611aa8b 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -941,7 +941,6 @@ def initialize_kv_cache(self, profile: bool = False) -> None: cache_kvs_list.append(value_cache) self.share_inputs["caches"] = cache_kvs_list - else: for i in range(self.model_config.num_hidden_layers): cache_kvs[f"key_caches_{i}"] = paddle.full( From 2ccb4a7021d7cbfbaa1574459770b8c63c763c95 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 11 Aug 2025 14:08:43 +0800 Subject: [PATCH 15/22] merge develop --- custom_ops/gpu_ops/rebuild_padding.cu | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index ce3b73ed52..5fcf7e740a 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -29,26 +29,17 @@ __global__ void RebuildPaddingKernel(T *output_data, using LoadT = AlignedVector; LoadT src_vec; const int global_idx = blockDim.x * blockIdx.x + threadIdx.x; - - // Precompute the shared memory offset for the sequence lengths - __shared__ int shared_cu_seqlens_q[BLOCK_SIZE]; - - if (threadIdx.x < BLOCK_SIZE && threadIdx.x < blockDim.x) { - shared_cu_seqlens_q[threadIdx.x] = cu_seqlens_q[threadIdx.x]; - } - 
__syncthreads(); - for (int i = global_idx * VecSize; i < elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int bi = i / dim_embed; const int bias_idx = i % dim_embed; + int seq_id = 0; + if (seq_len_this_time[bi] == 0) continue; + if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; + if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; - // Check if the sequence length is valid - if (seq_len_this_time[bi] == 0 || (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0)) continue; - - int seq_id = seq_len_encoder[bi] > 0 ? seq_len_encoder[bi] - 1 : 0; - - const int ori_token_idx = shared_cu_seqlens_q[bi] + seq_id; + const int ori_token_idx = + cu_seqlens_q[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; Load(&input_data[src_offset], &src_vec); Store(src_vec, &output_data[i]); @@ -98,9 +89,6 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, } } - - - template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] From 586f8dfa87d5c6d3c1ad58c0ed00f89a82f3e454 Mon Sep 17 00:00:00 2001 From: lizexu123 <2694294196@qq.com> Date: Tue, 12 Aug 2025 11:48:32 +0000 Subject: [PATCH 16/22] fix mtp --- custom_ops/gpu_ops/rebuild_padding.cu | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 5fcf7e740a..74da1d89fc 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -60,14 +60,6 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, const int bsz) { AlignedVector src_vec; const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; - - // Precompute shared memory for cu_seqlens_q - __shared__ int shared_cu_seqlens_q[BLOCK_SIZE]; - if (threadIdx.x < BLOCK_SIZE && threadIdx.x < blockDim.x) { - shared_cu_seqlens_q[threadIdx.x] = cu_seqlens_q[threadIdx.x]; - } - __syncthreads(); - for (int64_t i = global_idx * VecSize; i < output_elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int out_token_id = i / dim_embed; @@ -75,20 +67,22 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, const int bi = ori_token_id / max_input_length; - // Skip the invalid sequences - if (seq_len_this_time[bi] == 0 || (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0)) continue; - - int seq_id = seq_len_encoder[bi] > 0 ? 
seq_len_encoder[bi] - 1 : 0; + int seq_id = 0; + if (seq_len_this_time[bi] == 0) continue; + if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; - const int cum_offset_bi = bi * max_input_length - shared_cu_seqlens_q[bi]; + if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; + const int cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi]; const int input_token_id = ori_token_id - cum_offset_bi + seq_id; const int bias_idx = i % dim_embed; - Load(&input_data[input_token_id * dim_embed + bias_idx], &src_vec); + Load(&input_data[input_token_id * dim_embed + bias_idx], + &src_vec); Store(src_vec, &output_data[i]); } } + template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] From 1ccf9f11cf96fcd305103a1ef71aa88da22eef6c Mon Sep 17 00:00:00 2001 From: lizexu123 <2694294196@qq.com> Date: Tue, 12 Aug 2025 11:50:51 +0000 Subject: [PATCH 17/22] delete BLCOK_SIZE --- custom_ops/gpu_ops/rebuild_padding.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 74da1d89fc..772fefa1ac 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -14,7 +14,6 @@ #include "helper.h" // NOLINT -#define BLOCK_SIZE 128 template __global__ void RebuildPaddingKernel(T *output_data, From 6a9f3eec81f6ef726ada6ccf365603b363c3c75b Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Thu, 14 Aug 2025 19:35:07 +0800 Subject: [PATCH 18/22] fix --- test/layers/test_rebuild_padding.py | 222 ---------------------------- 1 file changed, 222 deletions(-) delete mode 100644 test/layers/test_rebuild_padding.py diff --git a/test/layers/test_rebuild_padding.py b/test/layers/test_rebuild_padding.py deleted file mode 100644 index a66bde915b..0000000000 --- a/test/layers/test_rebuild_padding.py +++ /dev/null @@ -1,222 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" - -import time -import unittest -from typing import Tuple - -import numpy as np -import paddle - - -class TestCuSeqlensQPerformance(unittest.TestCase): - - def setUp(self): - paddle.device.set_device("gpu:0") - - # Test configurations:(batch_size, max_seq_len, dim_embed, avg_seq_len_ratio) - self.test_configs = [ - # Small scale tests - (4, 512, 2048, 0.8), - (8, 512, 4096, 0.7), - # Medium scale tests - (16, 1024, 4096, 0.6), - (32, 1024, 4096, 0.8), - # Large scale tests - (64, 2048, 4096, 0.5), - (128, 1024, 8192, 0.7), - (256, 512, 4096, 0.9), - ] - - self.warmup_runs = 10 - self.benchmark_runs = 50 - - def generate_realistic_test_data( - self, batch_size: int, max_seq_len: int, dim_embed: int, avg_ratio: float - ) -> dict: - """Generate test data closer to real-world scenarios""" - - avg_seq_len = int(max_seq_len * avg_ratio) - std_seq_len = avg_seq_len // 4 - - seq_lens = np.random.normal(avg_seq_len, std_seq_len, batch_size) - seq_lens = np.clip(seq_lens, max_seq_len // 10, max_seq_len).astype(np.int32) - - total_tokens = np.sum(seq_lens) - - tmp_out = paddle.randn([total_tokens, dim_embed], dtype=paddle.float16) - tmp_out = tmp_out.cuda() - - cu_seqlens_q_np = np.zeros(batch_size + 1, dtype=np.int32) - for i in range(batch_size): - cu_seqlens_q_np[i + 1] = cu_seqlens_q_np[i] + seq_lens[i] - - cu_seqlens_q = paddle.to_tensor(cu_seqlens_q_np, dtype=paddle.int32).cuda() - - seq_len_this_time = paddle.to_tensor(seq_lens, dtype=paddle.int32).cuda() - seq_len_decoder = paddle.to_tensor(seq_lens, dtype=paddle.int32).cuda() - seq_len_encoder = paddle.zeros([batch_size], dtype=paddle.int32).cuda() - - return { - "tmp_out": tmp_out, - "cu_seqlens_q": cu_seqlens_q, - "seq_len_this_time": seq_len_this_time, - "seq_len_decoder": seq_len_decoder, - "seq_len_encoder": seq_len_encoder, - "max_input_length": max_seq_len, - "actual_tokens": total_tokens, - "seq_lens": seq_lens, - } - - def benchmark_cu_seqlens_performance(self, data_dict: dict) -> Tuple[float, float, paddle.Tensor]: - """Test performance of cu_seqlens_q version""" - - def rebuild_padding_cu_seqlens( - tmp_out, cu_seqlens_q, seq_len_this_time, seq_len_decoder, seq_len_encoder, max_input_length - ): - - from fastdeploy.model_executor.pre_and_post_process import rebuild_padding - - hidden_states = rebuild_padding( - tmp_out, cu_seqlens_q, seq_len_this_time, seq_len_decoder, seq_len_encoder, None, max_input_length - ) - return hidden_states - - for _ in range(self.warmup_runs): - result = rebuild_padding_cu_seqlens( - data_dict["tmp_out"], - data_dict["cu_seqlens_q"], - data_dict["seq_len_this_time"], - data_dict["seq_len_decoder"], - data_dict["seq_len_encoder"], - data_dict["max_input_length"], - ) - paddle.device.cuda.synchronize() - - paddle.device.cuda.synchronize() - start_time = time.perf_counter() - - for _ in range(self.benchmark_runs): - result = rebuild_padding_cu_seqlens( - data_dict["tmp_out"], - data_dict["cu_seqlens_q"], - data_dict["seq_len_this_time"], - data_dict["seq_len_decoder"], - data_dict["seq_len_encoder"], - data_dict["max_input_length"], - ) - - paddle.device.cuda.synchronize() - end_time = time.perf_counter() - - avg_time = (end_time - start_time) / self.benchmark_runs * 1000 # ms - - # throughput(tokens/ms) - throughput = data_dict["actual_tokens"] / avg_time - - return avg_time, throughput, result - - def test_performance_scaling(self): - """Test performance unfer different scales""" - print("\n" + "=" * 90) - print("CU_SEQLENS_Q Performance Scaling Test") - print("=" * 90) - print( - f"{'Config':<20} 
{'Batch':<6} {'SeqLen':<7} {'Tokens':<8} {'Time(ms)':<10} {'Throughput':<12} {'Memory(MB)'}" - ) - print("-" * 90) - - results = [] - - for i, (batch_size, max_seq_len, dim_embed, avg_ratio) in enumerate(self.test_configs): - config_name = f"Config_{i+1}" - - try: - data_dict = self.generate_realistic_test_data(batch_size, max_seq_len, dim_embed, avg_ratio) - - paddle.device.cuda.empty_cache() - mem_before = paddle.device.cuda.memory_allocated() / 1024 / 1024 # MB - - avg_time, throughput, result = self.benchmark_cu_seqlens_performance(data_dict) - - mem_after = paddle.device.cuda.memory_allocated() / 1024 / 1024 # MB - mem_usage = mem_after - mem_before - - results.append( - { - "config": config_name, - "batch_size": batch_size, - "max_seq_len": max_seq_len, - "dim_embed": dim_embed, - "actual_tokens": data_dict["actual_tokens"], - "avg_time": avg_time, - "throughput": throughput, - "memory_mb": mem_usage, - "result_shape": result.shape, - } - ) - - print( - f"{config_name:<20} {batch_size:<6} {max_seq_len:<7} " - f"{data_dict['actual_tokens']:<8} {avg_time:<10.3f} " - f"{throughput:<12.1f} {mem_usage:<8.1f}" - ) - - expected_shape = [batch_size, dim_embed] - self.assertEqual(list(result.shape), expected_shape, f"Output shape mismatch for {config_name}") - - except Exception as e: - raise RuntimeError( - f"Failed to test configuration {config_name} (batch={batch_size}, seq_len={max_seq_len}): {str(e)}" - ) - - print("-" * 90) - return results - - -def main(): - """Run all performance tests""" - print("Starting CU_SEQLENS_Q Performance Benchmark...") - print(f"GPU: {paddle.device.cuda.get_device_name()}") - print(f"GPU Memory: {paddle.device.cuda.get_device_properties().total_memory / 1024**3:.1f} GB") - - test_instance = TestCuSeqlensQPerformance() - test_instance.setUp() - - try: - scaling_results = test_instance.test_performance_scaling() - - print("\n" + "=" * 50) - print("Performance Summary") - print("=" * 50) - - if scaling_results: - best_throughput = max(scaling_results, key=lambda x: x["throughput"]) - print(f"Best throughput: {best_throughput['throughput']:.1f} tokens/ms") - print( - f" Config: {best_throughput['config']} " - f"(batch={best_throughput['batch_size']}, " - f"seq_len={best_throughput['max_seq_len']})" - ) - - print("=" * 50) - - except Exception as e: - raise RuntimeError(f"Performance benchmark failed: {str(e)}") - - -if __name__ == "__main__": - main() From 9f6bf4c8dfa25b35e34ae26697ceccde809c42d0 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Thu, 14 Aug 2025 21:15:49 +0800 Subject: [PATCH 19/22] merge develop --- custom_ops/cpu_ops/rebuild_padding.cc | 1 + custom_ops/gpu_ops/rebuild_padding.cu | 1 - fastdeploy/model_executor/pre_and_post_process.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index adbf95e5fd..2dfc9f17e2 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -19,6 +19,7 @@ #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) #endif + template void RebuildPaddingCPUImpl(T *output_data, const T *input_data, diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 772fefa1ac..93c1bb38c2 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -14,7 +14,6 @@ #include "helper.h" // NOLINT - template __global__ void RebuildPaddingKernel(T *output_data, const T *input_data, diff --git 
a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index e3dfcbf3c5..24f596b185 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -545,7 +545,7 @@ def rebuild_padding( output_padding_offset, max_input_length, ) - # elif current_platform.is_gcu(): + elif current_platform.is_gcu(): from fastdeploy.model_executor.ops.gcu import rebuild_padding hidden_states = rebuild_padding( From 3baf2b5c9982b77a8c28a05337d4fd2846a0e394 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Thu, 14 Aug 2025 21:16:20 +0800 Subject: [PATCH 20/22] fix pre-commit --- fastdeploy/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 58ed5b0b79..ecc79a680e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -577,7 +577,6 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod self.share_inputs["min_dec_len"][idx : idx + 1] = max_dec_len self.share_inputs["stop_flags"][idx : idx + 1] = False self.share_inputs["temperature"][idx : idx + 1] = 1 - self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length From dcf8f56ab5a522007b42aa6abcd789e3cf7da9ac Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Thu, 14 Aug 2025 21:38:04 +0800 Subject: [PATCH 21/22] fix pre-commit --- fastdeploy/model_executor/pre_and_post_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index f81e89988e..30b87d65b1 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -596,7 +596,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, From 9593694ade8692a2197c0f2d498913775a2afb1f Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 18 Aug 2025 19:07:34 +0800 Subject: [PATCH 22/22] fix --- fastdeploy/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 1d317b71be..2dfe1021c6 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -679,7 +679,6 @@ def _init_share_inputs(self, max_num_seqs: int): 0, dtype="int64", ) - self.share_inputs["batch_id_per_token"] = paddle.full( [max_num_seqs * self.parallel_config.max_model_len, 1], 0, dtype="int32" )
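
Illustrative sketch (not part of the patches above): the recurring change in this series replaces the per-request `cum_offsets[bi]` lookup with `cu_seqlens_q[bi]`, which works because `bi * max_input_length - cum_offsets[bi] == cu_seqlens_q[bi]` when `cu_seqlens_q` is the exclusive prefix sum of the per-request token counts. The NumPy sketch below checks that identity and mirrors the gather performed by RebuildPaddingKernel on the non-speculative path. The toy `max_input_length`, the example sequence lengths, and the reconstructed definition of the removed `cum_offsets` tensor (cumulative padding before each request) are assumptions for illustration only, not code from the series.

import numpy as np

# Toy setup; values are arbitrary.
max_input_length = 8
seq_lens_this_time = np.array([3, 1, 5, 2], dtype=np.int32)
bsz = seq_lens_this_time.shape[0]

# cu_seqlens_q: exclusive prefix sum over per-request token counts, shape [bsz + 1].
cu_seqlens_q = np.zeros(bsz + 1, dtype=np.int32)
cu_seqlens_q[1:] = np.cumsum(seq_lens_this_time)

# Assumed definition of the removed cum_offsets tensor: cumulative number of
# padding slots before request bi in the padded [bsz, max_input_length] layout.
pad_lens = max_input_length - seq_lens_this_time
cum_offsets = np.zeros(bsz, dtype=np.int32)
cum_offsets[1:] = np.cumsum(pad_lens)[:-1]

for bi in range(bsz):
    # Old indexing (cum_offsets) and new indexing (cu_seqlens_q) pick the same row.
    assert bi * max_input_length - cum_offsets[bi] == cu_seqlens_q[bi]


def rebuild_padding_ref(tmp_out, cu_seqlens_q, seq_len_this_time,
                        seq_lens_decoder, seq_lens_encoder):
    """NumPy mirror of RebuildPaddingKernel's gather (non-speculative path)."""
    bsz = seq_len_this_time.shape[0]
    out = np.zeros((bsz, tmp_out.shape[1]), dtype=tmp_out.dtype)
    for bi in range(bsz):
        if seq_len_this_time[bi] == 0:
            continue
        if seq_lens_decoder[bi] == 0 and seq_lens_encoder[bi] == 0:
            continue
        # Prefill request: take the last prompt token; decode request: the first token.
        seq_id = seq_lens_encoder[bi] - 1 if seq_lens_encoder[bi] > 0 else 0
        out[bi] = tmp_out[cu_seqlens_q[bi] + seq_id]
    return out


# Tiny usage example: treat every request as a prefill request.
dim_embed = 4
tmp_out = np.random.rand(int(cu_seqlens_q[-1]), dim_embed).astype(np.float32)
hidden_states = rebuild_padding_ref(
    tmp_out,
    cu_seqlens_q,
    seq_lens_this_time,
    np.zeros(bsz, dtype=np.int32),   # seq_lens_decoder
    seq_lens_this_time.copy(),       # seq_lens_encoder
)
assert hidden_states.shape == (bsz, dim_embed)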
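
The speculative-decoding path (RebuildAppendPaddingKernel, used by the MTP proposer) gathers one row per output token via output_padding_offset instead of one row per request; the same substitution applies there through the expression cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi] used in the patch. Below is a hedged NumPy mirror of that index math only; output_padding_offset is treated as an opaque per-output-token mapping supplied by the caller, and the function name is for illustration, not an API from the series.

import numpy as np


def rebuild_append_padding_ref(tmp_out, cu_seqlens_q, seq_len_this_time,
                               seq_lens_decoder, seq_lens_encoder,
                               output_padding_offset, max_input_length):
    """NumPy mirror of RebuildAppendPaddingKernel's index math.

    output_padding_offset[out_token_id] is assumed to map a compacted output
    token id back to its position in the padded layout.
    """
    output_token_num = output_padding_offset.shape[0]
    out = np.zeros((output_token_num, tmp_out.shape[1]), dtype=tmp_out.dtype)
    for out_token_id in range(output_token_num):
        ori_token_id = out_token_id + int(output_padding_offset[out_token_id])
        bi = ori_token_id // max_input_length
        if seq_len_this_time[bi] == 0:
            continue
        if seq_lens_decoder[bi] == 0 and seq_lens_encoder[bi] == 0:
            continue
        seq_id = seq_lens_encoder[bi] - 1 if seq_lens_encoder[bi] > 0 else 0
        # cum_offset_bi is exactly the expression the patch substitutes for the
        # removed cum_offsets[bi].
        cum_offset_bi = bi * max_input_length - int(cu_seqlens_q[bi])
        input_token_id = ori_token_id - cum_offset_bi + seq_id
        out[out_token_id] = tmp_out[input_token_id]
    return out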