
Commit 329e0d1

Merge branch 'PaddlePaddle:develop' into develop
2 parents b1dda49 + ab60292

23 files changed, +644 −89 lines changed

.github/workflows/_unit_test_coverage.yml

Lines changed: 2 additions & 0 deletions

@@ -205,6 +205,7 @@ jobs:
       - name: Check Unit Test Success
         shell: bash
         run: |
+          cd FastDeploy
           if [ "$TEST_EXIT_CODE" -eq 8 ]; then
             filename=$(basename "$unittest_failed_url")
             if [ -z "${unittest_failed_url}" ]; then
@@ -225,6 +226,7 @@ jobs:
       - name: Verify Code Coverage Threshold (80%)
         shell: bash
         run: |
+          cd FastDeploy
           if [ "$COVERAGE_EXIT_CODE" -eq 9 ]; then
             echo "Coverage generation failed (exit code 9)"
             filename=$(basename "$diff_cov_result_json_url")

README.md

Lines changed: 4 additions & 4 deletions

@@ -75,12 +75,12 @@ Learn how to use FastDeploy through our documentation:
 
 | Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
 |:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
-|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 ||||✅(WINT4)| WIP |128K |
-|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 ||||✅(WINT4)| WIP | 128K |
+|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 ||||| WIP |128K |
+|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 ||||| WIP | 128K |
 |ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP || WIP || WIP |128K |
 |ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 ||| WIP || WIP |128K |
-|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 |||| WIP ||128K |
-|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 |||| WIP ||128K |
+|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 |||| ||128K |
+|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 |||| ||128K |
 |ERNIE-4.5-0.3B | BF16/WINT8/FP8 |||||| 128K |
 
 ## Advanced Usage

README_CN.md

Lines changed: 5 additions & 4 deletions

@@ -1,4 +1,5 @@
 [English](README.md) | 简体中文
+[English](README.md) | 简体中文
 <p align="center">
   <a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://github.com/user-attachments/assets/42b0039f-39e3-4279-afda-6d1865dfbffb" width="500"></a>
 </p>
@@ -72,12 +73,12 @@ FastDeploy 支持在**英伟达(NVIDIA)GPU**、**昆仑芯(Kunlunxin)XPU
 
 | Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
 |:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
-|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 ||||✅(WINT4)| WIP |128K |
-|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 ||||✅(WINT4)| WIP | 128K |
+|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 ||||| WIP |128K |
+|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 ||||| WIP | 128K |
 |ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP || WIP || WIP |128K |
 |ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 ||| WIP || WIP |128K |
-|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 |||| WIP ||128K |
-|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 |||| WIP ||128K |
+|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 |||| ||128K |
+|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 |||| ||128K |
 |ERNIE-4.5-0.3B | BF16/WINT8/FP8 |||||| 128K |
 
 ## 进阶用法

custom_ops/gpu_ops/append_attention.cu

Lines changed: 10 additions & 0 deletions

@@ -72,6 +72,7 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
     const paddle::optional<paddle::Tensor>& cache_v_zp,
     const paddle::optional<paddle::Tensor>& out_linear_shifts,
     const paddle::optional<paddle::Tensor>& out_linear_smooths,
+    const paddle::optional<paddle::Tensor>& mask_offset,
     const paddle::optional<paddle::Tensor>& kv_signal_data,
     const paddle::optional<paddle::Tensor>& q_norm_weight,
     const paddle::optional<paddle::Tensor>& k_norm_weight,
@@ -441,6 +442,7 @@ std::vector<paddle::Tensor> AppendAttention(
     const paddle::optional<paddle::Tensor>& cache_v_zp,
     const paddle::optional<paddle::Tensor>& out_linear_shifts,
     const paddle::optional<paddle::Tensor>& out_linear_smooths,
+    const paddle::optional<paddle::Tensor>& mask_offset,
     const paddle::optional<paddle::Tensor>& kv_signal_data,
     const paddle::optional<paddle::Tensor>& q_norm_weight,
     const paddle::optional<paddle::Tensor>& k_norm_weight,
@@ -479,6 +481,10 @@ std::vector<paddle::Tensor> AppendAttention(
   meta_data.block_size = key_cache.dims()[2];
   meta_data.batch_size = seq_lens_this_time.dims()[0];
 
+  if (mask_offset) {
+    meta_data.mask_offset = mask_offset.get().data<int>();
+  }
+
   auto dispatch_by_template = [&](auto temp_args) -> std::vector<paddle::Tensor> {
     return AppendAttentionKernel<type2value<decltype(temp_args)>::value>(
         meta_data,
@@ -514,6 +520,7 @@ std::vector<paddle::Tensor> AppendAttention(
         cache_v_zp,
         out_linear_shifts,
         out_linear_smooths,
+        mask_offset,
         kv_signal_data,
         q_norm_weight,
         k_norm_weight,
@@ -594,6 +601,7 @@ std::vector<std::vector<int64_t>> AppendAttentionInferShape(
     const paddle::optional<std::vector<int64_t>>& cache_v_zp_shape,
     const paddle::optional<std::vector<int64_t>>& out_linear_shifts_shape,
     const paddle::optional<std::vector<int64_t>>& out_linear_smooths_shape,
+    const paddle::optional<std::vector<int64_t>>& mask_offset_shape,
     const paddle::optional<std::vector<int64_t>>& kv_signal_data_shape,
     const paddle::optional<std::vector<int64_t>>& q_norm_weight_shape,
     const paddle::optional<std::vector<int64_t>>& k_norm_weight_shape,
@@ -657,6 +665,7 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
     const paddle::optional<paddle::DataType>& cache_v_zp_dtype,
     const paddle::optional<paddle::DataType>& out_linear_shifts_dtype,
     const paddle::optional<paddle::DataType>& out_linear_smooths_dtype,
+    const paddle::optional<paddle::DataType>& mask_offset_dtype,
     const paddle::optional<paddle::DataType>& kv_signal_data_dtype,
     const paddle::optional<paddle::DataType>& q_norm_weight_dtype,
     const paddle::optional<paddle::DataType>& k_norm_weight_dtype,
@@ -738,6 +747,7 @@ PD_BUILD_STATIC_OP(append_attention)
                 paddle::Optional("cache_v_zp"),
                 paddle::Optional("out_linear_shifts"),
                 paddle::Optional("out_linear_smooths"),
+                paddle::Optional("mask_offset"),
                 paddle::Optional("kv_signal_data"),
                 paddle::Optional("q_norm_weight"),
                 paddle::Optional("k_norm_weight")})
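
The new `mask_offset` input is threaded through the op the same way as the existing optional tensors (`out_linear_shifts`, `kv_signal_data`, ...): declared as `paddle::Optional`, surfaced in the shape/dtype inference signatures, and copied into the kernel metadata only when present. The sketch below illustrates that optional-input pattern in isolation; the `AppendAttnMetaData` struct and `FillMaskOffset` helper are assumptions for illustration, not code from this commit.

```cpp
#include "paddle/extension.h"

// Hypothetical stand-in for the metadata object populated in AppendAttention;
// the real struct in FastDeploy has a different name and more fields.
struct AppendAttnMetaData {
  int batch_size = 0;
  int block_size = 0;
  const int* mask_offset = nullptr;  // stays null when the input is omitted
};

// Only dereference the optional tensor when the caller actually supplied it,
// so downstream kernels can branch on a plain null check.
void FillMaskOffset(AppendAttnMetaData& meta,
                    const paddle::optional<paddle::Tensor>& mask_offset) {
  if (mask_offset) {
    meta.mask_offset = mask_offset.get().data<int>();
  }
}
```

Because the op registers the input as `paddle::Optional("mask_offset")`, existing callers that never pass it are unaffected: the optional resolves to none and the metadata pointer keeps its null default.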

custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh

Lines changed: 14 additions & 5 deletions

@@ -43,6 +43,7 @@ __global__ void multi_query_append_attention_kernel(
     const int *__restrict__ tile_ids_per_batch,
     const int *__restrict__ cu_seqlens_q,
     const int *__restrict__ block_table,  // [bsz, block_num_per_seq]
+    const int *__restrict__ mask_offset,
     const int max_seq_len,
     const int max_dec_len,
     const int max_block_num_per_seq,
@@ -141,6 +142,7 @@ __global__ void multi_query_append_attention_kernel(
   } else {
     o_base_ptr_int8 = out + o_offset;
   }
+  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id : nullptr;
   smem_t qo_smem(smem);
 
   uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -179,7 +181,7 @@ __global__ void multi_query_append_attention_kernel(
                        kv_len - q_len +
                            tile_id * num_rows_per_block / GROUP_SIZE,
                        chunk_start)))
-               : chunk_len) /
+               : mask_offset ? 0 : chunk_len) /
       (num_frags_z * 16);
   uint32_t k_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
       8 * (tid / 16) + tid % 8, (tid % 16) / 8);
@@ -250,7 +252,8 @@ __global__ void multi_query_append_attention_kernel(
           q_len,
           kv_len,
           chunk_end,
-          s_frag);
+          s_frag,
+          mask_offset_this_seq);
     }
 
     // update m,d
@@ -406,6 +409,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
     const int *__restrict__ tile_ids_per_batch,
     const int *__restrict__ cu_seqlens_q,
     const int *__restrict__ block_table,  // [bsz, block_num_per_seq]
+    const int *__restrict__ mask_offset,
     const int max_seq_len,
     const int max_dec_len,
     const int max_block_num_per_seq,
@@ -502,7 +506,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
                        tid % 8 * num_elems_per_128b<T>();
     }
   }
-
+  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id : nullptr;
   smem_t qo_smem(smem);
 
   uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -543,7 +547,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
                        kv_len - q_len +
                            tile_id * num_rows_per_block / GROUP_SIZE,
                        chunk_start)))
-               : chunk_len) /
+               : mask_offset ? 0 : chunk_len) /
       (NUM_WARP_KV * num_frags_z * 16);
 
   uint32_t k_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -616,7 +620,8 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
           q_len,
           kv_len,
           chunk_end,
-          s_frag);
+          s_frag,
+          mask_offset_this_seq);
     }
 
     // update m,d
@@ -882,6 +887,7 @@ void MultiQueryAppendAttention(
           tile_ids_per_batch.data<int>(),
           cu_seqlens_q.data<int>(),
           block_table.data<int>(),
+          meta_data.mask_offset,
           max_seq_len,
           max_dec_len,
           max_block_num_per_seq,
@@ -939,6 +945,7 @@ void MultiQueryAppendAttention(
          tile_ids_per_batch.data<int>(),
          cu_seqlens_q.data<int>(),
          block_table.data<int>(),
+          meta_data.mask_offset,
          max_seq_len,
          max_dec_len,
          max_block_num_per_seq,
@@ -1103,6 +1110,7 @@ void MultiQueryAppendAttention(
          tile_ids_per_batch.data<int>(),
          cu_seqlens_q.data<int>(),
          block_table.data<int>(),
+          meta_data.mask_offset,
          max_seq_len,
          max_dec_len,
          max_block_num_per_seq,
@@ -1171,6 +1179,7 @@ void MultiQueryAppendAttention(
          tile_ids_per_batch.data<int>(),
          cu_seqlens_q.data<int>(),
          block_table.data<int>(),
+          meta_data.mask_offset,
          max_seq_len,
          max_dec_len,
          max_block_num_per_seq,
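
Inside the kernels, both variants derive a per-sequence view of the tensor before the main attention loop. A minimal sketch of that pointer arithmetic follows; it assumes, based on the surrounding code, that `mask_offset` is indexed by flattened query position and that `q_start_seq_id` is the first query row handled by the current thread block.

```cpp
// Sketch only: the per-sequence slicing used by the kernels above.
// A null base pointer means no mask_offset tensor was provided, and the
// kernels fall back to their default masking behaviour.
__device__ __forceinline__ const int* SliceMaskOffset(const int* mask_offset,
                                                      int q_start_seq_id) {
  return mask_offset ? mask_offset + q_start_seq_id : nullptr;
}
```

The companion change to the loop bound (`: mask_offset ? 0 : chunk_len`) appears to drop the unmasked fast path whenever offsets are supplied, so every KV tile goes through the masking branch that now also receives `mask_offset_this_seq`.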

custom_ops/gpu_ops/append_attn/append_attention_c4_impl.cuh

Lines changed: 14 additions & 5 deletions

@@ -48,6 +48,7 @@ __global__ void multi_query_append_attention_c4_kernel(
     const int *__restrict__ tile_ids_per_batch,
     const int *__restrict__ cu_seqlens_q,
     const int *__restrict__ block_table,  // [bsz, block_num_per_seq]
+    const int *__restrict__ mask_offset,
     const int max_seq_len,
     const int max_dec_len,
     const int max_block_num_per_seq,
@@ -172,6 +173,7 @@ __global__ void multi_query_append_attention_c4_kernel(
   } else {
     o_base_ptr_int8 = out + o_offset;
   }
+  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id : nullptr;
   smem_t qo_smem(smem);
 
   uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -248,7 +250,7 @@ __global__ void multi_query_append_attention_c4_kernel(
                        kv_len - q_len +
                            tile_id * num_rows_per_block / GROUP_SIZE,
                        chunk_start)))
-               : chunk_len) /
+               : mask_offset ? 0 : chunk_len) /
       (num_frags_z * 16);
 
   uint32_t k_smem_offset_r =
@@ -338,7 +340,8 @@ __global__ void multi_query_append_attention_c4_kernel(
           q_len,
           kv_len,
           chunk_end,
-          s_frag);
+          s_frag,
+          mask_offset_this_seq);
     }
 
     update_mdo_states<num_frags_x, num_frags_y, num_frags_z>(
@@ -505,6 +508,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
     const int *__restrict__ tile_ids_per_batch,
     const int *__restrict__ cu_seqlens_q,
     const int *__restrict__ block_table,  // [bsz, block_num_per_seq]
+    const int *__restrict__ mask_offset,
     const int max_seq_len,
     const int max_dec_len,
     const int max_block_num_per_seq,
@@ -627,7 +631,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
                        tid % 8 * num_elems_per_128b<T>();
     }
   }
-
+  const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id : nullptr;
   smem_t qo_smem(smem);
 
   uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -706,7 +710,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
                        kv_len - q_len +
                            tile_id * num_rows_per_block / GROUP_SIZE,
                        chunk_start)))
-               : chunk_len) /
+               : mask_offset ? 0 : chunk_len) /
       (NUM_WARP_KV * num_frags_z * 16);
 
   uint32_t k_smem_offset_r =
@@ -793,7 +797,8 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
           q_len,
           kv_len,
           chunk_end,
-          s_frag);
+          s_frag,
+          mask_offset_this_seq);
     }
 
     update_mdo_states<num_frags_x, num_frags_y, num_frags_z>(
@@ -1088,6 +1093,7 @@ void MultiQueryAppendC4Attention(
          tile_ids_per_batch.data<int>(),
          cu_seqlens_q.data<int>(),
          block_table.data<int>(),
+          meta_data.mask_offset,
          max_seq_len,
          max_dec_len,
          max_block_num_per_seq,
@@ -1151,6 +1157,7 @@ void MultiQueryAppendC4Attention(
          tile_ids_per_batch.data<int>(),
          cu_seqlens_q.data<int>(),
          block_table.data<int>(),
+          meta_data.mask_offset,
          max_seq_len,
          max_dec_len,
          max_block_num_per_seq,
@@ -1335,6 +1342,7 @@ void MultiQueryAppendC4Attention(
          tile_ids_per_batch.data<int>(),
          cu_seqlens_q.data<int>(),
          block_table.data<int>(),
+          meta_data.mask_offset,
          max_seq_len,
          max_dec_len,
          max_block_num_per_seq,
@@ -1411,6 +1419,7 @@ void MultiQueryAppendC4Attention(
          tile_ids_per_batch.data<int>(),
          cu_seqlens_q.data<int>(),
          block_table.data<int>(),
+          meta_data.mask_offset,
          max_seq_len,
          max_dec_len,
          max_block_num_per_seq,
