Commit 278d3bd

Merge branch 'develop' into mm_structred_output
2 parents: 2557839 + ce1f353

107 files changed (+5504 / -3372 lines)


.github/workflows/approve.yml

Lines changed: 3 additions & 0 deletions
@@ -6,6 +6,9 @@ on:
     - develop
     - 'release/*'
 
+env:
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
 jobs:
   Approval:
     name: Approval

.github/workflows/ci_gcu.yml

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,8 @@ concurrency:
 
 jobs:
   CI_GCU:
-    runs-on: [self-hosted, GCU-S60-8Card]
+    runs-on:
+      group: GCU
     steps:
      - name: Print current runner name
        run: |

.github/workflows/ci_iluvatar.yml

Lines changed: 2 additions & 1 deletion
@@ -11,7 +11,8 @@ concurrency:
 
 jobs:
   CI_ILUVATAR:
-    runs-on: [self-hosted, IXUCA]
+    runs-on:
+      group: IXUCA
    steps:
      - name: Print current runner name
        run: |

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -167,3 +167,6 @@ build
 .ccls-cache
 
 third_party
+
+custom_ops/gpu_ops/w4afp8_gemm/w4afp8_gemm_*.cu
+custom_ops/gpu_ops/w4afp8_gemm/w4afp8_gemm_template.h

custom_ops/gpu_ops/sample_kernels/rejection_top_p_sampling.cu

Lines changed: 4 additions & 0 deletions
@@ -29,7 +29,11 @@ std::vector<paddle::Tensor> TopPSamplingReject(const paddle::Tensor &probs,
 
   // need_batch_random
   if (seed == -1) {
+#ifdef PADDLE_WITH_COREX
+    auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(probs.place()));
+#else
     phi::GPUContext* dev_ctx = static_cast<phi::GPUContext*>(phi::DeviceContextPool::Instance().Get(probs.place()));
+#endif
     auto gen_cuda = dev_ctx->GetGenerator();
     auto seed_offset = gen_cuda->IncrementOffset(32 * batch_size);
     philox_seed = seed_offset.first;

custom_ops/gpu_ops/sample_kernels/sampling.cuh

Lines changed: 69 additions & 4 deletions
@@ -212,9 +212,15 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
       prob_greater_than_threshold[j] = pred(prob_vec[j]) ? prob_vec[j] : 0;
       valid[j] = pred(prob_vec[j]) && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d;
     }
+#ifdef PADDLE_WITH_COREX
+    float aggregate_local =
+        BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage->block_prim.reduce)
+            .Sum(prob_greater_than_threshold);
+#else
     float aggregate_local =
         BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage->block_prim.reduce)
             .Sum<VEC_SIZE>(prob_greater_than_threshold);
+#endif
     if (tx == 0) {
       temp_storage->block_aggregate.value = aggregate_local;
     }
@@ -226,8 +232,13 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
       DeterministicInclusiveSum<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>(
           prob_greater_than_threshold, inclusive_cdf, temp_storage);
     } else {
+#ifdef PADDLE_WITH_COREX
+      BlockScan<float, BLOCK_THREADS, SCAN_ALGORITHM>(temp_storage->block_prim.scan)
+          .InclusiveSum(prob_greater_than_threshold, inclusive_cdf);
+#else
       BlockScan<float, BLOCK_THREADS, SCAN_ALGORITHM>(temp_storage->block_prim.scan)
          .InclusiveSum<VEC_SIZE>(prob_greater_than_threshold, inclusive_cdf);
+#endif
 
       __syncthreads();
     }
@@ -239,11 +250,21 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
 
     bool greater_than_u_diff[VEC_SIZE];
 #ifdef SAMPLING_CUB_SUBTRACTLEFT_DEFINED
-    BlockAdjacentDifference<bool, BLOCK_THREADS>(temp_storage->block_prim.adj_diff)
-        .SubtractLeft<VEC_SIZE>(greater_than_u, greater_than_u_diff, BoolDiffOp());
+#ifdef PADDLE_WITH_COREX
+    BlockAdjacentDifference<bool, BLOCK_THREADS>(temp_storage->block_prim.adj_diff)
+        .SubtractLeft(greater_than_u, greater_than_u_diff, BoolDiffOp());
+#else
+    BlockAdjacentDifference<bool, BLOCK_THREADS>(temp_storage->block_prim.adj_diff)
+        .SubtractLeft<VEC_SIZE>(greater_than_u, greater_than_u_diff, BoolDiffOp());
+#endif
 #else
-    BlockAdjacentDifference<bool, BLOCK_THREADS>(temp_storage->block_prim.adj_diff)
-        .FlagHeads<VEC_SIZE>(greater_than_u_diff, greater_than_u, BoolDiffOp(), 0);
+#ifdef PADDLE_WITH_COREX
+    BlockAdjacentDifference<bool, BLOCK_THREADS>(temp_storage->block_prim.adj_diff)
+        .FlagHeads(greater_than_u_diff, greater_than_u, BoolDiffOp(), 0);
+#else
+    BlockAdjacentDifference<bool, BLOCK_THREADS>(temp_storage->block_prim.adj_diff)
+        .FlagHeads<VEC_SIZE>(greater_than_u_diff, greater_than_u, BoolDiffOp(), 0);
+#endif
 #endif
     __syncthreads();
 
@@ -355,18 +376,30 @@ __global__ void TopKTopPSamplingFromProbKernel(DType* probs, IdType* output,
           (probs_vec[j] > pivot_1 && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
     }
 
+#ifdef PADDLE_WITH_COREX
+    aggregate_gt_pivot_0 +=
+        BlockReduce<ValueCount<float>, BLOCK_THREADS>(temp_storage.block_prim.reduce_value_count)
+            .Sum(probs_gt_pivot_0);
+#else
     aggregate_gt_pivot_0 +=
         BlockReduce<ValueCount<float>, BLOCK_THREADS>(temp_storage.block_prim.reduce_value_count)
            .Sum<VEC_SIZE>(probs_gt_pivot_0);
+#endif
     if (tx == 0) {
       temp_storage.block_aggregate.pair = aggregate_gt_pivot_0;
     }
     __syncthreads();
     aggregate_gt_pivot_0 = temp_storage.block_aggregate.pair;
 
+#ifdef PADDLE_WITH_COREX
+    aggregate_gt_pivot_1 +=
+        BlockReduce<ValueCount<float>, BLOCK_THREADS>(temp_storage.block_prim.reduce_value_count)
+            .Sum(probs_gt_pivot_1);
+#else
     aggregate_gt_pivot_1 +=
         BlockReduce<ValueCount<float>, BLOCK_THREADS>(temp_storage.block_prim.reduce_value_count)
            .Sum<VEC_SIZE>(probs_gt_pivot_1);
+#endif
     if (tx == 0) {
       temp_storage.block_aggregate.pair = aggregate_gt_pivot_1;
     }
@@ -466,16 +499,26 @@ __global__ void TopPSamplingFromProbKernel(DType* probs, IdType* output,
       probs_gt_pivot_1[j] = (probs_vec[j] > pivot_1) ? probs_vec[j] : 0;
     }
 
+#ifdef PADDLE_WITH_COREX
+    aggregate_gt_pivot_0 += BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
+                                .Sum(probs_gt_pivot_0);
+#else
     aggregate_gt_pivot_0 += BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
                                .Sum<VEC_SIZE>(probs_gt_pivot_0);
+#endif
     if (tx == 0) {
       temp_storage.block_aggregate.value = aggregate_gt_pivot_0;
     }
     __syncthreads();
     aggregate_gt_pivot_0 = temp_storage.block_aggregate.value;
 
+#ifdef PADDLE_WITH_COREX
+    aggregate_gt_pivot_1 += BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
+                                .Sum(probs_gt_pivot_1);
+#else
     aggregate_gt_pivot_1 += BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
                                .Sum<VEC_SIZE>(probs_gt_pivot_1);
+#endif
     if (tx == 0) {
       temp_storage.block_aggregate.value = aggregate_gt_pivot_1;
     }
@@ -521,9 +564,15 @@ __device__ __forceinline__ float GetMaxValue(float* in_data, uint32_t row_idx, u
     for (uint32_t j = 0; j < VEC_SIZE; ++j) {
       in_data_[j] = in_data_vec[j];
     }
+#ifdef PADDLE_WITH_COREX
+    max_val = max(
+        max_val, BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
+                     .Reduce(in_data_, cub::Max()));
+#else
     max_val = max(
         max_val, BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
                     .Reduce<VEC_SIZE>(in_data_, cub::Max()));
+#endif
     __syncthreads();
   }
   if (tx == 0) {
@@ -605,7 +654,11 @@ __global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob, IdType*
   const uint32_t bx = blockIdx.x, tx = threadIdx.x;
   const uint32_t row_idx = bx;
   const uint32_t k = top_k_arr[row_idx] == 0 ? d : top_k_arr[row_idx];
+#ifdef PADDLE_WITH_COREX
+  double pivot = std::numeric_limits<float>::infinity(), normalizer = 1;
+#else
   double pivot = -cuda::std::numeric_limits<float>::infinity(), normalizer = 1;
+#endif
   vec_t<float, VEC_SIZE> probs_vec;
   if (k < d) {
     extern __shared__ __align__(alignof(RenormTempStorage<BLOCK_THREADS, REDUCE_ALGO>))
@@ -659,14 +712,26 @@ __global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob, IdType*
       }
     }
 
+#ifdef PADDLE_WITH_COREX
+      aggregate_gt_pivot_0 += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                                  temp_storage.block_prim.reduce_value_count)
+                                  .Sum(probs_gt_pivot_0_pair);
+#else
       aggregate_gt_pivot_0 += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
                                   temp_storage.block_prim.reduce_value_count)
                                  .Sum<VEC_SIZE>(probs_gt_pivot_0_pair);
+#endif
       __syncthreads();
 
+#ifdef PADDLE_WITH_COREX
+      aggregate_gt_pivot_1 += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                                  temp_storage.block_prim.reduce_value_count)
+                                  .Sum(probs_gt_pivot_1_pair);
+#else
       aggregate_gt_pivot_1 += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
                                   temp_storage.block_prim.reduce_value_count)
                                  .Sum<VEC_SIZE>(probs_gt_pivot_1_pair);
+#endif
       __syncthreads();
     }
     min_gt_low =
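
Every hunk in sampling.cuh applies the same pattern: under PADDLE_WITH_COREX (the Iluvatar COREX build) the CUB-style block primitives are called without the explicit <VEC_SIZE> template argument, so ITEMS_PER_THREAD is deduced from the array parameter, while the stock CUDA build keeps the explicit form. A minimal sketch of that pattern, assuming stock CUB on the CUDA side; the helper name BlockSum is illustrative and not part of the commit:

// Minimal sketch (not from the commit) of the per-backend block reduce.
#include <cub/block/block_reduce.cuh>

template <int BLOCK_THREADS, int VEC_SIZE>
__device__ float BlockSum(
    float (&vals)[VEC_SIZE],
    typename cub::BlockReduce<float, BLOCK_THREADS>::TempStorage& tmp) {
#ifdef PADDLE_WITH_COREX
  // COREX branch: ITEMS_PER_THREAD is deduced from the array reference.
  return cub::BlockReduce<float, BLOCK_THREADS>(tmp).Sum(vals);
#else
  // CUDA branch: pass the per-thread item count explicitly, as the diff does.
  return cub::BlockReduce<float, BLOCK_THREADS>(tmp).template Sum<VEC_SIZE>(vals);
#endif
}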

custom_ops/gpu_ops/sample_kernels/utils.cuh

Lines changed: 4 additions & 0 deletions
@@ -258,9 +258,13 @@ inline std::pair<int, int> GetCudaComputeCapability() {
 
 /******************* math *******************/
 __forceinline__ __device__ float ptx_rcp(float x) {
+#ifdef PADDLE_WITH_COREX
+  return __ivcorex_rcpf(x);
+#else
   float y;
   asm volatile("rcp.approx.ftz.f32 %0, %1;" : "=f"(y) : "f"(x));
   return y;
+#endif
 }
 
 template <typename T1, typename T2>
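
For context, rcp.approx.ftz.f32 is the NVIDIA PTX instruction for a fast approximate reciprocal with flush-to-zero behavior, so the inline asm cannot build on the COREX toolchain; the commit substitutes the __ivcorex_rcpf intrinsic there. A small usage sketch, assuming utils.cuh is included; the kernel below is hypothetical, not part of the commit:

// Illustrative kernel using ptx_rcp; name and launch shape are assumptions.
__global__ void ScaleByInverse(float* data, float denom, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  const float inv = ptx_rcp(denom);  // approximate 1/denom on either backend
  if (i < n) data[i] *= inv;
}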
Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cute/algorithm/copy.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/layout.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+
+using namespace cute;
+
+template <int kStages, class GemmType, class OutputType, class SmemLayoutA,
+          class SmemLayoutB, class SmemLayoutC>
+struct SharedStorage {
+  union {
+    struct {
+      cute::array_aligned<GemmType, cute::cosize_v<SmemLayoutA>> smem_a;
+      cute::array_aligned<GemmType, cute::cosize_v<SmemLayoutB>> smem_b;
+    };
+    cute::array_aligned<OutputType, cute::cosize_v<SmemLayoutC>> smem_c;
+  };
+
+  struct {
+    typename cutlass::PipelineTmaAsync<kStages>::SharedStorage pipeline;
+  };
+};
+
+template<int kBlockM_, int kBlockN_, int kBlockK_,
+         int kNWarps_, int kStages_,
+         int kTiles_, int M_,
+         int TokenPackSize_,
+         int TAIL_N_ = 0,
+         int kClusterM_ = 1,
+         typename elem_type=cutlass::float_e4m3_t,
+         typename OutputType = cutlass::bfloat16_t>
+struct Kernel_traits {
+  using Element = elem_type;
+  using ElementAccum = float;
+  using ElementOutput = OutputType;
+  static_assert(cutlass::sizeof_bits_v<Element> == 8);
+
+  static constexpr int kNWarps = kNWarps_;
+  static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp;
+  static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarpGroup;
+  static constexpr int NumMmaThreads = kNThreads - NumProducerThreads;
+
+  static_assert(kNWarps_ == 12 || kNWarps_ == 16);
+
+  static constexpr int kBlockM = kBlockM_;
+  static constexpr int kBlockN = kBlockN_;
+  static constexpr int kBlockK = kBlockK_;
+  static constexpr int kTiles = kTiles_;
+  static constexpr int TokenPackSize = TokenPackSize_;
+  static constexpr int M = M_;
+  static constexpr int TAIL_N = TAIL_N_;
+
+  using TileShape_MNK = Shape<Int<kBlockM>, Int<kBlockN>, Int<kBlockK>>;
+  using TileShape_MNK_TAIL = Shape<Int<kBlockM>, Int<TAIL_N>, Int<kBlockK>>;
+
+  static constexpr int kClusterM = kClusterM_;
+  using ClusterShape_MNK = Shape<Int<kClusterM>, _1, _1>;
+
+  static constexpr int kStages = kStages_;
+  static_assert(kStages > 1);
+
+  using AtomLayoutMNK = Layout<Shape<Int<kBlockM / 64>, _1, _1>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(
+      cute::GMMA::rs_op_selector<Element, Element, ElementAccum, TileShape_MNK>(),
+      AtomLayoutMNK{}));
+
+  using TiledMma_TAIL = decltype(cute::make_tiled_mma(
+      cute::GMMA::rs_op_selector<Element, Element, ElementAccum, TileShape_MNK_TAIL>(),
+      AtomLayoutMNK{}));
+
+  using SmemLayoutAtomA = decltype(
+      cutlass::gemm::collective::detail::rs_smem_selector<
+          GMMA::Major::K, Element, Int<kBlockM>, Int<kBlockK / 2>>());
+
+  using SmemLayoutA = decltype(
+      tile_to_shape(SmemLayoutAtomA{},
+                    make_shape(Int<kBlockM>{}, Int<kBlockK / 2>{}, Int<kStages>{})));
+
+  using SmemLayoutAtomB = decltype(
+      cutlass::gemm::collective::detail::rs_smem_selector<
+          GMMA::Major::K, Element, decltype(cute::get<1>(TileShape_MNK{})),
+          decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  using SmemLayoutB = decltype(
+      tile_to_shape(SmemLayoutAtomB{},
+                    make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<kStages>{})));
+
+  using SmemLayoutAtomB_TAIL = decltype(
+      cutlass::gemm::collective::detail::rs_smem_selector<
+          GMMA::Major::K, Element, decltype(cute::get<1>(TileShape_MNK_TAIL{})),
+          decltype(cute::get<2>(TileShape_MNK_TAIL{}))>());
+
+  using SmemLayoutB_TAIL = decltype(
+      tile_to_shape(SmemLayoutAtomB_TAIL{},
+                    make_shape(
+                        shape<1>(TileShape_MNK_TAIL{}),
+                        shape<2>(TileShape_MNK_TAIL{}),
+                        Int<kStages>{})
+                    ));
+
+  using SmemLayoutAtomC = decltype(
+      cutlass::gemm::collective::detail::rs_smem_selector<
+          GMMA::Major::K, ElementOutput,
+          decltype(cute::get<0>(TileShape_MNK{})),
+          decltype(cute::get<1>(TileShape_MNK{}))>());
+
+  using SmemLayoutC = decltype(tile_to_shape(SmemLayoutAtomC{}, select<0, 1>(TileShape_MNK{})));
+
+  using SmemCopyAtomAB = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
+  using SmemCopyAtomC = Copy_Atom<cute::SM90_U32x4_STSM_N, ElementOutput>;
+
+  using SharedStorage = SharedStorage<
+      kStages, Element, ElementOutput, SmemLayoutA, SmemLayoutB, SmemLayoutC>;
+
+  using MainloopPipeline = typename cutlass::PipelineTmaAsync<kStages>;
+  using PipelineState = typename cutlass::PipelineState<kStages>;
+
+
+  static constexpr int kNumVecElem = ceil_div(128, sizeof_bits_v<OutputType>);
+  static constexpr int kNumThreadsPerRow = kBlockN / kNumVecElem;
+  // static_assert(NumMmaThreads % kNumThreadsPerRow == 0);
+  static constexpr int kNumRows = NumMmaThreads / kNumThreadsPerRow;
+  using TiledCopyCAtom = cute::Copy_Atom<cute::UniversalCopy<cutlass::uint128_t>, OutputType>;
+  using TiledCopyCThrLayout = decltype(cute::make_layout(
+      cute::make_shape(Int<kNumRows>{}, Int<kNumThreadsPerRow>{}),
+      LayoutRight{}));
+  using TiledCopyCValLayout = decltype(cute::make_layout(
+      cute::make_shape(_1{}, Int<kNumVecElem>{}),
+      LayoutRight{}));
+  using TiledCopyC = decltype(make_tiled_copy(
+      TiledCopyCAtom{},
+      TiledCopyCThrLayout{}, // Thr layout
+      TiledCopyCValLayout{}  // Val layout
+      ));
+};
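
This new header bundles the compile-time configuration for a warp-specialized FP8 GEMM on SM90: block tile shape, pipeline stage count, the TMA mainloop pipeline, and shared-memory layouts chosen via CUTLASS's rs_smem_selector. A hypothetical instantiation, only to show the derived constants; the parameter values below are assumptions for illustration and do not appear in the commit:

// Hypothetical instantiation (assumes the header above is included).
using Traits = Kernel_traits</*kBlockM_=*/64, /*kBlockN_=*/128, /*kBlockK_=*/128,
                             /*kNWarps_=*/12, /*kStages_=*/2,
                             /*kTiles_=*/1, /*M_=*/4096,
                             /*TokenPackSize_=*/0, /*TAIL_N_=*/64>;

static_assert(Traits::kNThreads == 384);      // 12 warps x 32 threads
static_assert(Traits::NumMmaThreads == 256);  // one 128-thread warp group produces, two consume
static_assert(Traits::kNumVecElem == 8);      // 128-bit vectors of bf16 output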
