PaddlePaddle
diff --git a/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_cutlass_kernel.h b/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_cutlass_kernel.h
Lines changed: 4 additions & 4 deletions
diff --git a/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels.h b/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels.h
Lines changed: 17 additions & 18 deletions
diff --git a/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_bf16_int2.cu b/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_bf16_int2.cu
Lines changed: 0 additions & 1 deletion
diff --git a/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_fp16_int2.cu b/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_fp16_int2.cu
Lines changed: 1 addition & 1 deletion
diff --git a/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_fp8_int2_bf16.cu b/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_fp8_int2_bf16.cu
Lines changed: 3 additions & 3 deletions
diff --git a/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_fp8_int2_fp16.cu b/custom_ops/gpu_ops/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_fp8_int2_fp16.cu
Lines changed: 3 additions & 3 deletions
@@ -784,13 +784,13 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
       cutlass::MatrixCoord tb_offset_scale{0, threadblock_offset.n()};
       cutlass::MatrixCoord tb_offset_local_scale{0, threadblock_offset.n() * 2};
 
-    // static_assert(platform::is_same<ElementScale, cutlass::float_e4m3_t>::value,
-    //     "ElementScale must be float_e4m3_t");
+      // static_assert(platform::is_same<ElementScale, cutlass::float_e4m3_t>::value,
+      //     "ElementScale must be float_e4m3_t");
 
       using ElementSuperScale = typename Mma::QuantParamsAccessor::ElementSuperScale;
 
-      static_assert(platform::is_same<ElementSuperScale, cutlass::bfloat16_t>::value,
-          "ElementSuperScale must be bfloat16_t");
+      // static_assert(platform::is_same<ElementSuperScale, cutlass::bfloat16_t>::value,
+      //     "ElementSuperScale must be bfloat16_t");
 
       // TODO（"baoqiwen"）, reinterpret_cast
       ElementScale* weight_scale_ptr = params.weight_scales + problem_idx * gemm_n;
 
@@ -24,8 +24,7 @@
 
 namespace phi {
 
-template <typename InType,
-          typename OutType,
+template <typename T, /*The type used for activations/scales/compute*/
           typename WeightQuantTraits /* The quant traits for the MoE weights */>
 class MoeGemmRunner {
  public:
@@ -34,11 +33,11 @@ class MoeGemmRunner {
 
   MoeGemmRunner();
 
-  void moe_gemm_bias_act(const InType* A,
+  void moe_gemm_bias_act(const T* A,
                          const WeightType* B,
-                         const OutType* weight_scales,
-                         const OutType* biases,
-                         OutType* C,
+                         const T* weight_scales,
+                         const T* biases,
+                         T* C,
                          int64_t* total_rows_before_expert,
                          int64_t total_rows,
                          int64_t tune_total_rows,
@@ -49,10 +48,10 @@ class MoeGemmRunner {
                          std::string activation_type,
                          cudaStream_t stream);
 
-  void moe_gemm(const InType* A,
+  void moe_gemm(const T* A,
                 const WeightType* B,
-                const OutType* weight_scales,
-                OutType* C,
+                const T* weight_scales,
+                T* C,
                 int64_t* total_rows_before_expert,
                 int64_t total_rows,
                 int64_t tune_total_rows,
@@ -64,11 +63,11 @@ class MoeGemmRunner {
 
  private:
   template <typename EpilogueTag>
-  void dispatch_to_arch(const InType* A,
+  void dispatch_to_arch(const T* A,
                         const WeightType* B,
-                        const OutType* weight_scales,
-                        const OutType* biases,
-                        OutType* C,
+                        const T* weight_scales,
+                        const T* biases,
+                        T* C,
                         int64_t* total_rows_before_expert,
                         int64_t total_rows,
                         int64_t gemm_n,
@@ -80,11 +79,11 @@ class MoeGemmRunner {
                         int* occupancy = nullptr);
 
   template <typename EpilogueTag>
-  void run_gemm(const InType* A,
+  void run_gemm(const T* A,
                 const WeightType* B,
-                const OutType* weight_scales,
-                const OutType* biases,
-                OutType* C,
+                const T* weight_scales,
+                const T* biases,
+                T* C,
                 int64_t* total_rows_before_expert,
                 int64_t total_rows,
                 int64_t tune_total_rows,
@@ -99,4 +98,4 @@ class MoeGemmRunner {
   int multi_processor_count_;
 };
 
-}  // namespace phi
+}  // namespace phi
@@ -23,7 +23,6 @@ namespace phi {
 
 #ifdef PADDLE_CUDA_BF16
 template class MoeGemmRunner<
-    __nv_bfloat16,
     __nv_bfloat16,
     cutlass::WintQuantTraits<__nv_bfloat16, cutlass::WintQuantMethod::kWeightOnlyInt2>>;
 #endif
 
@@ -22,6 +22,6 @@
 namespace phi {
 
 template class MoeGemmRunner<
-    half, half, cutlass::WintQuantTraits<half, cutlass::WintQuantMethod::kWeightOnlyInt2>>;
+    half, cutlass::WintQuantTraits<half, cutlass::WintQuantMethod::kWeightOnlyInt2>>;
 
 } // namespace phi
@@ -15,14 +15,14 @@
  */
 
 #pragma once
-#include "cutlass_kernels/moe_gemm/fused_moe_gemm_kernels.h"
-#include "cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_template.h"
+#include "cutlass_kernels/moe_gemm/fused_moe_gemm_mixed_io_kernels.h"
+#include "cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_mixed_io_template.h"
 #include "helper.h"
 
 namespace phi {
 
 #ifdef PADDLE_CUDA_BF16
-template class MoeGemmRunner<
+template class MixedMoeGemmRunner<
     cutlass::float_e4m3_t,
     __nv_bfloat16,
     cutlass::WintQuantTraits<cutlass::float_e4m3_t, cutlass::WintQuantMethod::kWeightOnlyInt2>>;
 
@@ -15,13 +15,13 @@
  */
 
 #pragma once
-#include "cutlass_kernels/moe_gemm/fused_moe_gemm_kernels.h"
-#include "cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_template.h"
+#include "cutlass_kernels/moe_gemm/fused_moe_gemm_mixed_io_kernels.h"
+#include "cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_mixed_io_template.h"
 #include "helper.h"
 
 namespace phi {
 
-template class MoeGemmRunner<
+template class MixedMoeGemmRunner<
     cutlass::float_e4m3_t,
     half,
     cutlass::WintQuantTraits<cutlass::float_e4m3_t, cutlass::WintQuantMethod::kWeightOnlyInt2>>;