Replace rotate-and-reduce with new lowering

j2kun · j2kun · commit f257f95ca4f5 · 2025-08-28T16:43:49.000-07:00
Adds support for mul reductions in rotate-and-reduce kernel impl
diff --git a/lib/Analysis/RotationAnalysis/RotationAnalysis.cpp b/lib/Analysis/RotationAnalysis/RotationAnalysis.cpp
@@ -17,6 +17,8 @@
 #include "mlir/include/mlir/IR/Visitors.h"                 // from @llvm-project
 #include "mlir/include/mlir/Support/LLVM.h"                // from @llvm-project
 
+#define DEBUG_TYPE "rotation-analysis"
+
 namespace mlir {
 namespace heir {
 namespace rotation_analysis {
diff --git a/lib/Analysis/RotationAnalysis/RotationAnalysis.h b/lib/Analysis/RotationAnalysis/RotationAnalysis.h
@@ -10,7 +10,6 @@
 #include <vector>
 
 #include "llvm/include/llvm/Support/Casting.h"             // from @llvm-project
-#include "llvm/include/llvm/Support/Debug.h"               // from @llvm-project
 #include "mlir/include/mlir/Analysis/DataFlowFramework.h"  // from @llvm-project
 #include "mlir/include/mlir/IR/BuiltinTypes.h"             // from @llvm-project
 #include "mlir/include/mlir/IR/Diagnostics.h"              // from @llvm-project
@@ -19,8 +18,6 @@
 #include "mlir/include/mlir/IR/Value.h"                    // from @llvm-project
 #include "mlir/include/mlir/Support/LLVM.h"                // from @llvm-project
 
-#define DEBUG_TYPE "rotation-analysis"
-
 namespace mlir {
 namespace heir {
 namespace rotation_analysis {
@@ -92,8 +89,6 @@ class PartialReduction {
     // first element.
     reduction.addRotation(0);
 
-    LLVM_DEBUG(llvm::dbgs()
-               << "Initializing at " << tensor << " with rotations [0]\n");
     return reduction;
   }
 
@@ -107,11 +102,6 @@ class PartialReduction {
            "Internal state of RotationAnalysis is broken; tensor having saved "
            "value should be impossible");
 
-    LLVM_DEBUG({
-      llvm::dbgs() << "Rotating\n\t";
-      lhs.print(llvm::dbgs());
-      llvm::dbgs() << " by " << shift;
-    });
     PartialReduction shifted;
     shifted.tensor = lhs.tensor;
     shifted.opName = lhs.opName;
@@ -124,11 +114,6 @@ class PartialReduction {
     for (auto index : lhs.accessedIndices) {
       shifted.addRotation((index + shift) % size);
     }
-    LLVM_DEBUG({
-      llvm::dbgs() << " to\n\t";
-      shifted.print(llvm::dbgs());
-      llvm::dbgs() << "\n";
-    });
     return shifted;
   }
 
@@ -202,15 +187,6 @@ class PartialReduction {
     for (auto value : rhs.savedValues) {
       merged.savedValues.push_back(value);
     }
-    LLVM_DEBUG({
-      llvm::dbgs() << "Joining\n\t";
-      lhs.print(llvm::dbgs());
-      llvm::dbgs() << " and\n\t";
-      rhs.print(llvm::dbgs());
-      llvm::dbgs() << " to get\n\t";
-      merged.print(llvm::dbgs());
-      llvm::dbgs() << "\n";
-    });
     return merged;
   }
 
@@ -254,15 +230,6 @@ class PartialReduction {
     }
     merged.savedValues = lhs.savedValues;
     merged.savedValues.push_back(rhs);
-    LLVM_DEBUG({
-      llvm::dbgs() << "Saving\n\t";
-      rhs.print(llvm::dbgs());
-      llvm::dbgs() << " inside\n\t";
-      lhs.print(llvm::dbgs());
-      llvm::dbgs() << " to get\n\t";
-      merged.print(llvm::dbgs());
-      llvm::dbgs() << "\n";
-    });
     return merged;
   }
 
diff --git a/lib/Dialect/TensorExt/IR/TensorExtOps.td b/lib/Dialect/TensorExt/IR/TensorExtOps.td
@@ -139,7 +139,7 @@ def TensorExt_RotateAndReduceOp : TensorExt_Op<"rotate_and_reduce",[Pure, AllTyp
     This op reduces products of a plaintext with a periodically rotated
     tensor.
 
-    In generality, the reduction performs is
+    In almost full generality, the reduction performed is
 
     \[
       \sum_{i \in [0, n]} p(P, T*i) \cdot rotate(v, T*i)
@@ -167,6 +167,11 @@ def TensorExt_RotateAndReduceOp : TensorExt_Op<"rotate_and_reduce",[Pure, AllTyp
     `n = |v|` so that the reduction is simply a sum of all rotation of the
     ciphertext.
 
+    If `reduceOp` is set to an MLIR operation name (e.g., `arith.mulf`), then
+    the reduction operation is modified to use that operation instead of a sum.
+    The chosen op must be one of `arith.muli`, `arith.mulf`, `arith.addi`,
+    or `arith.addf`.
+
     Efficient lowerings of this operation can use the Baby-Step / Giant-Step
     approach from [Faster Homomorphic Linear Transformations in
     HElib](https://eprint.iacr.org/2018/244.pdf) to reduce the number of
@@ -177,10 +182,34 @@ def TensorExt_RotateAndReduceOp : TensorExt_Op<"rotate_and_reduce",[Pure, AllTyp
     ins AnyRankedTensor:$tensor,
     Optional<AnyRankedTensor>:$plaintexts,
     IndexAttr:$period,
-    IndexAttr:$steps
+    IndexAttr:$steps,
+    OptionalAttr<Builtin_StringAttr>:$reduceOp
   );
   let results = (outs AnyRankedTensor:$output);
   let hasVerifier = 1;
+
+  let builders = [
+    // Convenience builder for the case where no plaintexts operand is given
+    OpBuilder<(ins
+        "Value":$tensor, "int64_t":$period, "int64_t":$steps,
+        "::llvm::StringRef":$reduceOp), [{
+      return build(
+        $_builder,
+        $_state,
+        tensor.getType(),
+        ValueRange{tensor},
+        {
+          $_builder.getNamedAttr(
+            "period", $_builder.getIndexAttr(period)),
+          $_builder.getNamedAttr(
+            "steps", $_builder.getIndexAttr(steps)),
+          $_builder.getNamedAttr(
+            "reduceOp", $_builder.getStringAttr(reduceOp))
+        }
+      );
+    }]>
+  ];
+
   // TODO(#2134): Add canonicalization patterns
 }
 
diff --git a/lib/Dialect/TensorExt/Transforms/BUILD b/lib/Dialect/TensorExt/Transforms/BUILD
@@ -73,6 +73,7 @@ cc_library(
         "RotateAndReduce.h",
     ],
     deps = [
+        ":ImplementRotateAndReduce",
         ":pass_inc_gen",
         "@heir//lib/Analysis/RotationAnalysis",
         "@heir//lib/Dialect/TensorExt/IR:Dialect",
diff --git a/lib/Dialect/TensorExt/Transforms/ImplementRotateAndReduce.cpp b/lib/Dialect/TensorExt/Transforms/ImplementRotateAndReduce.cpp
@@ -42,17 +42,18 @@ LogicalResult convertRotateAndReduceOp(RotateAndReduceOp op) {
   unsigned period = op.getPeriod().getZExtValue();
   std::shared_ptr<ArithmeticDagNode<SSAValue>> implementedKernel;
   SSAValue vectorLeaf(input);
+  std::optional<SSAValue> plaintextsLeaf = std::nullopt;
 
-  if (!op.getPlaintexts()) {
-    implementedKernel = implementRotateAndReduce(
-        vectorLeaf, std::optional<SSAValue>(), period, steps);
+  if (op.getPlaintexts()) {
+    plaintextsLeaf = std::optional<SSAValue>(op.getPlaintexts());
   }
 
-  TypedValue<RankedTensorType> plaintexts = op.getPlaintexts();
-  auto plaintextsLeaf = std::optional<SSAValue>(plaintexts);
-  implementedKernel =
-      implementRotateAndReduce(vectorLeaf, plaintextsLeaf, period, steps);
-
+  std::string reduceOp = "arith.addi";
+  if (op.getReduceOp().has_value() && *op.getReduceOp() != nullptr) {
+    reduceOp = op.getReduceOp()->getValue().str();
+  }
+  implementedKernel = implementRotateAndReduce(vectorLeaf, plaintextsLeaf,
+                                               period, steps, reduceOp);
   IRRewriter rewriter(op.getContext());
   rewriter.setInsertionPointAfter(op);
   ImplicitLocOpBuilder b(op.getLoc(), rewriter);
diff --git a/lib/Dialect/TensorExt/Transforms/ImplementRotateAndReduce.h b/lib/Dialect/TensorExt/Transforms/ImplementRotateAndReduce.h
@@ -13,6 +13,8 @@ namespace tensor_ext {
 #define GEN_PASS_DECL_IMPLEMENTROTATEANDREDUCE
 #include "lib/Dialect/TensorExt/Transforms/Passes.h.inc"
 
+LogicalResult convertRotateAndReduceOp(RotateAndReduceOp op);
+
 }  // namespace tensor_ext
 }  // namespace heir
 }  // namespace mlir
diff --git a/lib/Dialect/TensorExt/Transforms/RotateAndReduce.cpp b/lib/Dialect/TensorExt/Transforms/RotateAndReduce.cpp
@@ -1,9 +1,8 @@
 #include "lib/Dialect/TensorExt/Transforms/RotateAndReduce.h"
 
-#include <cstdint>
-
 #include "lib/Analysis/RotationAnalysis/RotationAnalysis.h"
 #include "lib/Dialect/TensorExt/IR/TensorExtOps.h"
+#include "lib/Dialect/TensorExt/Transforms/ImplementRotateAndReduce.h"
 #include "llvm/include/llvm/ADT/TypeSwitch.h"              // from @llvm-project
 #include "llvm/include/llvm/Support/Debug.h"               // from @llvm-project
 #include "mlir/include/mlir/Analysis/DataFlow/Utils.h"     // from @llvm-project
@@ -13,13 +12,12 @@
 #include "mlir/include/mlir/Dialect/Tensor/IR/Tensor.h"    // from @llvm-project
 #include "mlir/include/mlir/IR/BuiltinAttributes.h"        // from @llvm-project
 #include "mlir/include/mlir/IR/BuiltinTypes.h"             // from @llvm-project
-#include "mlir/include/mlir/IR/ImplicitLocOpBuilder.h"     // from @llvm-project
 #include "mlir/include/mlir/IR/Iterators.h"                // from @llvm-project
 #include "mlir/include/mlir/IR/Visitors.h"                 // from @llvm-project
 #include "mlir/include/mlir/Support/LLVM.h"                // from @llvm-project
 #include "mlir/include/mlir/Support/LogicalResult.h"       // from @llvm-project
 
-#define DEBUG_NAME "rotate-and-reduce"
+#define DEBUG_TYPE "rotate-and-reduce"
 
 namespace mlir {
 namespace heir {
@@ -28,13 +26,12 @@ namespace tensor_ext {
 #define GEN_PASS_DEF_ROTATEANDREDUCE
 #include "lib/Dialect/TensorExt/Transforms/Passes.h.inc"
 
-/// A pass that searches for a length N sequence of binary operations that
+/// A pass that searches for a length N sequence of add or mul operations that
 /// reduces a length N vector to a single scalar, and replaces it with a
 /// logarithmic number of rotations and binary operations.
 struct RotateAndReduce : impl::RotateAndReduceBase<RotateAndReduce> {
   using RotateAndReduceBase::RotateAndReduceBase;
 
-  // TODO(#2123): Rewrite this to use the tensor_ext.rotate_and_reduce op.
   template <typename ArithOp>
   void tryReplaceRotations(ArithOp op,
                            const rotation_analysis::PartialReduction& reduction,
@@ -43,17 +40,16 @@ struct RotateAndReduce : impl::RotateAndReduceBase<RotateAndReduce> {
                << "Trying to replace rotations ending in " << *op << "\n");
     auto b = ImplicitLocOpBuilder(op->getLoc(), op);
     auto tensor = reduction.getTensor();
-    Operation* finalOp = nullptr;
     auto tensorShape =
         mlir::cast<RankedTensorType>(tensor.getType()).getShape();
-    for (int64_t shiftSize = tensorShape[0] / 2; shiftSize > 0;
-         shiftSize /= 2) {
-      auto rotatedTensor = tensor_ext::RotateOp::create(
-          b, tensor, arith::ConstantOp::create(b, b.getIndexAttr(shiftSize)));
-      auto addOp = ArithOp::create(b, tensor, rotatedTensor);
-      finalOp = addOp;
-      tensor = addOp->getResult(0);
-    }
+
+    // Emit a rotate_and_reduce op whose reduceOp attribute is this op's name
+    auto rotateAndReduceOp = tensor_ext::RotateAndReduceOp::create(
+        b, tensor,
+        /*period=*/1,
+        /*steps=*/tensorShape[0],
+        /*reduceOp=*/op->getName().getStringRef());
+    Operation* finalOp = rotateAndReduceOp;
 
     [[maybe_unused]] auto* parentOp = op->getParentOp();
     if (extraction) {
@@ -69,6 +65,12 @@ struct RotateAndReduce : impl::RotateAndReduceBase<RotateAndReduce> {
     }
     if (finalOp) op->replaceAllUsesWith(finalOp);
     LLVM_DEBUG(llvm::dbgs() << "Post-replacement: " << *parentOp << "\n");
+
+    // Convert the rotate_and_reduce op to its implementation immediately
+    if (failed(convertRotateAndReduceOp(rotateAndReduceOp))) {
+      LLVM_DEBUG(llvm::dbgs() << "Failed to convert rotate_and_reduce op\n");
+      return;
+    }
   }
 
   void runOnOperation() override {
@@ -98,20 +100,22 @@ struct RotateAndReduce : impl::RotateAndReduceBase<RotateAndReduce> {
 
             for (const auto& reduction :
                  rotationAnalysis.getRootedReductionsAt(result)) {
-              if (reduction.isComplete()) {
+              if (reduction.isComplete() &&
+                  cast<RankedTensorType>(reduction.getTensor().getType())
+                          .getNumElements() > 1) {
                 llvm::TypeSwitch<Operation&>(*op)
                     .Case<arith::AddIOp>([&](auto arithOp) {
                       tryReplaceRotations<arith::AddIOp>(arithOp, reduction,
                                                          extraction);
                     })
-                    .Case<arith::MulIOp>([&](auto arithOp) {
-                      tryReplaceRotations<arith::MulIOp>(arithOp, reduction,
-                                                         extraction);
-                    })
                     .Case<arith::AddFOp>([&](auto arithOp) {
                       tryReplaceRotations<arith::AddFOp>(arithOp, reduction,
                                                          extraction);
                     })
+                    .Case<arith::MulIOp>([&](auto arithOp) {
+                      tryReplaceRotations<arith::MulIOp>(arithOp, reduction,
+                                                         extraction);
+                    })
                     .Case<arith::MulFOp>([&](auto arithOp) {
                       tryReplaceRotations<arith::MulFOp>(arithOp, reduction,
                                                          extraction);
diff --git a/lib/Kernel/KernelImplementation.h b/lib/Kernel/KernelImplementation.h
@@ -132,15 +132,30 @@ template <typename T>
 std::enable_if_t<std::is_base_of<AbstractValue, T>::value,
                  std::shared_ptr<ArithmeticDagNode<T>>>
 implementRotateAndReduce(const T& vector, std::optional<T> plaintexts,
-                         int64_t period, int64_t steps) {
+                         int64_t period, int64_t steps,
+                         const std::string& reduceOp = "arith.addi") {
   using NodeTy = ArithmeticDagNode<T>;
   auto vectorDag = NodeTy::leaf(vector);
 
+  auto performReduction = [&](std::shared_ptr<NodeTy> left,
+                              std::shared_ptr<NodeTy> right) {
+    if (reduceOp == "arith.addi" || reduceOp == "arith.addf") {
+      return NodeTy::add(left, right);
+    }
+
+    if (reduceOp == "arith.muli" || reduceOp == "arith.mulf") {
+      return NodeTy::mul(left, right);
+    }
+
+    // Default to add for unknown operations
+    return NodeTy::add(left, right);
+  };
+
   if (!plaintexts.has_value()) {
     for (int64_t shiftSize = steps / 2; shiftSize > 0; shiftSize /= 2) {
       auto rotated = NodeTy::leftRotate(vectorDag, shiftSize * period);
-      auto added = NodeTy::add(vectorDag, rotated);
-      vectorDag = added;
+      auto reduced = performReduction(vectorDag, rotated);
+      vectorDag = reduced;
     }
     return vectorDag;
   }
@@ -191,12 +206,13 @@ implementRotateAndReduce(const T& vector, std::optional<T> plaintexts,
       auto rotatedPlaintext =
           NodeTy::leftRotate(plaintext, plaintextRotationAmount);
       auto multiplied = NodeTy::mul(rotatedPlaintext, babyStepVals[i]);
-      innerSum =
-          innerSum == nullptr ? multiplied : NodeTy::add(innerSum, multiplied);
+      innerSum = innerSum == nullptr ? multiplied
+                                     : performReduction(innerSum, multiplied);
     }
 
     auto rotatedSum = NodeTy::leftRotate(innerSum, period * j * giantStepSize);
-    result = result == nullptr ? rotatedSum : NodeTy::add(result, rotatedSum);
+    result =
+        result == nullptr ? rotatedSum : performReduction(result, rotatedSum);
   }
 
   return result;
diff --git a/lib/Transforms/ConvertToCiphertextSemantics/ConvertToCiphertextSemantics.cpp b/lib/Transforms/ConvertToCiphertextSemantics/ConvertToCiphertextSemantics.cpp
@@ -944,7 +944,8 @@ struct ConvertLinalgMatvecNewLayout
     auto rotateAndReduceOp = rewriter.create<tensor_ext::RotateAndReduceOp>(
         op.getLoc(), packedVectorType, packedVector, packedMatrix,
         /*period=*/rewriter.getIndexAttr(1),
-        /*steps=*/rewriter.getIndexAttr(numRotations));
+        /*steps=*/rewriter.getIndexAttr(numRotations),
+        /*reduceOp=*/rewriter.getStringAttr("arith.addf"));
     rotateAndReduceOp->setAttr(kLayoutAttrName, layoutAttr);
     setMaterializedAttr(rotateAndReduceOp);
 
diff --git a/tests/Dialect/TensorExt/Transforms/rotate_and_reduce.mlir b/tests/Dialect/TensorExt/Transforms/rotate_and_reduce.mlir