
Commit dd9353b

[CIR][ThroughMLIR] Lower CIR IV load with SCF IV move operation (#729)
Previously, when lowering the induction variable (IV) in a forOp, we removed the IV load and replaced its users with the SCF induction variable (scf.IV). However, the users of the CIR IV may still be CIR operations while the forOp is being lowered, so a CIR operation could end up with scf.IV as an operand, which has an MLIR builtin integer type instead of a CIR type. This commit instead lowers the CIR load of IV_ADDR to "arith.addi scf.IV, 0", so scf.IV can be propagated by the OpAdaptor when the individual IV users are lowered. This simplifies the lowering and fixes the issue. The redundant arith.addi can be removed by later MLIR passes.
1 parent c849210 commit dd9353b
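
To illustrate the issue, here is a minimal, hand-written sketch (the operations, SSA names, and types are illustrative only, not taken from this commit). With the previous approach, the cir.load of the IV was erased and its users were rewired directly to the scf.for induction variable, so a user that was still a CIR operation could be left in a state like this:

    scf.for %iv = %lb to %ub step %c1 : i32 {
      // %iv is a builtin i32 value, but this still-unlowered CIR user expects
      // a CIR integer (!s32i): the operand type mismatch described above.
      cir.store %iv, %elem_addr : !s32i, !cir.ptr<!s32i>
      ...
    }

With the change in this commit, the load is instead replaced by an arith.addi of the induction variable and 0, and the conversion framework's OpAdaptor hands that value to each IV user as the user itself is lowered.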

File tree

3 files changed: 111 additions and 257 deletions

clang/lib/CIR/Lowering/ThroughMLIR/LowerCIRLoopToSCF.cpp

Lines changed: 8 additions & 20 deletions
@@ -204,19 +204,6 @@ void SCFLoop::analysis() {
   assert(upperBound && "can't find loop upper bound");
 }
 
-// Return true if op operation is in the loop body.
-static bool isInLoopBody(mlir::Operation *op) {
-  mlir::Operation *parentOp = op->getParentOp();
-  if (!parentOp)
-    return false;
-  if (isa<mlir::scf::ForOp>(parentOp))
-    return true;
-  auto forOp = dyn_cast<mlir::cir::ForOp>(parentOp);
-  if (forOp && (&forOp.getBody() == op->getParentRegion()))
-    return true;
-  return false;
-}
-
 void SCFLoop::transferToSCFForOp() {
   auto ub = getUpperBound();
   auto lb = getLowerBound();
@@ -236,12 +223,13 @@ void SCFLoop::transferToSCFForOp() {
         "Not support lowering loop with break, continue or if yet");
     // Replace the IV usage to scf loop induction variable.
     if (isIVLoad(op, IVAddr)) {
-      auto newIV = scfForOp.getInductionVar();
-      op->getResult(0).replaceAllUsesWith(newIV);
-      // Only erase the IV load in the loop body because all the operations
-      // in loop step and condition regions will be erased.
-      if (isInLoopBody(op))
-        rewriter->eraseOp(op);
+      // Replace CIR IV load with arith.addi scf.IV, 0.
+      // The replacement makes the SCF IV can be automatically propogated
+      // by OpAdaptor for individual IV user lowering.
+      // The redundant arith.addi can be removed by later MLIR passes.
+      rewriter->setInsertionPoint(op);
+      auto newIV = plusConstant(scfForOp.getInductionVar(), loc, 0);
+      rewriter->replaceOp(op, newIV.getDefiningOp());
     }
     return mlir::WalkResult::advance();
   });
@@ -318,4 +306,4 @@ void populateCIRLoopToSCFConversionPatterns(mlir::RewritePatternSet &patterns,
       converter, patterns.getContext());
 }
 
-} // namespace cir
+} // namespace cir
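
In IR terms, the rewrite in transferToSCFForOp() above now turns an IV load in the loop body into an add of zero at the same position, instead of erasing it. A rough before/after sketch (SSA names and the !s32i element type are illustrative, not taken from this commit):

    // Before: the CIR load of the induction variable's address.
    %v = cir.load %iv_addr : !cir.ptr<!s32i>, !s32i

    // After: the load is replaced by an addition of the scf.for induction
    // variable and 0, presumably what the plusConstant helper in the diff
    // builds through the rewriter.
    %zero = arith.constant 0 : i32
    %v = arith.addi %iv, %zero : i32

When a user of %v is later lowered, its OpAdaptor already provides the converted i32 value, which is what makes the per-user lowering straightforward.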

clang/test/CIR/Lowering/ThroughMLIR/for.cir

Lines changed: 0 additions & 237 deletions
This file was deleted.
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -fno-clangir-direct-lowering -emit-mlir %s -o %t.mlir
+// RUN: FileCheck --input-file=%t.mlir %s
+
+int a[101], b[101];
+
+void constantLoopBound() {
+  for (int i = 0; i < 100; ++i)
+    a[i] = 3;
+}
+// CHECK-LABEL: func.func @_Z17constantLoopBoundv() {
+// CHECK: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK: %[[C100:.*]] = arith.constant 100 : i32
+// CHECK: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C100]] step %[[C1]] : i32 {
+// CHECK: %[[C3:.*]] = arith.constant 3 : i32
+// CHECK: %[[BASE:.*]] = memref.get_global @a : memref<101xi32>
+// CHECK: %[[C0_i32:.*]] = arith.constant 0 : i32
+// CHECK: %[[IV:.*]] = arith.addi %[[I]], %[[C0_i32]] : i32
+// CHECK: %[[INDEX:.*]] = arith.index_cast %[[IV]] : i32 to index
+// CHECK: memref.store %[[C3]], %[[BASE]][%[[INDEX]]] : memref<101xi32>
+// CHECK: }
+
+void constantLoopBound_LE() {
+  for (int i = 0; i <= 100; ++i)
+    a[i] = 3;
+}
+// CHECK-LABEL: func.func @_Z20constantLoopBound_LEv() {
+// CHECK: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK: %[[C100:.*]] = arith.constant 100 : i32
+// CHECK: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK: %[[C101:.*]] = arith.addi %c100_i32, %c1_i32 : i32
+// CHECK: %[[C1_STEP:.*]] = arith.constant 1 : i32
+// CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C101]] step %[[C1_STEP]] : i32 {
+// CHECK: %[[C3:.*]] = arith.constant 3 : i32
+// CHECK: %[[BASE:.*]] = memref.get_global @a : memref<101xi32>
+// CHECK: %[[C0_i32:.*]] = arith.constant 0 : i32
+// CHECK: %[[IV:.*]] = arith.addi %[[I]], %[[C0_i32]] : i32
+// CHECK: %[[INDEX:.*]] = arith.index_cast %[[IV]] : i32 to index
+// CHECK: memref.store %[[C3]], %[[BASE]][%[[INDEX]]] : memref<101xi32>
+// CHECK: }
+
+void variableLoopBound(int l, int u) {
+  for (int i = l; i < u; ++i)
+    a[i] = 3;
+}
+// CHECK-LABEL: func.func @_Z17variableLoopBoundii
+// CHECK: memref.store %arg0, %alloca[] : memref<i32>
+// CHECK: memref.store %arg1, %alloca_0[] : memref<i32>
+// CHECK: %[[LOWER:.*]] = memref.load %alloca[] : memref<i32>
+// CHECK: %[[UPPER:.*]] = memref.load %alloca_0[] : memref<i32>
+// CHECK: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK: scf.for %[[I:.*]] = %[[LOWER]] to %[[UPPER]] step %[[C1]] : i32 {
+// CHECK: %[[C3:.*]] = arith.constant 3 : i32
+// CHECK: %[[BASE:.*]] = memref.get_global @a : memref<101xi32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK: %[[IV:.*]] = arith.addi %[[I]], %[[C0]] : i32
+// CHECK: %[[INDEX:.*]] = arith.index_cast %[[IV]] : i32 to index
+// CHECK: memref.store %[[C3]], %[[BASE]][%[[INDEX]]] : memref<101xi32>
+// CHECK: }
+
+void ariableLoopBound_LE(int l, int u) {
+  for (int i = l; i <= u; i+=4)
+    a[i] = 3;
+}
+// CHECK-LABEL: func.func @_Z19ariableLoopBound_LEii
+// CHECK: memref.store %arg0, %alloca[] : memref<i32>
+// CHECK: memref.store %arg1, %alloca_0[] : memref<i32>
+// CHECK: %[[LOWER:.*]] = memref.load %alloca[] : memref<i32>
+// CHECK: %[[UPPER_DEC_1:.*]] = memref.load %alloca_0[] : memref<i32>
+// CHECK: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK: %[[UPPER:.*]] = arith.addi %[[UPPER_DEC_1]], %[[C1]] : i32
+// CHECK: %[[C4:.*]] = arith.constant 4 : i32
+// CHECK: scf.for %[[I:.*]] = %[[LOWER]] to %[[UPPER]] step %[[C4]] : i32 {
+// CHECK: %[[C3:.*]] = arith.constant 3 : i32
+// CHECK: %[[BASE:.*]] = memref.get_global @a : memref<101xi32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK: %[[IV:.*]] = arith.addi %[[I]], %[[C0]] : i32
+// CHECK: %[[INDEX:.*]] = arith.index_cast %[[IV]] : i32 to index
+// CHECK: memref.store %[[C3]], %[[BASE]][%[[INDEX]]] : memref<101xi32>
+// CHECK: }
+
+void incArray() {
+  for (int i = 0; i < 100; ++i)
+    a[i] += b[i];
+}
+// CHECK-LABEL: func.func @_Z8incArrayv() {
+// CHECK: %[[C0:.*]] = arith.constant 0 : i32
+// CHECK: %[[C100:.*]] = arith.constant 100 : i32
+// CHECK: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C100]] step %[[C1]] : i32 {
+// CHECK: %[[B:.*]] = memref.get_global @b : memref<101xi32>
+// CHECK: %[[C0_2:.*]] = arith.constant 0 : i32
+// CHECK: %[[IV2:.*]] = arith.addi %[[I]], %[[C0_2]] : i32
+// CHECK: %[[INDEX_2:.*]] = arith.index_cast %[[IV2]] : i32 to index
+// CHECK: %[[B_VALUE:.*]] = memref.load %[[B]][%[[INDEX_2]]] : memref<101xi32>
+// CHECK: %[[A:.*]] = memref.get_global @a : memref<101xi32>
+// CHECK: %[[C0_1:.*]] = arith.constant 0 : i32
+// CHECK: %[[IV1:.*]] = arith.addi %[[I]], %[[C0_1]] : i32
+// CHECK: %[[INDEX_1:.*]] = arith.index_cast %[[IV1]] : i32 to index
+// CHECK: %[[A_VALUE:.*]] = memref.load %[[A]][%[[INDEX_1]]] : memref<101xi32>
+// CHECK: %[[SUM:.*]] = arith.addi %[[A_VALUE]], %[[B_VALUE]] : i32
+// CHECK: memref.store %[[SUM]], %[[A]][%[[INDEX_1]]] : memref<101xi32>
+// CHECK: }
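
The arith.addi of the loop induction variable and the constant 0 that these CHECK lines match is deliberately redundant. As the commit message notes, later MLIR passes can remove it; for example, the standard canonicalizer folds an add with a zero constant. A sketch of what that cleanup would look like (not something this test runs):

    // Before canonicalization, as matched above:
    %c0 = arith.constant 0 : i32
    %iv2 = arith.addi %i, %c0 : i32
    %idx = arith.index_cast %iv2 : i32 to index

    // After running, e.g., mlir-opt --canonicalize, the addi folds to its
    // non-zero operand and the dead constant is removed:
    %idx = arith.index_cast %i : i32 to index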
