author     Nicolas Vasilache <nicolas.vasilache@gmail.com>   2021-03-24 11:24:22 +0000
committer  Nicolas Vasilache <nicolas.vasilache@gmail.com>   2021-03-24 11:51:28 +0000
commit     7716e5535c6b248b5faabd2d1af01415a78da8d7 (patch)
tree       1294d00a922a1898681d8be89870543bf64ca511
parent     e9015bd59519e205c2205fa413c8af7e677cc65d (diff)
[mlir] Fixes to hoist padding
Fix the BlockAndValueMapping update that was missing entries for scf.for op's blockIterArgs.
Skip cloning subtensors of the padded tensor as the logic for these is separate.
Add a filter to drop side-effecting ops.
Tests are beefed up to verify the IR is sound in all hoisting configurations for 2-level 3-D tiled matmul.
Differential Revision: https://reviews.llvm.org/D99255
-rw-r--r--  mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp    | 14
-rw-r--r--  mlir/test/Dialect/Linalg/hoist-padding.mlir        | 73
-rw-r--r--  mlir/test/lib/Transforms/TestLinalgTransforms.cpp  | 12
3 files changed, 90 insertions, 9 deletions
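
For quick orientation before the full diff, here is a condensed C++ sketch of the two fixes described in the commit message, assembled from the Hoisting.cpp hunks below. It is an illustrative excerpt, not a standalone compilable unit: it assumes the enclosing `hoistPaddingOnTensors` scope (`b`, `bvm`, `packedTensor`, `backwardSlice`, `forOp`, `clonedForOp`) that the real function provides.

```cpp
// Condensed from the Hoisting.cpp hunks below; assumes the enclosing
// hoistPaddingOnTensors() scope (b, bvm, packedTensor, forOp, ...).

// (1) While cloning the backward slice, skip subtensors of the packed
//     tensor (those are rewritten separately) and refuse to clone any op
//     with memory side effects.
for (Operation *op : backwardSlice) {
  if (auto subTensor = dyn_cast<SubTensorOp>(op))
    if (bvm.lookupOrDefault(subTensor.source()) == packedTensor)
      continue;
  auto effects = dyn_cast<MemoryEffectOpInterface>(op);
  bool hasNoEffects = !effects || effects.hasNoEffect();
  if (hasNoEffects &&
      (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op)))
    b.clone(*op, bvm);
  // scf.for ops are handled separately by cloning them explicitly (below).
}

// (2) When an scf.for is cloned, map not only its induction variable but
//     also its region iter_args and results, so later lookups through the
//     BlockAndValueMapping resolve to values of the cloned loop.
bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
bvm.map(forOp.getResults(), clonedForOp.getResults());
```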
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
index 3baf9b4..b4a2182c 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -793,7 +793,15 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
   backwardSlice.insert(padTensorOp);
   // Stack step 1. iteratively clone loops and push `packedTensor`.
   for (Operation *op : backwardSlice) {
-    if (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op)) {
+    // Specifically sit out in the subtensor(packedTensor) case: this is the
+    // piece we seek to replace.
+    if (auto subTensor = dyn_cast<SubTensorOp>(op))
+      if (bvm.lookupOrDefault(subTensor.source()) == packedTensor)
+        continue;
+    auto effects = dyn_cast<MemoryEffectOpInterface>(op);
+    bool hasNoEffects = !effects || effects.hasNoEffect();
+    if (hasNoEffects &&
+        (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op))) {
       b.clone(*op, bvm);
       continue;
     }
@@ -808,8 +816,10 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
     auto clonedForOp =
         b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
                              bvm.lookupOrDefault(forOp.upperBound()),
                              bvm.lookupOrDefault(forOp.step()), packedTensor);
-
+    // Map the induction var, region args and results to the `clonedForOp`.
     bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
+    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
+    bvm.map(forOp.getResults(), clonedForOp.getResults());
     assert(clonedForOp->getNumRegions() == 1);
     clonedLoopIvs.push_back(clonedForOp.getInductionVar());
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
index 2459d2a..248aa64 100644
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -1,4 +1,13 @@
-// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+// Specific structural checks are performed on 2-level hoisting
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=2 -canonicalize | FileCheck %s
+
+// IR verification is performed on [0-6]-level hoisting
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=0 | FileCheck %s --check-prefix=VERIFIER-ONLY
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=1 | FileCheck %s --check-prefix=VERIFIER-ONLY
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=3 | FileCheck %s --check-prefix=VERIFIER-ONLY
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=4 | FileCheck %s --check-prefix=VERIFIER-ONLY
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=5 | FileCheck %s --check-prefix=VERIFIER-ONLY
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=6 | FileCheck %s --check-prefix=VERIFIER-ONLY
 
 // CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
 // CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
@@ -14,6 +23,7 @@
 // CHECK-SAME: %[[TA:[0-9a-z]+]]: tensor
 // CHECK-SAME: %[[TB:[0-9a-z]+]]: tensor
 // CHECK-SAME: %[[TC:[0-9a-z]+]]: tensor
+// VERIFIER-ONLY-LABEL: func @matmul_tensors
 func @matmul_tensors(
   %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
     -> tensor<?x?xf32>
@@ -140,6 +150,7 @@ func @matmul_tensors(
 #map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
 
 // CHECK-LABEL: func @dot
+// VERIFIER-ONLY-LABEL: func @dot
 func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
     -> tensor<f32>
 {
@@ -217,3 +228,63 @@ func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
   }
   return %4 : tensor<f32>
 }
+
+// -----
+
+// CHECK-LABEL: func @matmul_2d_tiling
+// VERIFIER-ONLY-LABEL: func @matmul_2d_tiling
+func @matmul_2d_tiling(%arg0: tensor<32x128xf32>, %arg1: tensor<128x64xf32>, %arg2: tensor<32x64xf32>) -> tensor<32x64xf32> {
+  %c128 = constant 128 : index
+  %c64 = constant 64 : index
+  %c32 = constant 32 : index
+  %c16 = constant 16 : index
+  %cst = constant 0.000000e+00 : f32
+  %c2 = constant 2 : index
+  %c4 = constant 4 : index
+  %c0 = constant 0 : index
+  %1 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %arg2) -> (tensor<32x64xf32>) {
+    %2 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<32x64xf32>) {
+      %3 = scf.for %arg7 = %c0 to %c128 step %c32 iter_args(%arg8 = %arg6) -> (tensor<32x64xf32>) {
+        %4 = subtensor %arg0[%arg3, %arg7] [16, 32] [1, 1] : tensor<32x128xf32> to tensor<16x32xf32>
+        %5 = subtensor %arg1[%arg7, %arg5] [32, 32] [1, 1] : tensor<128x64xf32> to tensor<32x32xf32>
+        %6 = subtensor %arg8[%arg3, %arg5] [16, 32] [1, 1] : tensor<32x64xf32> to tensor<16x32xf32>
+        %7 = scf.for %arg9 = %c0 to %c16 step %c2 iter_args(%arg10 = %6) -> (tensor<16x32xf32>) {
+          %10 = scf.for %arg11 = %c0 to %c32 step %c4 iter_args(%arg12 = %arg10) -> (tensor<16x32xf32>) {
+            %11 = scf.for %arg13 = %c0 to %c32 step %c16 iter_args(%arg14 = %arg12) -> (tensor<16x32xf32>) {
+              %12 = subtensor %4[%arg9, %arg13] [2, 16] [1, 1] : tensor<16x32xf32> to tensor<2x16xf32>
+              %13 = tensor.cast %12 : tensor<2x16xf32> to tensor<?x?xf32>
+              %14 = subtensor %5[%arg13, %arg11] [16, 4] [1, 1] : tensor<32x32xf32> to tensor<16x4xf32>
+              %15 = tensor.cast %14 : tensor<16x4xf32> to tensor<?x?xf32>
+              %16 = subtensor %arg14[%arg9, %arg11] [2, 4] [1, 1] : tensor<16x32xf32> to tensor<2x4xf32>
+              %17 = tensor.cast %16 : tensor<2x4xf32> to tensor<?x?xf32>
+              %18 = linalg.pad_tensor %13 low[%c0, %c0] high[%c0, %c0] {
+              ^bb0(%arg15: index, %arg16: index):  // no predecessors
+                linalg.yield %cst : f32
+              } : tensor<?x?xf32> to tensor<2x16xf32>
+              %19 = linalg.pad_tensor %15 low[%c0, %c0] high[%c0, %c0] {
+              ^bb0(%arg15: index, %arg16: index):  // no predecessors
+                linalg.yield %cst : f32
+              } : tensor<?x?xf32> to tensor<16x4xf32>
+              %20 = linalg.pad_tensor %17 low[%c0, %c0] high[%c0, %c0] {
+              ^bb0(%arg15: index, %arg16: index):  // no predecessors
+                linalg.yield %cst : f32
+              } : tensor<?x?xf32> to tensor<2x4xf32>
+              %21 = linalg.matmul ins(%18, %19 : tensor<2x16xf32>, tensor<16x4xf32>) outs(%20 : tensor<2x4xf32>) -> tensor<2x4xf32>
+              %22 = tensor.cast %21 : tensor<2x4xf32> to tensor<?x?xf32>
+              %23 = subtensor_insert %22 into %arg14[%arg9, %arg11] [%c2, %c4] [1, 1] : tensor<?x?xf32> into tensor<16x32xf32>
+              scf.yield %23 : tensor<16x32xf32>
+            }
+            scf.yield %11 : tensor<16x32xf32>
+          }
+          scf.yield %10 : tensor<16x32xf32>
+        }
+        %8 = tensor.cast %7 : tensor<16x32xf32> to tensor<?x?xf32>
+        %9 = subtensor_insert %8 into %arg8[%arg3, %arg5] [%c16, %c32] [1, 1] : tensor<?x?xf32> into tensor<32x64xf32>
+        scf.yield %9 : tensor<32x64xf32>
+      }
+      scf.yield %3 : tensor<32x64xf32>
+    }
+    scf.yield %2 : tensor<32x64xf32>
+  }
+  return %1 : tensor<32x64xf32>
+}
diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
index 276a9f7..fd8fb3b 100644
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -84,9 +84,9 @@ struct TestLinalgTransforms
   Option<bool> testTileAndPadPattern{
       *this, "test-tile-and-pad-pattern",
       llvm::cl::desc("Test tile and pad pattern"), llvm::cl::init(false)};
-  Option<bool> testHoistPadding2Levels{*this, "test-hoist-padding-2-level",
-                                       llvm::cl::desc("Test hoist padding"),
-                                       llvm::cl::init(false)};
+  Option<int> testHoistPadding{*this, "test-hoist-padding",
+                               llvm::cl::desc("Test hoist padding"),
+                               llvm::cl::init(0)};
 };
 } // end anonymous namespace
 
@@ -571,9 +571,9 @@ void TestLinalgTransforms::runOnFunction() {
     return applyAffineMinSCFCanonicalizationPatterns(getFunction());
   if (testTileAndPadPattern)
     return applyTileAndPadPattern(getFunction());
-  if (testHoistPadding2Levels) {
-    getFunction().walk([](linalg::PadTensorOp padTensorOp) {
-      (void)linalg::hoistPaddingOnTensors(padTensorOp, 2);
+  if (testHoistPadding) {
+    getFunction().walk([&](linalg::PadTensorOp padTensorOp) {
+      (void)linalg::hoistPaddingOnTensors(padTensorOp, testHoistPadding);
     });
   }
 }