author     Mehdi Amini <joker.eph@gmail.com>    2024-04-29 19:30:38 +0200
committer  GitHub <noreply@github.com>          2024-04-29 19:30:38 +0200
commit     d566a5cd22b4a653f10698f90c691a1452dad5ce
tree       30c641f0f4d251d9af1fb2c54a1fb647b81e997c
parent     cd68d7b3c0ebf6da5e235cfabd5e6381737eb7fe
[MLIR] Improve KernelOutlining to avoid introducing an extra block (#90359)
This fixes a TODO in the code.
-rw-r--r--  mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp  | 34
-rw-r--r--  mlir/test/Dialect/GPU/outlining.mlir                  | 35
2 files changed, 49 insertions(+), 20 deletions(-)
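In effect, the outliner no longer emits a gpu.func whose entry block does nothing but branch into the block cloned from the gpu.launch body; the cloned entry block is merged into the function's own entry block. An abridged sketch of the outlined kernel before and after this patch (names and ops are illustrative, based on the updated CHECK lines below, not verbatim pass output):

  // Before: extra entry block that immediately branches (illustrative).
  gpu.func @launch_kernel(%arg0: f32) kernel {
    %0 = gpu.block_dim x
    cf.br ^bb1
  ^bb1:
    "use"(%arg0) : (f32) -> ()
    gpu.return
  }

  // After: the cloned launch entry block is spliced into the func entry block.
  gpu.func @launch_kernel(%arg0: f32) kernel {
    %0 = gpu.block_dim x
    "use"(%arg0) : (f32) -> ()
    gpu.return
  }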
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 2436113..f5e8055 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -241,24 +241,26 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
map.map(operand.value(), entryBlock.getArgument(operand.index()));
// Clone the region of the gpu.launch operation into the gpu.func operation.
-  // TODO: If cloneInto can be modified such that if a mapping for
-  // a block exists, that block will be used to clone operations into (at the
-  // end of the block), instead of creating a new block, this would be much
-  // cleaner.
launchOpBody.cloneInto(&outlinedFuncBody, map);
-  // Branch from entry of the gpu.func operation to the block that is cloned
-  // from the entry block of the gpu.launch operation.
-  Block &launchOpEntry = launchOpBody.front();
-  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
-  builder.setInsertionPointToEnd(&entryBlock);
-  builder.create<cf::BranchOp>(loc, clonedLaunchOpEntry);
-
-  outlinedFunc.walk([](gpu::TerminatorOp op) {
-    OpBuilder replacer(op);
-    replacer.create<gpu::ReturnOp>(op.getLoc());
-    op.erase();
-  });
+  // Replace the terminator op with returns.
+  for (Block &block : launchOpBody) {
+    Block *clonedBlock = map.lookup(&block);
+    auto terminator = dyn_cast<gpu::TerminatorOp>(clonedBlock->getTerminator());
+    if (!terminator)
+      continue;
+    OpBuilder replacer(terminator);
+    replacer.create<gpu::ReturnOp>(terminator->getLoc());
+    terminator->erase();
+  }
+
+  // Splice now the entry block of the gpu.launch operation at the end of the
+  // gpu.func entry block and erase the redundant block.
+  Block *clonedLaunchOpEntry = map.lookup(&launchOpBody.front());
+  entryBlock.getOperations().splice(entryBlock.getOperations().end(),
+                                    clonedLaunchOpEntry->getOperations());
+  clonedLaunchOpEntry->erase();
+
return outlinedFunc;
}
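The new logic also covers launch bodies with more than one block, which the new test below exercises: every cloned gpu.terminator is rewritten to gpu.return, then the operations of the cloned entry block are spliced onto the end of the gpu.func entry block and the emptied block is erased. A hedged sketch of the resulting kernel shape for that multi-block case (argument and SSA names are illustrative, not actual pass output):

  gpu.func @launchCFG_kernel(%arg0: f32, %arg1: memref<?xf32, 1>) kernel {
    // Index operations injected by the outliner (abridged).
    %bid = gpu.block_id x
    %bdim = gpu.block_dim x
    %tid = gpu.thread_id x
    // Former gpu.launch entry block, spliced straight into the func entry block.
    "use"(%arg0) : (f32) -> ()
    cf.br ^bb1
  ^bb1:
    "some_op"(%bid, %bdim) : (index, index) -> ()
    %0 = memref.load %arg1[%tid] : memref<?xf32, 1>
    // The launch body's gpu.terminator has been replaced with gpu.return.
    gpu.return
  }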
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 601add9..5e4724c 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -54,14 +54,43 @@ func.func @launch() {
// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
// CHECK-NEXT: = gpu.block_dim y
// CHECK-NEXT: = gpu.block_dim z
-// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
-// CHECK-NEXT: ^[[BLOCK]]:
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
// CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
// -----
+// Verify that we can outline a CFG
+// CHECK-LABEL: gpu.func @launchCFG_kernel(
+// CHECK: cf.br
+// CHECK: gpu.return
+func.func @launchCFG() {
+ %0 = "op"() : () -> (f32)
+ %1 = "op"() : () -> (memref<?xf32, 1>)
+ %gDimX = arith.constant 8 : index
+ %gDimY = arith.constant 12 : index
+ %gDimZ = arith.constant 16 : index
+ %bDimX = arith.constant 20 : index
+ %bDimY = arith.constant 24 : index
+ %bDimZ = arith.constant 28 : index
+
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
+ %grid_z = %gDimZ)
+ threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
+ %block_z = %bDimZ) {
+ "use"(%0): (f32) -> ()
+ cf.br ^bb1
+ ^bb1:
+ "some_op"(%bx, %block_x) : (index, index) -> ()
+ %42 = memref.load %1[%tx] : memref<?xf32, 1>
+ gpu.terminator
+ }
+ return
+}
+
+
+// -----
+
// This test checks gpu-out-lining can handle gpu.launch kernel from an llvm.func
// CHECK-LABEL: @launch_from_llvm_func
llvm.func @launch_from_llvm_func() {
@@ -475,8 +504,6 @@ func.func @launch_cluster() {
// CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x
// CHECK-NEXT: = gpu.cluster_dim y
// CHECK-NEXT: = gpu.cluster_dim z
-// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
-// CHECK-NEXT: ^[[BLOCK]]:
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
// CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>