-rw-r--r--  mlir/include/mlir/Dialect/GPU/IR/GPUOps.td |  3
-rw-r--r--  mlir/test/Dialect/GPU/canonicalize.mlir    | 33
2 files changed, 35 insertions(+), 1 deletion(-)
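This change adds the RecursiveMemoryEffects trait to gpu.launch, so the op's overall memory effects are inferred from the ops nested inside its region. A launch whose body contains no side-effecting ops is then trivially dead, and the canonicalizer can erase it outright. Below is a minimal sketch of the intended behavior; the function and value names are illustrative, not taken from the patch:

    // Before -canonicalize: the body only computes a pure value that is
    // never stored or otherwise observed outside the launch.
    func.func @dead_launch(%sz: index) {
      gpu.launch blocks(%bx, %by, %bz) in (%g0 = %sz, %g1 = %sz, %g2 = %sz)
                 threads(%tx, %ty, %tz) in (%b0 = %sz, %b1 = %sz, %b2 = %sz) {
        %sum = arith.addi %bx, %tx : index  // pure op, result unused
        gpu.terminator
      }
      return
    }

    // After -canonicalize: with RecursiveMemoryEffects, the launch has no
    // observable effects, so the whole op is erased.
    func.func @dead_launch(%sz: index) {
      return
    }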
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 2e21cd7..c72fde2 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -672,7 +672,8 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func", [
 def GPU_LaunchOp : GPU_Op<"launch", [
       AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface,
-      DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
+      DeclareOpInterfaceMethods<InferIntRangeInterface>,
+      RecursiveMemoryEffects]>,
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
index c2abb96..372dd78 100644
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -11,6 +11,8 @@ func.func @fold_wait_op_test1() {
 }
 // CHECK-NOT: gpu.wait

+// -----
+
 // Erase duplicate barriers.
 // CHECK-LABEL: func @erase_barriers
 // CHECK-NEXT: gpu.barrier
@@ -21,6 +23,8 @@ func.func @erase_barriers() {
   return
 }

+// -----
+
 // Replace uses of gpu.wait op with its async dependency.
 // CHECK-LABEL: func @fold_wait_op_test2
 func.func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
@@ -38,6 +42,8 @@ func.func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
 // CHECK-NEXT: gpu.alloc async [%[[TOKEN1]]] ()
 // CHECK-NEXT: return

+// -----
+
 // CHECK-LABEL: func @fold_memcpy_op
 func.func @fold_memcpy_op(%arg0: i1) {
   %cst = arith.constant 0.000000e+00 : f16
@@ -60,6 +66,8 @@ func.func @fold_memcpy_op(%arg0: i1) {
 }
 // CHECK-NOT: gpu.memcpy

+// -----
+
 // We cannot fold memcpy here as dest is a block argument.
 // CHECK-LABEL: func @do_not_fold_memcpy_op1
 func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
@@ -75,6 +83,8 @@ func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
 }
 // CHECK: gpu.memcpy

+// -----
+
 // We cannot fold gpu.memcpy as it is used by an op having read effect on dest.
 // CHECK-LABEL: func @do_not_fold_memcpy_op2
 func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
@@ -92,6 +102,8 @@ func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
 }
 // CHECK: gpu.memcpy

+// -----
+
 // We cannot fold gpu.memcpy, as the defining op of dest is not an alloc-like op.
 // CHECK-LABEL: func @do_not_fold_memcpy_op3
 func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
@@ -102,6 +114,8 @@ func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
 }
 // CHECK: gpu.memcpy

+// -----
+
 // CHECK-LABEL: @memcpy_after_cast
 func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   // CHECK-NOT: memref.cast
@@ -112,6 +126,8 @@ func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   return
 }

+// -----
+
 // CHECK-LABEL: @memset_after_cast
 func.func @memset_after_cast(%arg0: memref<10xf32>, %arg1: f32) {
   // CHECK-NOT: memref.cast
@@ -227,3 +243,20 @@ func.func @make_subgroup_reduce_uniform() {
   }
   return
 }
+
+// -----
+
+// The GPU kernel does not have any side-effecting ops, so the entire
+// gpu.launch op can fold away.
+
+// CHECK-LABEL: func @gpu_launch_without_side_effects
+// CHECK-NOT: gpu.launch
+func.func @gpu_launch_without_side_effects() {
+  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
+  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
+             threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
+    %1 = arith.addi %arg0, %arg1 : index
+    gpu.terminator
+  }
+  return
+}
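The // ----- markers inserted between the existing cases are split points: when the file runs under --split-input-file, each case canonicalizes as its own module, so the new launch-erasing pattern cannot disturb the checks of neighboring tests. A RUN line along these lines would drive the new case; the exact flags are an assumption inferred from the split markers and the unregistered test.test1 op, not quoted from the patch:

    // RUN: mlir-opt %s --canonicalize --split-input-file --allow-unregistered-dialect | FileCheck %s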