-rw-r--r--  mlir/include/mlir/Dialect/GPU/IR/GPUOps.td |  3
-rw-r--r--  mlir/test/Dialect/GPU/canonicalize.mlir    | 33
2 files changed, 35 insertions(+), 1 deletion(-)
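This change adds the RecursiveMemoryEffects trait to gpu.launch, so the op's overall memory effects are inferred from the ops nested inside its region. A launch whose body contains no side-effecting ops is then trivially dead, and the canonicalizer can erase it outright. Below is a minimal sketch of the intended behavior; the function and value names are illustrative, not taken from the patch:

    // Before -canonicalize: the body only computes a pure value that is
    // never stored or otherwise observed outside the launch.
    func.func @dead_launch(%sz: index) {
      gpu.launch blocks(%bx, %by, %bz) in (%g0 = %sz, %g1 = %sz, %g2 = %sz)
                 threads(%tx, %ty, %tz) in (%b0 = %sz, %b1 = %sz, %b2 = %sz) {
        %sum = arith.addi %bx, %tx : index  // pure op, result unused
        gpu.terminator
      }
      return
    }

    // After -canonicalize: with RecursiveMemoryEffects, the launch has no
    // observable effects, so the whole op is erased.
    func.func @dead_launch(%sz: index) {
      return
    }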
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 2e21cd7..c72fde2 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -672,7 +672,8 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func", [
 def GPU_LaunchOp : GPU_Op<"launch", [
       AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface,
-      DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
+      DeclareOpInterfaceMethods<InferIntRangeInterface>,
+      RecursiveMemoryEffects]>,
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
index c2abb96..372dd78 100644
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -11,6 +11,8 @@ func.func @fold_wait_op_test1() {
 }
 // CHECK-NOT: gpu.wait

+// -----
+
 // Erase duplicate barriers.
 // CHECK-LABEL: func @erase_barriers
 // CHECK-NEXT: gpu.barrier
@@ -21,6 +23,8 @@ func.func @erase_barriers() {
   return
 }

+// -----
+
 // Replace uses of gpu.wait op with its async dependency.
 // CHECK-LABEL: func @fold_wait_op_test2
 func.func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
@@ -38,6 +42,8 @@ func.func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
 // CHECK-NEXT: gpu.alloc async [%[[TOKEN1]]] ()
 // CHECK-NEXT: return

+// -----
+
 // CHECK-LABEL: func @fold_memcpy_op
 func.func @fold_memcpy_op(%arg0: i1) {
   %cst = arith.constant 0.000000e+00 : f16
@@ -60,6 +66,8 @@ func.func @fold_memcpy_op(%arg0: i1) {
 }
 // CHECK-NOT: gpu.memcpy

+// -----
+
 // We cannot fold memcpy here as dest is a block argument.
 // CHECK-LABEL: func @do_not_fold_memcpy_op1
 func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
@@ -75,6 +83,8 @@ func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
 }
 // CHECK: gpu.memcpy

+// -----
+
 // We cannot fold gpu.memcpy as it is used by an op having read effect on dest.
 // CHECK-LABEL: func @do_not_fold_memcpy_op2
 func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
@@ -92,6 +102,8 @@ func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
 }
 // CHECK: gpu.memcpy

+// -----
+
 // We cannot fold gpu.memcpy, as the defining op of dest is not an alloc-like op.
 // CHECK-LABEL: func @do_not_fold_memcpy_op3
 func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
@@ -102,6 +114,8 @@ func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
 }
 // CHECK: gpu.memcpy

+// -----
+
 // CHECK-LABEL: @memcpy_after_cast
 func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   // CHECK-NOT: memref.cast
@@ -112,6 +126,8 @@ func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   return
 }

+// -----
+
 // CHECK-LABEL: @memset_after_cast
 func.func @memset_after_cast(%arg0: memref<10xf32>, %arg1: f32) {
   // CHECK-NOT: memref.cast
@@ -227,3 +243,20 @@ func.func @make_subgroup_reduce_uniform() {
   }
   return
 }
+
+// -----
+
+// The GPU kernel does not have any side-effecting ops, so the entire
+// gpu.launch op can fold away.
+
+// CHECK-LABEL: func @gpu_launch_without_side_effects
+// CHECK-NOT: gpu.launch
+func.func @gpu_launch_without_side_effects() {
+  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
+  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
+             threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
+    %1 = arith.addi %arg0, %arg1 : index
+    gpu.terminator
+  }
+  return
+}
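The // ----- markers inserted between the existing cases are split points: when the file runs under --split-input-file, each case canonicalizes as its own module, so the new launch-erasing pattern cannot disturb the checks of neighboring tests. A RUN line along these lines would drive the new case; the exact flags are an assumption inferred from the split markers and the unregistered test.test1 op, not quoted from the patch:

    // RUN: mlir-opt %s --canonicalize --split-input-file --allow-unregistered-dialect | FileCheck %s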