diff options
Diffstat (limited to 'mlir/test/Conversion/SCFToGPU/parallel_loop.mlir')
| -rw-r--r-- | mlir/test/Conversion/SCFToGPU/parallel_loop.mlir | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
index 1dbce05..2f192df 100644
--- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -641,3 +641,92 @@ func.func @parallel_reduction_1d_outside() {
 // CHECK: scf.parallel
 // CHECK-NEXT: scf.parallel
 // CHECK: scf.reduce
+
+// -----
+
+// An outer 3-D scf.parallel mapped to block_z/block_y/block_x whose body
+// performs memref loads and stores and then runs an inner 1-D scf.parallel
+// mapped to thread_x. The checks after the function expect a gpu.launch and
+// no leftover scf.parallel.
+// CHECK-LABEL: @nested_parallel_with_side_effect
+func.func @nested_parallel_with_side_effect() {
+  %c65536 = arith.constant 65536 : index
+  %c2 = arith.constant 2 : index
+  %c256 = arith.constant 256 : index
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %c1 = arith.constant 1 : index
+  %alloc_0 = memref.alloc() : memref<2x256x256xf32>
+  %alloc_1 = memref.alloc() : memref<2x4x256x256xf32>
+  %alloc_2 = memref.alloc() : memref<4x4xf32>
+  %alloc_3 = memref.alloc() : memref<4x4xf32>
+  scf.parallel (%arg2, %arg3, %arg4) = (%c0, %c0, %c0) to (%c2, %c4, %c65536) step (%c1, %c1, %c1) {
+    %1 = arith.remsi %arg4, %c256 : index
+    %2 = arith.divsi %arg4, %c256 : index
+    %4 = memref.load %alloc_0[%arg2, %2, %1] : memref<2x256x256xf32>
+    memref.store %4, %alloc_1[%arg2, %arg3, %2, %1] : memref<2x4x256x256xf32>
+    scf.parallel (%arg5) = (%c0) to (%c4) step (%c1) {
+      %5 = memref.load %alloc_2[%arg5, %c0] : memref<4x4xf32>
+      memref.store %5, %alloc_3[%arg5, %c0] : memref<4x4xf32>
+      scf.reduce
+    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+    scf.reduce
+  } {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+  return
+}
+
+// CHECK: gpu.launch
+// CHECK-NOT: scf.parallel
+
+// -----
+
+func.func @scf2gpu_index_creation_2d() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c32 = arith.constant 32 : index
+
+  // Single 2-D scf.parallel mapped to block_x and thread_x.
+  // Use both IVs so the conversion must compute indices.
+  scf.parallel (%bx, %tx) = (%c0, %c0) to (%c32, %c32) step (%c1, %c1) {
+    %u = arith.addi %bx, %c0 : index
+    %v = arith.addi %tx, %c0 : index
+  } {
+    mapping = [
+      #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+      #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
+    ]
+  }
+  return
+}
+
+// Both induction variables are used, so the checks pin one affine.apply per
+// variable and require each arith.addi to consume its own result.
+// CHECK-LABEL: func @scf2gpu_index_creation_2d
+// CHECK: gpu.launch
+// CHECK: %[[BX:.*]] = affine.apply
+// CHECK: %[[TX:.*]] = affine.apply
+// CHECK: arith.addi %[[BX]],
+// CHECK: arith.addi %[[TX]],
+
+// -----
+
+func.func @scf2gpu_index_creation_1d() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+
+  // Single 1-D scf.parallel mapped to thread_x; the IV feeds arith.addi.
+  scf.parallel (%t) = (%c0) to (%c64) step (%c1) {
+    %w = arith.addi %t, %c0 : index
+  } {
+    mapping = [
+      #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
+    ]
+  }
+  return
+}
+
+// CHECK-LABEL: func @scf2gpu_index_creation_1d
+// CHECK: gpu.launch
+// CHECK: %[[IDX:.*]] = affine.apply
+// CHECK: arith.addi %[[IDX]],
