1 files changed, 80 insertions, 0 deletions
diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
index 1dbce05..2f192df 100644
--- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -641,3 +641,83 @@ func.func @parallel_reduction_1d_outside() {
 // CHECK: scf.parallel
 // CHECK-NEXT: scf.parallel
 // CHECK: scf.reduce
+
+// -----
+
+// CHECK-LABEL: @nested_parallel_with_side_effect
+func.func @nested_parallel_with_side_effect() {
+  %c65536 = arith.constant 65536 : index
+  %c2 = arith.constant 2 : index
+  %c256 = arith.constant 256 : index
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %c1 = arith.constant 1 : index
+  %alloc_0 = memref.alloc() : memref<2x256x256xf32>
+  %alloc_1 = memref.alloc() : memref<2x4x256x256xf32>
+  %alloc_2 = memref.alloc() : memref<4x4xf32>
+  %alloc_3 = memref.alloc() : memref<4x4xf32>
+  scf.parallel (%arg2, %arg3, %arg4) = (%c0, %c0, %c0) to (%c2, %c4, %c65536) step (%c1, %c1, %c1) {
+    %1 = arith.remsi %arg4, %c256 : index
+    %2 = arith.divsi %arg4, %c256 : index
+    %4 = memref.load %alloc_0[%arg2, %2, %1] : memref<2x256x256xf32>
+    memref.store %4, %alloc_1[%arg2, %arg3, %2, %1] : memref<2x4x256x256xf32>
+    scf.parallel (%arg5) = (%c0) to (%c4) step (%c1) {
+      %5 = memref.load %alloc_2[%arg5, %c0] : memref<4x4xf32>
+      memref.store %5, %alloc_3[%arg5, %c0] : memref<4x4xf32>
+      scf.reduce
+    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+    scf.reduce
+  } {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+  return
+}
+
+// CHECK: gpu.launch
+// CHECK-NOT: scf.parallel
+
+// -----
+
+func.func @scf2gpu_index_creation_2d() {
+  %c0  = arith.constant 0 : index
+  %c1  = arith.constant 1 : index
+  %c32 = arith.constant 32 : index
+
+  // Single 2-D scf.parallel mapped to block_x and thread_x.
+  // Use both IVs so the conversion must compute indices.
+  scf.parallel (%bx, %tx) = (%c0, %c0) to (%c32, %c32) step (%c1, %c1) {
+    %u = arith.addi %bx, %c0 : index
+    %v = arith.addi %tx, %c0 : index
+  } {
+    mapping = [
+      #gpu.loop_dim_map<processor = block_x,  map = (d0) -> (d0), bound = (d0) -> (d0)>,
+      #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
+    ]
+  }
+  return
+}
+
+// CHECK-LABEL: func @scf2gpu_index_creation_2d
+//       CHECK:   gpu.launch
+//       CHECK:     %[[IDX:.*]] = affine.apply
+//       CHECK:     arith.addi %[[IDX]],
+
+// -----
+
+func.func @scf2gpu_index_creation_1d() {
+  %c0  = arith.constant 0 : index
+  %c1  = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+
+  scf.parallel (%t) = (%c0) to (%c64) step (%c1) {
+    %w = arith.addi %t, %c0 : index
+  } {
+    mapping = [
+      #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
+    ]
+  }
+  return
+}
+
+// CHECK-LABEL: func @scf2gpu_index_creation_1d
+//       CHECK:   gpu.launch
+//       CHECK:     %[[IDX:.*]] = affine.apply
+//       CHECK:     arith.addi %[[IDX]],