Diffstat (limited to 'mlir/test')
31 files changed, 2223 insertions, 555 deletions
diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir index 45b1a1f..0cbe064 100644 --- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir @@ -195,6 +195,36 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) { // ----- +// ALL-LABEL: func @distinct_objects +// ALL-SAME: (%[[ARG0:.*]]: memref<?xf16>, %[[ARG1:.*]]: memref<?xf32>, %[[ARG2:.*]]: memref<?xf64>) +func.func @distinct_objects(%arg0: memref<?xf16>, %arg1: memref<?xf32>, %arg2: memref<?xf64>) -> (memref<?xf16>, memref<?xf32>, memref<?xf64>) { +// ALL-DAG: %[[CAST_0:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<?xf16> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL-DAG: %[[CAST_1:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<?xf32> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL-DAG: %[[CAST_2:.*]] = builtin.unrealized_conversion_cast %[[ARG2]] : memref<?xf64> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL: %[[PTR_0:.*]] = llvm.extractvalue %[[CAST_0]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL: %[[PTR_1:.*]] = llvm.extractvalue %[[CAST_1]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL: %[[PTR_2:.*]] = llvm.extractvalue %[[CAST_2]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL: %[[TRUE:.*]] = llvm.mlir.constant(true) : i1 +// ALL: llvm.intr.assume %[[TRUE]] ["separate_storage"(%[[PTR_0]], %[[PTR_1]] : !llvm.ptr, !llvm.ptr)] : i1 +// ALL: llvm.intr.assume %[[TRUE]] ["separate_storage"(%[[PTR_0]], %[[PTR_2]] : !llvm.ptr, !llvm.ptr)] : i1 +// ALL: llvm.intr.assume %[[TRUE]] ["separate_storage"(%[[PTR_1]], %[[PTR_2]] : !llvm.ptr, !llvm.ptr)] : i1 + %1, %2, %3 = memref.distinct_objects %arg0, %arg1, %arg2 : memref<?xf16>, memref<?xf32>, memref<?xf64> + return %1, %2, %3 : memref<?xf16>, memref<?xf32>, memref<?xf64> +} + +// ----- + +// ALL-LABEL: func @distinct_objects_noop +// ALL-SAME: (%[[ARG0:.*]]: memref<?xf16>) +func.func @distinct_objects_noop(%arg0: memref<?xf16>) -> memref<?xf16> { +// 1-operand version is noop +// ALL-NEXT: return %[[ARG0]] + %1 = memref.distinct_objects %arg0 : memref<?xf16> + return %1 : memref<?xf16> +} + +// ----- + // CHECK-LABEL: func @assume_alignment_w_offset // CHECK-INTERFACE-LABEL: func @assume_alignment_w_offset func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset: ?>>) { diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index ca3de3a..2fe0995 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -2216,6 +2216,18 @@ func.func @test_mulf1(%arg0 : f32, %arg1 : f32) -> (f32) { return %2 : f32 } +// CHECK-LABEL: @test_mulf2( +func.func @test_mulf2(%arg0 : f32) -> (f32, f32) { + // CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32 + // CHECK-DAG: %[[C0n:.+]] = arith.constant -0.000000e+00 : f32 + // CHECK-NEXT: return %[[C0]], %[[C0n]] + %c0 = arith.constant 0.0 : f32 + %c0n = arith.constant -0.0 : f32 + %0 = arith.mulf %c0, %arg0 fastmath<nnan,nsz> : f32 + %1 = arith.mulf %c0n, %arg0 fastmath<nnan,nsz> : f32 + return %0, %1 : f32, f32 +} + // ----- // CHECK-LABEL: @test_divf( diff --git a/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir b/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir index 99790cc..fcd004a 100644 --- 
a/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir +++ b/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir @@ -85,3 +85,14 @@ func.func @no_expansion(%x: f32) -> f32 { %y = arith.addf %x, %c : f32 func.return %y : f32 } + +// ----- + +func.func @no_promote_select(%c: i1, %x: bf16, %y: bf16) -> bf16 { +// CHECK-LABEL: @no_promote_select +// CHECK-SAME: (%[[C:.+]]: i1, %[[X:.+]]: bf16, %[[Y:.+]]: bf16) +// CHECK: %[[Z:.+]] = arith.select %[[C]], %[[X]], %[[Y]] : bf16 +// CHECK: return %[[Z]] + %z = arith.select %c, %x, %y : bf16 + func.return %z : bf16 +} diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index 0bad151..6134695 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -1068,6 +1068,38 @@ llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) { // ----- +// CHECK-LABEL: rocdl.cvt.scalef32.pk8 +llvm.func @rocdl.cvt.scalef32.pk8(%v8xf32: vector<8xf32>, + %v8xf16: vector<8xf16>, + %v8xbf16: vector<8xbf16>, + %scale: f32) { + + // CHECK: rocdl.cvt.scalef32.pk8.fp8.f32 + %0 = rocdl.cvt.scalef32.pk8.fp8.f32 %v8xf32, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.bf8.f32 + %1 = rocdl.cvt.scalef32.pk8.bf8.f32 %v8xf32, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.fp4.f32 + %2 = rocdl.cvt.scalef32.pk8.fp4.f32 %v8xf32, %scale : i32 + + // CHECK: rocdl.cvt.scalef32.pk8.fp8.f16 + %3 = rocdl.cvt.scalef32.pk8.fp8.f16 %v8xf16, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.bf8.f16 + %4 = rocdl.cvt.scalef32.pk8.bf8.f16 %v8xf16, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.fp4.f16 + %5 = rocdl.cvt.scalef32.pk8.fp4.f16 %v8xf16, %scale : i32 + + // CHECK: rocdl.cvt.scalef32.pk8.fp8.bf16 + %6 = rocdl.cvt.scalef32.pk8.fp8.bf16 %v8xbf16, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.bf8.bf16 + %7 = rocdl.cvt.scalef32.pk8.bf8.bf16 %v8xbf16, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.fp4.bf16 + %8 = rocdl.cvt.scalef32.pk8.fp4.bf16 %v8xbf16, %scale : i32 + + llvm.return +} + +// ----- + // CHECK-LABEL: rocdl.cvt.scale.pk16 llvm.func @rocdl.cvt.scale.pk16(%v3xi32: vector<3xi32>, %scale:i32) { diff --git a/mlir/test/Dialect/Math/sincos-fusion.mlir b/mlir/test/Dialect/Math/sincos-fusion.mlir new file mode 100644 index 0000000..29fb9f1 --- /dev/null +++ b/mlir/test/Dialect/Math/sincos-fusion.mlir @@ -0,0 +1,86 @@ +// RUN: mlir-opt -math-sincos-fusion %s | FileCheck %s + +// CHECK-LABEL: func.func @sincos_fusion( +// CHECK-SAME: %[[ARG0:.*]]: f32, +// CHECK-SAME: %[[ARG1:.*]]: f32) -> (f32, f32, f32, f32) { +// CHECK: %[[VAL_0:.*]], %[[VAL_1:.*]] = math.sincos %[[ARG0]] : f32 +// CHECK: %[[VAL_2:.*]], %[[VAL_3:.*]] = math.sincos %[[ARG1]] : f32 +// CHECK: return %[[VAL_0]], %[[VAL_1]], %[[VAL_3]], %[[VAL_2]] : f32, f32, f32, f32 +// CHECK: } +func.func @sincos_fusion(%arg0 : f32, %arg1 : f32) -> (f32, f32, f32, f32) { + %0 = math.sin %arg0 : f32 + %1 = math.cos %arg0 : f32 + + %2 = math.cos %arg1 : f32 + %3 = math.sin %arg1 : f32 + + func.return %0, %1, %2, %3 : f32, f32, f32, f32 +} + +func.func private @sink(%arg0 : f32) + +// CHECK: func.func private @sink(f32) +// CHECK-LABEL: func.func @sincos_ensure_ssa_dominance( +// CHECK-SAME: %[[ARG0:.*]]: f32, +// CHECK-SAME: %[[ARG1:.*]]: f32) -> (f32, f32, f32, f32) { +// CHECK: %[[VAL_0:.*]], %[[VAL_1:.*]] = math.sincos %[[ARG0]] : f32 +// CHECK: call @sink(%[[VAL_0]]) : (f32) -> () +// CHECK: %[[VAL_2:.*]], %[[VAL_3:.*]] = math.sincos %[[ARG1]] : f32 +// CHECK: call 
@sink(%[[VAL_3]]) : (f32) -> () +// CHECK: return %[[VAL_0]], %[[VAL_1]], %[[VAL_3]], %[[VAL_2]] : f32, f32, f32, f32 +// CHECK: } +func.func @sincos_ensure_ssa_dominance(%arg0 : f32, %arg1 : f32) -> (f32, f32, f32, f32) { + %0 = math.sin %arg0 : f32 + func.call @sink(%0) : (f32) -> () + %1 = math.cos %arg0 : f32 + %2 = math.cos %arg1 : f32 + func.call @sink(%2) : (f32) -> () + %3 = math.sin %arg1 : f32 + func.return %0, %1, %2, %3 : f32, f32, f32, f32 +} + +// CHECK-LABEL: func.func @sincos_fusion_no_match_fmf( +// CHECK-SAME: %[[ARG0:.*]]: f32) -> (f32, f32) { +// CHECK: %[[VAL_0:.*]] = math.sin %[[ARG0]] fastmath<contract> : f32 +// CHECK: %[[VAL_1:.*]] = math.cos %[[ARG0]] : f32 +// CHECK: return %[[VAL_0]], %[[VAL_1]] : f32, f32 +// CHECK: } +func.func @sincos_fusion_no_match_fmf(%arg0 : f32) -> (f32, f32) { + %0 = math.sin %arg0 fastmath<contract> : f32 + %1 = math.cos %arg0 : f32 + func.return %0, %1 : f32, f32 +} + +// CHECK-LABEL: func.func @sincos_no_fusion_different_block( +// CHECK-SAME: %[[ARG0:.*]]: f32, +// CHECK-SAME: %[[ARG1:.*]]: i1) -> f32 { +// CHECK: %[[VAL_0:.*]] = scf.if %[[ARG1]] -> (f32) { +// CHECK: %[[VAL_1:.*]] = math.sin %[[ARG0]] : f32 +// CHECK: scf.yield %[[VAL_1]] : f32 +// CHECK: } else { +// CHECK: %[[VAL_2:.*]] = math.cos %[[ARG0]] : f32 +// CHECK: scf.yield %[[VAL_2]] : f32 +// CHECK: } +// CHECK: return %[[VAL_0]] : f32 +// CHECK: } +func.func @sincos_no_fusion_different_block(%arg0 : f32, %flag : i1) -> f32 { + %0 = scf.if %flag -> f32 { + %s = math.sin %arg0 : f32 + scf.yield %s : f32 + } else { + %c = math.cos %arg0 : f32 + scf.yield %c : f32 + } + func.return %0 : f32 +} + +// CHECK-LABEL: func.func @sincos_fusion_preserve_fastmath( +// CHECK-SAME: %[[ARG0:.*]]: f32) -> (f32, f32) { +// CHECK: %[[VAL_0:.*]], %[[VAL_1:.*]] = math.sincos %[[ARG0]] fastmath<contract> : f32 +// CHECK: return %[[VAL_0]], %[[VAL_1]] : f32, f32 +// CHECK: } +func.func @sincos_fusion_preserve_fastmath(%arg0 : f32) -> (f32, f32) { + %0 = math.sin %arg0 fastmath<contract> : f32 + %1 = math.cos %arg0 fastmath<contract> : f32 + func.return %0, %1 : f32, f32 +} diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir index 3f96d90..5ff2920 100644 --- a/mlir/test/Dialect/MemRef/invalid.mlir +++ b/mlir/test/Dialect/MemRef/invalid.mlir @@ -1169,3 +1169,19 @@ func.func @expand_shape_invalid_output_shape( into memref<2x15x20xf32, strided<[60000, 4000, 2], offset: 100>> return } + +// ----- + +func.func @distinct_objects_types_mismatch(%arg0: memref<?xf32>, %arg1: memref<?xi32>) -> (memref<?xi32>, memref<?xf32>) { + // expected-error @+1 {{operand types and result types must match}} + %0, %1 = "memref.distinct_objects"(%arg0, %arg1) : (memref<?xf32>, memref<?xi32>) -> (memref<?xi32>, memref<?xf32>) + return %0, %1 : memref<?xi32>, memref<?xf32> +} + +// ----- + +func.func @distinct_objects_0_operands() { + // expected-error @+1 {{expected at least one operand}} + "memref.distinct_objects"() : () -> () + return +} diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir index 6c2298a..a90c950 100644 --- a/mlir/test/Dialect/MemRef/ops.mlir +++ b/mlir/test/Dialect/MemRef/ops.mlir @@ -302,6 +302,15 @@ func.func @assume_alignment(%0: memref<4x4xf16>) { return } +// CHECK-LABEL: func @distinct_objects +// CHECK-SAME: (%[[ARG0:.*]]: memref<?xf16>, %[[ARG1:.*]]: memref<?xf32>, %[[ARG2:.*]]: memref<?xf64>) +func.func @distinct_objects(%arg0: memref<?xf16>, %arg1: memref<?xf32>, %arg2: memref<?xf64>) -> (memref<?xf16>, memref<?xf32>, 
memref<?xf64>) { + // CHECK: %[[RES:.*]]:3 = memref.distinct_objects %[[ARG0]], %[[ARG1]], %[[ARG2]] : memref<?xf16>, memref<?xf32>, memref<?xf64> + %1, %2, %3 = memref.distinct_objects %arg0, %arg1, %arg2 : memref<?xf16>, memref<?xf32>, memref<?xf64> + // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : memref<?xf16>, memref<?xf32>, memref<?xf64> + return %1, %2, %3 : memref<?xf16>, memref<?xf32>, memref<?xf64> +} + // CHECK-LABEL: func @expand_collapse_shape_static func.func @expand_collapse_shape_static( %arg0: memref<3x4x5xf32>, diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index cb69058..1484d7e 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -358,6 +358,41 @@ func.func @acc_loop_multiple_block() { // ----- +acc.firstprivate.recipe @firstprivatization_memref_10xf32 : memref<10xf32> init { +^bb0(%arg0: memref<10xf32>): + %0 = memref.alloca() : memref<10xf32> + acc.yield %0 : memref<10xf32> +} copy { +^bb0(%arg0: memref<10xf32>, %arg1: memref<10xf32>): + memref.copy %arg0, %arg1 : memref<10xf32> to memref<10xf32> + acc.terminator +} destroy { +^bb0(%arg0: memref<10xf32>): + acc.terminator +} + +func.func @testloopfirstprivate(%a: memref<10xf32>, %b: memref<10xf32>) -> () { + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index + %firstprivate = acc.firstprivate varPtr(%a : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32> + acc.loop firstprivate(@firstprivatization_memref_10xf32 -> %firstprivate : memref<10xf32>) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { + "test.openacc_dummy_op"() : () -> () + acc.yield + } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]} + return +} + +// CHECK-LABEL: func.func @testloopfirstprivate( +// CHECK-SAME: %[[ARG0:.*]]: memref<10xf32>, %[[ARG1:.*]]: memref<10xf32>) +// CHECK: %[[FIRSTPRIVATE:.*]] = acc.firstprivate varPtr(%[[ARG0]] : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32> +// CHECK: acc.loop firstprivate(@firstprivatization_memref_10xf32 -> %[[FIRSTPRIVATE]] : memref<10xf32>) control(%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { +// CHECK: "test.openacc_dummy_op"() : () -> () +// CHECK: acc.yield +// CHECK: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]} + +// ----- + acc.private.recipe @privatization_memref_10_f32 : memref<10xf32> init { ^bb0(%arg0: memref<10xf32>): %0 = memref.alloc() : memref<10xf32> @@ -535,6 +570,7 @@ acc.firstprivate.recipe @firstprivatization_memref_10xf32 : memref<10xf32> init acc.yield %0 : memref<10xf32> } copy { ^bb0(%arg0: memref<10xf32>, %arg1: memref<10xf32>): + memref.copy %arg0, %arg1 : memref<10xf32> to memref<10xf32> acc.terminator } destroy { ^bb0(%arg0: memref<10xf32>): diff --git a/mlir/test/Dialect/OpenMP/cli-canonical_loop.mlir b/mlir/test/Dialect/OpenMP/cli-canonical_loop.mlir index adadb8b..0e9385e 100644 --- a/mlir/test/Dialect/OpenMP/cli-canonical_loop.mlir +++ b/mlir/test/Dialect/OpenMP/cli-canonical_loop.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s | FileCheck %s -// RUN: mlir-opt %s | mlir-opt | FileCheck %s +// RUN: mlir-opt %s | FileCheck %s --enable-var-scope +// RUN: mlir-opt %s | mlir-opt | FileCheck %s --enable-var-scope // CHECK-LABEL: @omp_canonloop_raw( @@ -24,10 +24,10 @@ func.func @omp_canonloop_raw(%tc : i32) -> () { func.func @omp_canonloop_sequential_raw(%tc : i32) -> () { // CHECK-NEXT: %canonloop_s0 = omp.new_cli 
%canonloop_s0 = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc]]) { "omp.canonical_loop" (%tc, %canonloop_s0) ({ ^bb_first(%iv_first: i32): - // CHECK-NEXT: = llvm.add %iv, %iv : i32 + // CHECK-NEXT: = llvm.add %iv_s0, %iv_s0 : i32 %newval = llvm.add %iv_first, %iv_first : i32 // CHECK-NEXT: omp.terminator omp.terminator @@ -36,7 +36,7 @@ func.func @omp_canonloop_sequential_raw(%tc : i32) -> () { // CHECK-NEXT: %canonloop_s1 = omp.new_cli %canonloop_s1 = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv : i32 in range(%[[tc]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc]]) { "omp.canonical_loop" (%tc, %canonloop_s1) ({ ^bb_second(%iv_second: i32): // CHECK: omp.terminator @@ -52,17 +52,17 @@ func.func @omp_canonloop_sequential_raw(%tc : i32) -> () { // CHECK-LABEL: @omp_nested_canonloop_raw( // CHECK-SAME: %[[tc_outer:.+]]: i32, %[[tc_inner:.+]]: i32) func.func @omp_nested_canonloop_raw(%tc_outer : i32, %tc_inner : i32) -> () { - // CHECK-NEXT: %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %canonloop = omp.new_cli %outer = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: %canonloop_s0_s0 = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli %inner = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc_outer]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc_outer]]) { "omp.canonical_loop" (%tc_outer, %outer) ({ ^bb_outer(%iv_outer: i32): - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0_s0) %iv_0 : i32 in range(%[[tc_inner]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc_inner]]) { "omp.canonical_loop" (%tc_inner, %inner) ({ ^bb_inner(%iv_inner: i32): - // CHECK-NEXT: = llvm.add %iv, %iv_0 : i32 + // CHECK-NEXT: = llvm.add %iv, %iv_d1 : i32 %newval = llvm.add %iv_outer, %iv_inner: i32 // CHECK-NEXT: omp.terminator omp.terminator @@ -108,16 +108,24 @@ func.func @omp_canonloop_constant_pretty() -> () { func.func @omp_canonloop_sequential_pretty(%tc : i32) -> () { // CHECK-NEXT: %canonloop_s0 = omp.new_cli %canonloop_s0 = omp.new_cli - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { - omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%tc) { // CHECK-NEXT: omp.terminator omp.terminator } // CHECK: %canonloop_s1 = omp.new_cli %canonloop_s1 = omp.new_cli - // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv : i32 in range(%[[tc]]) { - omp.canonical_loop(%canonloop_s1) %iv_0 : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%tc) { + // CHECK-NEXT: omp.terminator + omp.terminator + } + + // CHECK: %canonloop_s2 = omp.new_cli + %canonloop_s2 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s2) %iv_s2 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s2) %iv_s2 : i32 in range(%tc) { // CHECK-NEXT: omp.terminator omp.terminator } @@ -126,17 +134,17 @@ func.func @omp_canonloop_sequential_pretty(%tc : i32) -> () { } -// CHECK-LABEL: @omp_canonloop_nested_pretty( +// CHECK-LABEL: @omp_canonloop_2d_nested_pretty( // CHECK-SAME: %[[tc:.+]]: 
i32) -func.func @omp_canonloop_nested_pretty(%tc : i32) -> () { - // CHECK-NEXT: %canonloop_s0 = omp.new_cli - %canonloop_s0 = omp.new_cli - // CHECK-NEXT: %canonloop_s0_s0 = omp.new_cli - %canonloop_s0_s0 = omp.new_cli - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { - omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%tc) { - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0_s0) %iv_0 : i32 in range(%[[tc]]) { - omp.canonical_loop(%canonloop_s0_s0) %iv_0 : i32 in range(%tc) { +func.func @omp_canonloop_2d_nested_pretty(%tc : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %canonloop = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli + %canonloop_d1 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%tc) { // CHECK: omp.terminator omp.terminator } @@ -147,6 +155,77 @@ func.func @omp_canonloop_nested_pretty(%tc : i32) -> () { } +// CHECK-LABEL: @omp_canonloop_3d_nested_pretty( +// CHECK-SAME: %[[tc:.+]]: i32) +func.func @omp_canonloop_3d_nested_pretty(%tc : i32) -> () { + // CHECK: %canonloop = omp.new_cli + %canonloop = omp.new_cli + // CHECK: %canonloop_d1 = omp.new_cli + %canonloop_d1 = omp.new_cli + // CHECK: %canonloop_d2 = omp.new_cli + %canonloop_d2 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_d1) %iv_1d : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d2) %iv_d2 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_d2) %iv_d2 : i32 in range(%tc) { + // CHECK-NEXT: omp.terminator + omp.terminator + // CHECK-NEXT: } + } + // CHECK-NEXT: omp.terminator + omp.terminator + // CHECK-NEXT: } + } + // CHECK-NEXT: omp.terminator + omp.terminator + } + + return +} + + +// CHECK-LABEL: @omp_canonloop_sequential_nested_pretty( +// CHECK-SAME: %[[tc:.+]]: i32) +func.func @omp_canonloop_sequential_nested_pretty(%tc : i32) -> () { + // CHECK-NEXT: %canonloop_s0 = omp.new_cli + %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %canonloop_s0_d1 = omp.new_cli + %canonloop_s0_d1 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0_d1) %iv_s0_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s0_d1) %iv_s0_d1 : i32 in range(%tc) { + // CHECK-NEXT: omp.terminator + omp.terminator + // CHECK-NEXT: } + } + // CHECK-NEXT: omp.terminator + omp.terminator + // CHECK-NEXT: } + } + + // CHECK-NEXT: %canonloop_s1 = omp.new_cli + %canonloop_s1 = omp.new_cli + // CHECK-NEXT: %canonloop_s1_d1 = omp.new_cli + %canonloop_s1_d1 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s1_d1) %iv_s1_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s1_d1) %iv_s1d1 : i32 in range(%tc) { + // CHECK-NEXT: omp.terminator + omp.terminator + // CHECK-NEXT: } + } + // CHECK-NEXT: omp.terminator + omp.terminator + } + + return +} + + // CHECK-LABEL: @omp_newcli_unused( // 
CHECK-SAME: ) func.func @omp_newcli_unused() -> () { @@ -155,3 +234,74 @@ func.func @omp_newcli_unused() -> () { // CHECK-NEXT: return return } + + +// CHECK-LABEL: @omp_canonloop_multiregion_isolatedfromabove( +func.func @omp_canonloop_multiregion_isolatedfromabove() -> () { + omp.private {type = firstprivate} @x.privatizer : !llvm.ptr init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %c42_i32 = arith.constant 42: i32 + // CHECK: omp.canonical_loop %iv : i32 in range(%c42_i32) { + omp.canonical_loop %iv1 : i32 in range(%c42_i32) { + omp.terminator + } + // CHECK: omp.yield + omp.yield(%arg0 : !llvm.ptr) + } copy { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %c42_i32 = arith.constant 42: i32 + // CHECK: omp.canonical_loop %iv : i32 in range(%c42_i32) { + omp.canonical_loop %iv : i32 in range(%c42_i32) { + // CHECK: omp.canonical_loop %iv_d1 : i32 in range(%c42_i32) { + omp.canonical_loop %iv_d1 : i32 in range(%c42_i32) { + omp.terminator + } + omp.terminator + } + // CHECK: omp.yield + omp.yield(%arg0 : !llvm.ptr) + } dealloc { + ^bb0(%arg0: !llvm.ptr): + %c42_i32 = arith.constant 42: i32 + // CHECK: omp.canonical_loop %iv_s0 : i32 in range(%c42_i32) { + omp.canonical_loop %iv_s0 : i32 in range(%c42_i32) { + omp.terminator + } + // CHECK: omp.canonical_loop %iv_s1 : i32 in range(%c42_i32) { + omp.canonical_loop %iv_s1 : i32 in range(%c42_i32) { + omp.terminator + } + // CHECK: omp.yield + omp.yield + } + + // CHECK: return + return +} + + +// CHECK-LABEL: @omp_canonloop_multiregion( +func.func @omp_canonloop_multiregion(%c : i1) -> () { + %c42_i32 = arith.constant 42: i32 + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + %canonloop3 = omp.new_cli + scf.if %c { + // CHECK: omp.canonical_loop(%canonloop_r0) %iv_r0 : i32 in range(%c42_i32) { + omp.canonical_loop(%canonloop1) %iv1 : i32 in range(%c42_i32) { + omp.terminator + } + } else { + // CHECK: omp.canonical_loop(%canonloop_r1_s0) %iv_r1_s0 : i32 in range(%c42_i32) { + omp.canonical_loop(%canonloop2) %iv2 : i32 in range(%c42_i32) { + omp.terminator + } + // CHECK: omp.canonical_loop(%canonloop_r1_s1) %iv_r1_s1 : i32 in range(%c42_i32) { + omp.canonical_loop(%canonloop3) %iv3 : i32 in range(%c42_i32) { + omp.terminator + } + } + + // CHECK: return + return +} diff --git a/mlir/test/Dialect/OpenMP/cli-tile.mlir b/mlir/test/Dialect/OpenMP/cli-tile.mlir new file mode 100644 index 0000000..73d5478 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/cli-tile.mlir @@ -0,0 +1,138 @@ +// RUN: mlir-opt %s | FileCheck %s --enable-var-scope +// RUN: mlir-opt %s | mlir-opt | FileCheck %s --enable-var-scope + + +// Raw syntax check (MLIR output is always pretty-printed) +// CHECK-LABEL: @omp_tile_raw( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_raw(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %canonloop = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: %grid1 = omp.new_cli + %grid = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: %intratile1 = omp.new_cli + %intratile = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + "omp.canonical_loop" (%tc, %canonloop) ({ + ^bb0(%iv: i32): + // CHECK: omp.terminator + omp.terminator + }) : (i32, !omp.cli) -> () + // CHECK: omp.tile (%grid1, %intratile1) <- (%canonloop) sizes(%[[ts]] : i32) + "omp.tile"(%grid, %intratile, %canonloop, %ts) <{operandSegmentSizes = array<i32: 2, 1, 1>}> : (!omp.cli, !omp.cli, !omp.cli, i32) -> () + //"omp.tile" (%canonloop) : (!omp.cli) -> () + 
return +} + + +// Pretty syntax check +// CHECK-LABEL: @omp_tile_pretty( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_pretty(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %grid = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %intratile = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile (%grid1, %intratile1) <- (%canonloop) sizes(%[[ts]] : i32) + omp.tile(%grid, %intratile) <- (%canonloop) sizes(%ts : i32) + return +} + + +// Specifying the generatees for omp.tile is optional +// CHECK-LABEL: @omp_tile_optionalgen_pretty( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_optionalgen_pretty(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %canonloop = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile <- (%canonloop) sizes(%[[ts]] : i32) + omp.tile <- (%canonloop) sizes(%ts : i32) + return +} + + +// Two-dimensional tiling +// CHECK-LABEL: @omp_tile_2d_pretty( +// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32, %[[ts1:.+]]: i32, %[[ts2:.+]]: i32) { +func.func @omp_tile_2d_pretty(%tc1 : i32, %tc2 : i32, %ts1 : i32, %ts2 : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %cli_outer = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli + %cli_inner = omp.new_cli + // CHECK-NEXT: %grid1 = omp.new_cli + %grid1 = omp.new_cli + // CHECK-NEXT: %grid2 = omp.new_cli + %grid2 = omp.new_cli + // CHECK-NEXT: %intratile1 = omp.new_cli + %intratile1 = omp.new_cli + // CHECK-NEXT: %intratile2 = omp.new_cli + %intratile2 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc1]]) { + omp.canonical_loop(%cli_outer) %iv_outer : i32 in range(%tc1) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc2]]) { + omp.canonical_loop(%cli_inner) %iv_inner : i32 in range(%tc2) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile (%grid1, %grid2, %intratile1, %intratile2) <- (%canonloop, %canonloop_d1) sizes(%[[ts1]], %[[ts2]] : i32, i32) + omp.tile (%grid1, %grid2, %intratile1, %intratile2) <- (%cli_outer, %cli_inner) sizes(%ts1, %ts2 : i32, i32) + return +} + + +// Three-dimensional tiling +// CHECK-LABEL: @omp_tile_3d_pretty( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_3d_pretty(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %cli_outer = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli + %cli_middle = omp.new_cli + // CHECK-NEXT: %canonloop_d2 = omp.new_cli + %cli_inner = omp.new_cli + // CHECK-NEXT: %grid1 = omp.new_cli + %grid1 = omp.new_cli + // CHECK-NEXT: %grid2 = omp.new_cli + %grid2 = omp.new_cli + // CHECK-NEXT: %grid3 = omp.new_cli + %grid3 = omp.new_cli + // CHECK-NEXT: %intratile1 = omp.new_cli + %intratile1 = omp.new_cli + // CHECK-NEXT: %intratile2 = omp.new_cli + %intratile2 = omp.new_cli + // CHECK-NEXT: %intratile3 = omp.new_cli + %intratile3 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%cli_outer) %iv_outer 
: i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%cli_middle) %iv_middle : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d2) %iv_d2 : i32 in range(%[[tc]]) { + omp.canonical_loop(%cli_inner) %iv_inner : i32 in range(%tc) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile (%grid1, %grid2, %grid3, %intratile1, %intratile2, %intratile3) <- (%canonloop, %canonloop_d1, %canonloop_d2) sizes(%[[ts]], %[[ts]], %[[ts]] : i32, i32, i32) + omp.tile (%grid1, %grid2, %grid3, %intratile1, %intratile2, %intratile3) <- (%cli_outer, %cli_middle, %cli_inner) sizes(%ts, %ts, %ts: i32, i32, i32) + return +} diff --git a/mlir/test/Dialect/OpenMP/cli-unroll-heuristic.mlir b/mlir/test/Dialect/OpenMP/cli-unroll-heuristic.mlir index cda7d0b..16884f4 100644 --- a/mlir/test/Dialect/OpenMP/cli-unroll-heuristic.mlir +++ b/mlir/test/Dialect/OpenMP/cli-unroll-heuristic.mlir @@ -1,18 +1,18 @@ -// RUN: mlir-opt %s | FileCheck %s -// RUN: mlir-opt %s | mlir-opt | FileCheck %s +// RUN: mlir-opt %s | FileCheck %s --enable-var-scope +// RUN: mlir-opt %s | mlir-opt | FileCheck %s --enable-var-scope // CHECK-LABEL: @omp_unroll_heuristic_raw( // CHECK-SAME: %[[tc:.+]]: i32) { func.func @omp_unroll_heuristic_raw(%tc : i32) -> () { - // CHECK-NEXT: %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %canonloop = omp.new_cli %canonloop = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { "omp.canonical_loop" (%tc, %canonloop) ({ ^bb0(%iv: i32): omp.terminator }) : (i32, !omp.cli) -> () - // CHECK: omp.unroll_heuristic(%canonloop_s0) + // CHECK: omp.unroll_heuristic(%canonloop) "omp.unroll_heuristic" (%canonloop) : (!omp.cli) -> () return } @@ -22,12 +22,12 @@ func.func @omp_unroll_heuristic_raw(%tc : i32) -> () { // CHECK-SAME: %[[tc:.+]]: i32) { func.func @omp_unroll_heuristic_pretty(%tc : i32) -> () { // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli - %canonloop = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { + %canonloop = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { omp.terminator } - // CHECK: omp.unroll_heuristic(%canonloop_s0) + // CHECK: omp.unroll_heuristic(%canonloop) omp.unroll_heuristic(%canonloop) return } @@ -36,13 +36,13 @@ func.func @omp_unroll_heuristic_pretty(%tc : i32) -> () { // CHECK-LABEL: @omp_unroll_heuristic_nested_pretty( // CHECK-SAME: %[[tc:.+]]: i32) { func.func @omp_unroll_heuristic_nested_pretty(%tc : i32) -> () { - // CHECK-NEXT: %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %canonloop = omp.new_cli %cli_outer = omp.new_cli - // CHECK-NEXT: %canonloop_s0_s0 = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli %cli_inner = omp.new_cli - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { omp.canonical_loop(%cli_outer) %iv_outer : i32 in range(%tc) { - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0_s0) %iv_0 : i32 in range(%[[tc]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc]]) { omp.canonical_loop(%cli_inner) %iv_inner : i32 in range(%tc) { // 
CHECK: omp.terminator omp.terminator @@ -51,9 +51,9 @@ func.func @omp_unroll_heuristic_nested_pretty(%tc : i32) -> () { omp.terminator } - // CHECK: omp.unroll_heuristic(%canonloop_s0) + // CHECK: omp.unroll_heuristic(%canonloop) omp.unroll_heuristic(%cli_outer) - // CHECK-NEXT: omp.unroll_heuristic(%canonloop_s0_s0) + // CHECK-NEXT: omp.unroll_heuristic(%canonloop_d1) omp.unroll_heuristic(%cli_inner) return } diff --git a/mlir/test/Dialect/OpenMP/invalid-tile.mlir b/mlir/test/Dialect/OpenMP/invalid-tile.mlir new file mode 100644 index 0000000..e63a062 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/invalid-tile.mlir @@ -0,0 +1,119 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics %s + + +func.func @missing_sizes(%tc : i32, %ts : i32) { + %canonloop = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op there must be one tile size for each applyee}} + omp.tile <-(%canonloop) + + llvm.return +} + +// ----- + +func.func @no_loop(%tc : i32, %ts : i32) { + // expected-error@+1 {{'omp.tile' op must apply to at least one loop}} + omp.tile <-() + + return +} + +// ----- + +func.func @missing_generator(%tc : i32, %ts : i32) { + // expected-error@+1 {{'omp.new_cli' op CLI has no generator}} + %canonloop = omp.new_cli + + // expected-note@+1 {{see consumer here: "omp.tile"(%0, %arg1) <{operandSegmentSizes = array<i32: 0, 1, 1>}> : (!omp.cli, i32) -> ()}} + omp.tile <-(%canonloop) sizes(%ts : i32) + + return +} + +// ----- + +func.func @insufficient_sizes(%tc : i32, %ts : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc) { + omp.terminator + } + omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op there must be one tile size for each applyee}} + omp.tile <-(%canonloop1, %canonloop2) sizes(%ts : i32) + + llvm.return +} + +// ----- + +func.func @insufficient_applyees(%tc : i32, %ts : i32) { + %canonloop = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{omp.tile' op there must be one tile size for each applyee}} + omp.tile <- (%canonloop) sizes(%ts, %ts : i32, i32) + + return +} + +// ----- + +func.func @insufficient_generatees(%tc : i32, %ts : i32) { + %canonloop = omp.new_cli + %grid = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op expecting two times the number of generatees than applyees}} + omp.tile (%grid) <- (%canonloop) sizes(%ts : i32) + + return +} + +// ----- + +func.func @not_perfectly_nested(%tc : i32, %ts : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv1 : i32 in range(%tc) { + %v = arith.constant 42 : i32 + omp.canonical_loop(%canonloop2) %iv2 : i32 in range(%tc) { + omp.terminator + } + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op tiled loop nest must be perfectly nested}} + omp.tile <-(%canonloop1, %canonloop2) sizes(%ts, %ts : i32, i32) + + llvm.return +} + +// ----- + +func.func @non_nectangular(%tc : i32, %ts : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv1 : i32 in range(%tc) { + omp.canonical_loop(%canonloop2) %iv2 : i32 in range(%iv1) { + omp.terminator + } + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op tiled loop nest must be rectangular}} + omp.tile <-(%canonloop1, 
%canonloop2) sizes(%ts, %ts : i32, i32) + + llvm.return +} diff --git a/mlir/test/Dialect/Transform/test-promote-tensors.mlir b/mlir/test/Dialect/Transform/test-promote-tensors.mlir new file mode 100644 index 0000000..bc9a05a --- /dev/null +++ b/mlir/test/Dialect/Transform/test-promote-tensors.mlir @@ -0,0 +1,104 @@ +// RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s + +// CHECK-LABEL: @promote_in0 +// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x42xf32>, %{{.*}}, %{{.*}}) +// CHECK: %[[C0:.+]] = arith.constant 0 +// CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] +// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor(%[[DIM]]) {memory_space = 1 : i64} +// CHECK: %[[MAT:.+]] = bufferization.materialize_in_destination %[[ARG0]] in %[[ALLOC]] +// CHECK: linalg.matmul ins(%[[MAT]], %{{.*}} +func.func @promote_in0(%arg0: tensor<?x42xf32>, %arg1: tensor<42x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { + %0 = linalg.matmul ins(%arg0, %arg1: tensor<?x42xf32>, tensor<42x?xf32>) + outs(%arg2: tensor<?x?xf32>) -> tensor<?x?xf32> + return %0 : tensor<?x?xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + %mm = transform.structured.match ops{["linalg.matmul"]} in %root + : (!transform.any_op) -> !transform.any_op + %op0 = transform.get_operand %mm[0] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %op0 : !transform.any_value + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @promote_out +// CHECK-SAME: (%{{.*}}: tensor<?x42xf32>, %{{.*}}: tensor<?x42xf32>, %[[ARG2:.+]]: tensor<?x?xf32>) +func.func @promote_out(%arg0: tensor<?x42xf32>, %arg1: tensor<?x42xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG2]], %[[C0]] + // CHECK: %[[C1:.+]] = arith.constant 1 + // CHECK: %[[DIM1:.+]] = tensor.dim %[[ARG2]], %[[C1]] + // CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor(%[[DIM0]], %[[DIM1]]) {memory_space = 1 : i64} + // CHECK-NOT: materialize_in_destination + // CHECK: linalg.add {{.*}} outs(%[[ALLOC]] + %0 = linalg.add ins(%arg0, %arg1 : tensor<?x42xf32>, tensor<?x42xf32>) + outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> + return %0 : tensor<?x?xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + %la = transform.structured.match ops{["linalg.add"]} in %root + : (!transform.any_op) -> !transform.any_op + %init = transform.get_operand %la[2] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %init : !transform.any_value + + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @promote_in0_out_bufferize +// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x42xf32>, %{{.*}}: tensor<42x?xf32>, %[[ARG2:.+]]: tensor<?x?xf32>) +func.func @promote_in0_out_bufferize(%arg0: tensor<?x42xf32>, %arg1: tensor<42x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { + // CHECK: %[[IN1:.+]] = bufferization.to_buffer %arg1 : tensor<42x?xf32> to memref<42x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[IN0:.+]] = bufferization.to_buffer %arg0 : tensor<?x42xf32> to memref<?x42xf32, strided<[?, ?], offset: ?>> + // CHECK: %{{.+}} = bufferization.to_buffer %arg0 : tensor<?x42xf32> to memref<?x42xf32, strided<[?, ?], offset: ?>> + // CHECK: %{{.+}} = bufferization.to_buffer %arg2 : tensor<?x?xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %{{.+}} = 
bufferization.to_buffer %arg2 : tensor<?x?xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[C0:.+]] = arith.constant 0 : index + // CHECK: %{{.+}} = memref.dim %{{.+}}, %[[C0]] : memref<?x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[C1:.+]] = arith.constant 1 : index + // CHECK: %{{.+}} = memref.dim %{{.+}}, %[[C1]] : memref<?x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[ALLOC_OUT:.+]] = memref.alloc(%{{.+}}, %{{.+}}) {alignment = 64 : i64} : memref<?x?xf32, 1> + // CHECK: %{{.+}} = arith.constant 0 : index + // CHECK: %{{.+}} = memref.dim %{{.+}}, %{{.+}} : memref<?x42xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[ALLOC_IN:.+]] = memref.alloc(%{{.+}}) {alignment = 64 : i64} : memref<?x42xf32, 1> + // CHECK: memref.copy %[[IN0]], %[[ALLOC_IN]] : memref<?x42xf32, strided<[?, ?], offset: ?>> to memref<?x42xf32, 1> + // CHECK: linalg.add ins(%[[ALLOC_IN]], %[[IN1]] : memref<?x42xf32, 1>, memref<42x?xf32, strided<[?, ?], offset: ?>>) outs(%[[ALLOC_OUT]] : memref<?x?xf32, 1>) + %0 = linalg.add ins(%arg0, %arg1: tensor<?x42xf32>, tensor<42x?xf32>) + outs(%arg2: tensor<?x?xf32>) -> tensor<?x?xf32> + return %0 : tensor<?x?xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + %la = transform.structured.match ops{["linalg.add"]} in %root + : (!transform.any_op) -> !transform.any_op + %op0 = transform.get_operand %la[0] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %op0 : !transform.any_value + + %init = transform.get_operand %la[2] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %init : !transform.any_value + + %func = transform.structured.match ops{["func.func"]} in %root + : (!transform.any_op) -> !transform.any_op + + %bufferized = transform.bufferization.one_shot_bufferize %func + : (!transform.any_op) -> !transform.any_op + + transform.yield + } +} + + + diff --git a/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir b/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir index 2e5f433..efc3890 100644 --- a/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir +++ b/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir @@ -19,3 +19,88 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + // expected-error@below {{'selected_region' attribute specifies region at index 2 while op has only 2 regions}} + transform.tune.alternatives<"bifurcation"> selected_region = 2 { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %singleton_of_c0 = transform.param.constant [0] -> !transform.any_param + // expected-error@below {{param should hold exactly one integer attribute, got: [0]}} + transform.tune.alternatives<"bifurcation"> selected_region = %singleton_of_c0 : !transform.any_param { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %c0 = 
transform.param.constant 0 -> !transform.any_param + %c1 = transform.param.constant 1 -> !transform.any_param + %c0_and_c1 = transform.merge_handles %c0, %c1 : !transform.any_param + // expected-error@below {{param should hold exactly one integer attribute}} + transform.tune.alternatives<"bifurcation"> selected_region = %c0_and_c1 : !transform.any_param { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %c2 = transform.param.constant 2 -> !transform.any_param + // expected-error@below {{'selected_region' attribute/param specifies region at index 2 while op has only 2 regions}} + transform.tune.alternatives<"bifurcation"> selected_region = %c2 : !transform.any_param { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + // expected-error@below {{non-deterministic choice "bifurcation" is only resolved through providing a `selected_region` attr/param}} + transform.tune.alternatives<"bifurcation"> { + transform.yield + }, { + transform.yield + } + transform.yield + } +} diff --git a/mlir/test/Dialect/Transform/test-tune-extension.mlir b/mlir/test/Dialect/Transform/test-tune-extension.mlir index 0a253c6..5da48a2 100644 --- a/mlir/test/Dialect/Transform/test-tune-extension.mlir +++ b/mlir/test/Dialect/Transform/test-tune-extension.mlir @@ -59,3 +59,129 @@ module attributes {transform.with_named_sequence} { transform.yield } } + + +// ----- + +// CHECK-LABEL: schedule_with_two_independent_choices_already_made +func.func @schedule_with_two_independent_choices_already_made( + %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) + -> tensor<128x128xf32> { +// CHECK-NOT: scf.forall +// CHECK: scf.for +// CHECK-NOT: scf.for +// CHECK: scf.forall +// CHECK-NOT: scf.for +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: linalg.matmul +// CHECK: scf.forall.in_parallel +// CHECK: tensor.parallel_insert_slice +// CHECK: tensor.insert_slice +// CHECK: scf.yield + %0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>) + outs(%arg2: tensor<128x128xf32>) -> tensor<128x128xf32> + return %0 : tensor<128x128xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op + + %tiled_matmul = transform.tune.alternatives<"outer_par_or_seq_tiling"> selected_region = 0 -> !transform.any_op + { // First alternative/region, with index = 0 + %contained_matmul, %loop = transform.structured.tile_using_for %matmul tile_sizes [8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + }, { // Second alternative/region, with index = 1 + %contained_matmul, %loop = transform.structured.tile_using_forall %matmul tile_sizes [8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + } + + transform.tune.alternatives<"inner_par_or_seq_tiling"> selected_region = 1 -> 
!transform.any_op { + %contained_matmul, %loop = transform.structured.tile_using_for %tiled_matmul tile_sizes [0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + }, { + %contained_matmul, %loop = transform.structured.tile_using_forall %tiled_matmul tile_sizes [0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + } + + transform.yield + } +} + +// ----- + +// CHECK-LABEL: subschedule_with_choice_resolved_in_main_schedule +func.func @subschedule_with_choice_resolved_in_main_schedule( + %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) + -> tensor<128x128xf32> { +// CHECK-NOT: scf.for +// CHECK: scf.forall +// CHECK-NOT: scf.forall +// CHECK: scf.for +// CHECK-NOT: scf.forall +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: linalg.matmul +// CHECK: tensor.insert_slice +// CHECK: scf.yield +// CHECK: scf.forall.in_parallel +// CHECK: tensor.parallel_insert_slice + %0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>) + outs(%arg2: tensor<128x128xf32>) -> tensor<128x128xf32> + return %0 : tensor<128x128xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @subschedule_with_embedded_choice(%matmul: !transform.any_op {transform.readonly}, + %par_or_seq: !transform.param<i64> {transform.readonly}, + %tile_size: !transform.param<i64> {transform.readonly}) -> !transform.any_op { + %tiled_matmul = transform.tune.alternatives<"par_or_seq_tiling"> selected_region = %par_or_seq : !transform.param<i64> -> !transform.any_op { + %contained_matmul, %loop = transform.structured.tile_using_for %matmul tile_sizes [%tile_size] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + }, { + %contained_matmul, %loop = transform.structured.tile_using_forall %matmul tile_sizes [%tile_size] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + } + transform.yield %tiled_matmul : !transform.any_op + } + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %outer_par = transform.param.constant 1 -> !transform.param<i64> + %outer_tile_size = transform.param.constant 32 -> !transform.param<i64> + %inner_seq = transform.tune.knob<"inner_par_or_seq"> = 0 from options = [0, 1] -> !transform.param<i64> + %inner_tile_size = transform.param.constant 8 -> !transform.param<i64> + %tiled_matmul = transform.include @subschedule_with_embedded_choice failures(propagate) (%matmul, %outer_par, %outer_tile_size) : (!transform.any_op, !transform.param<i64>, !transform.param<i64>) -> !transform.any_op + %tiled_tiled_matmul = transform.include @subschedule_with_embedded_choice failures(propagate) (%tiled_matmul, %inner_seq, %inner_tile_size) : (!transform.any_op, !transform.param<i64>, !transform.param<i64>) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: eeny_meeny_miny_moe +func.func private @eeny_meeny_miny_moe() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %matmul = 
transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op + + %tiled_matmul = transform.tune.alternatives<"4way"> selected_region = 3 -> !transform.any_param + { // First alternative/region, with index = 0 + %out = transform.param.constant "eeny" -> !transform.any_param + transform.yield %out : !transform.any_param + }, { // Second alternative/region, with index = 1 + %out = transform.param.constant "meeny" -> !transform.any_param + transform.yield %out : !transform.any_param + }, { // Third alternative/region, with index = 2 + %out = transform.param.constant "miny" -> !transform.any_param + transform.yield %out : !transform.any_param + }, { // Fourth alternative/region, with index = 3 + %out = transform.param.constant "moe" -> !transform.any_param + transform.yield %out : !transform.any_param + } + transform.yield + } +}
\ No newline at end of file diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir new file mode 100644 index 0000000..40b66d1 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -0,0 +1,575 @@ +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -test-xegpu-sg-distribute -allow-unregistered-dialect \ +// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s + +// CHECK-LABEL: gpu.func @store_nd_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] +// CHECK-SAME: -> (vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) { +// CHECK: gpu.yield %{{.*}} : vector<16xf32>, +// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32, +// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.store_nd %[[W]]#0, %[[T1]][%[[W]]#2] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +gpu.module @xevm_module{ + gpu.func @store_nd_1d(%laneid: index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + %cst = "some_op"() : () -> vector<16xf32> + xegpu.store_nd %cst, %0 [%c0] {layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @store_nd_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] +// CHECK-SAME: -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { +// CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[W]]#0 : vector<16x1xf16> to vector<16xf16> +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.store_nd %[[CAST]], %[[T1]][%[[W]]#2, %[[W]]#3] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +gpu.module @xevm_module{ + gpu.func @store_nd_2d(%laneid : index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %cst = "some_op"() : () -> vector<16x16xf16> + xegpu.store_nd %cst, %0 [%c0, %c0] {layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + } + gpu.return + } +} + + + +// ----- +// CHECK-LABEL: gpu.func @load_nd_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<1xf32>, +// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], 
lane_data = [1]>>, index) { +// CHECK: gpu.yield %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, +// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32, +// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.load_nd %[[T1]][%[[W]]#2] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> +gpu.module @xevm_module{ + gpu.func @load_nd_1d(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : + !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32> + gpu.yield %1 : vector<16xf32> + } + "some_user_op"(%r) : (vector<1xf32>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_nd_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { +// CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: vector.shape_cast %[[T2]] : vector<16xf16> to vector<16x1xf16> +gpu.module @xevm_module{ + gpu.func @load_nd_2d(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> + gpu.yield %1 : vector<16x16xf16> + } + "some_user_op"(%r) : (vector<16x1xf16>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_nd_array_length +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<2x16x1xf16>, +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { +// CHECK: gpu.yield %{{.*}} : vector<2x16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr< +// CHECK-SAME: array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], +// CHECK-SAME: lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16, 
#xegpu.block_tdesc_attr<array_length = 2 : i64>> +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16> +// CHECK-NEXT: vector.shape_cast %[[T2]] : vector<32xf16> to vector<2x16x1xf16> +gpu.module @xevm_module{ + gpu.func @load_nd_array_length(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16x1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, + #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, + #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16> + gpu.yield %1 : vector<2x16x16xf16> + } + "some_user_op"(%r) : (vector<2x16x1xf16>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @dpas +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> +// CHECK-SAME: (vector<8x1xf32>, vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) { +// CHECK: gpu.yield %{{.*}} : vector<8x16xf32>, vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> +// CHECK-NEXT: } +// CHECK-DAG: %[[T1:.*]] = vector.shape_cast %[[W]]#1 : vector<8x1xf16> to vector<8xf16> +// CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[W]]#2 : vector<16x1xf16> to vector<16xf16> +// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[W]]#3 : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T2]], %[[T3]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32> +// CHECK-NEXT: vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> +gpu.module @xevm_module{ + gpu.func @dpas(%laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { + %0 = "some_op"() : () -> vector<8x16xf16> + %1 = "some_op"() : () -> vector<16x16xf16> + %2 = "some_op"() : () -> vector<8x16xf32> + %3 = xegpu.dpas %0, %1, %2 + { + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, + layout_operand_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + gpu.yield %3 : vector<8x16xf32> + } + "some_user_op"(%r) : (vector<8x1xf32>) -> () + gpu.return + } +} + + +// ----- +// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG1]])[16] -> (!xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64) { +// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64 +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[W]]#1, shape : [64, 128], strides : [128, 1] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK-NEXT: builtin.unrealized_conversion_cast %[[T1]] : !xegpu.tensor_desc<16x16xf16> to !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> {resolve_simt_type_mismatch} +gpu.module 
@xevm_module{ + gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { + %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 -> + !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + gpu.yield %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + } + "some_user_op"(%r) + : (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @prefetch_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { +// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK-SAME: , index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1, %[[W]]#2] +// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> +gpu.module @xevm_module{ + gpu.func @prefetch_2d(%laneid: index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () + -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + xegpu.prefetch_nd %0[%c0, %c0] + <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @prefetch_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) { +// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1] <{l1_hint = #xegpu.cache_hint<cached>, +// CHECK-SAME: l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> +gpu.module @xevm_module{ + gpu.func @prefetch_1d(%laneid: index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () + -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + xegpu.prefetch_nd %0[%c0] + <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> + : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { +// CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) { +// CHECK: gpu.yield %{{.*}} +// CHECK: } +// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} : 
!xegpu.tensor_desc<16xf16> -> vector<1xf16> +// CHECK: gpu.barrier +gpu.module @xevm_module{ + gpu.func @gpu_barrier(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + %1 = xegpu.load_nd %0[%c0] + {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16> + gpu.barrier + gpu.yield %1 : vector<16xf16> + } + "some_user_op"(%r) : (vector<1xf16>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction +// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32> +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: -> (vector<2xf32>, vector<16x2xf32>, vector<2xf32>) { +// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x32xf32> +// CHECK: gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<16x32xf32>, vector<32xf32> +// CHECK-NEXT: } +// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 +// CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T3:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32> +// CHECK: %[[T4:.*]] = vector.reduction <add>, %[[T2]], %[[T3]] : vector<16xf32> into f32 +// CHECK: %[[T5:.*]] = vector.extract_strided_slice %[[W]]#1 +// CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T7:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32> +// CHECK: %[[T8:.*]] = vector.reduction <add>, %[[T6]], %[[T7]] : vector<16xf32> into f32 +// CHECK: %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<2xf32> +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : () -> (vector<16x32xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} + dense<0.0> : vector<32xf32> + %1 = vector.multi_reduction <add>, %src, %acc + { + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, + layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]> + } [0] + : vector<16x32xf32> to vector<32xf32> + gpu.yield %1 : vector<32xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction +// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) { +// CHECK-NEXT: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32> +// CHECK-NEXT: %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32> +// CHECK-NEXT: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %cst : vector<16xf32> into f32 +// CHECK-NEXT: %[[T4:.*]] = 
vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> +// CHECK-NEXT: %[[T5:.*]] = vector.reduction <add>, %[[T4]], %cst : vector<16xf32> into f32 +// CHECK-NEXT: %[[T6:.*]] = vector.from_elements %[[T3]], %[[T5]] : vector<2xf32> +// CHECK-NEXT: gpu.yield %[[T6]] : vector<2xf32> +// CHECK-NEXT: } +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : () -> (vector<2x16xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} + dense<0.0> : vector<2xf32> + %1 = vector.multi_reduction <add>, %src, %acc + { + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>, + layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]> + } + [1] : vector<2x16xf32> to vector<2xf32> + gpu.yield %1 : vector<2xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction +// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32> +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) { +// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<32x16xf32> +// CHECK: gpu.yield %9, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<32x16xf32>, vector<32xf32> +// CHECK: } +// CHECK: %[[T1:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> +// CHECK: %[[T2:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32> +// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T1]], %[[T2]] : vector<16xf32> into f32 +// CHECK: %[[T4:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> +// CHECK: %[[T5:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32> +// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T4]], %[[T5]] : vector<16xf32> into f32 +// CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32> +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} + : () -> (vector<32x16xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>} + dense<0.0> : vector<32xf32> + %1 = vector.multi_reduction <add>, %src, %acc + { + layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>, + layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]> + } + [1] : vector<32x16xf32> to vector<32xf32> + gpu.yield %1 : vector<32xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction +// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) { +// CHECK: %[[SRC:.*]] = 
"some_def"() {{.*}} : () -> vector<16x2xf32> +// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[SRC]] +// CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] {{.*}} : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32 +// CHECK: %[[T4:.*]] = vector.extract_strided_slice %[[SRC]] +// CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] {{.*}} : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32 +// CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32> +// CHECK: gpu.yield %[[T7]] : vector<2xf32> +// CHECK: } +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} + : () -> (vector<16x2xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>} + dense<0.0> : vector<2xf32> + %1 = vector.multi_reduction <add>, %src, %acc + { + layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>, + layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]> + } + [0] : vector<16x2xf32> to vector<2xf32> + gpu.yield %1 : vector<2xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) { +// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex> +// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1> +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: -> (vector<1x8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] : +// CHECK-SAME: vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}> +// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}> +// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +gpu.module @xevm_module{ + gpu.func @scatter_ops_chunksize(%laneid: index, %src: memref<256xf16>) { + gpu.warp_execute_on_lane_0(%laneid)[16] { + %1 = arith.constant + {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + dense<1>: vector<16xi1> + %offset = arith.constant + {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> + { + layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> + } + : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> + xegpu.store %3, 
%src[%offset], %1 <{chunk_size=8}> + { + layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>, + layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]> + } + : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { +// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex> +// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1> +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: -> (vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] +// CHECK-SAME: : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 +// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> +// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 +// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +gpu.module @xevm_module{ + gpu.func @scatter_ops(%src: memref<256xf16>, %laneid: index) { + gpu.warp_execute_on_lane_0(%laneid)[16] { + %1 = arith.constant + {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + dense<1> : vector<16xi1> + %offset = arith.constant + {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 + { + layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> + } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + xegpu.store %3, %src[%offset], %1 + { + layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]> + } + : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (index, memref<256x256xf16>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}} : index, memref<256x256xf16> +// CHECK-NEXT: } +// CHECK-NEXT: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[W]]#1 : memref<256x256xf16> -> index +// CHECK-NEXT: arith.index_cast %[[INTPTR]] : index to i64 +gpu.module @xevm_module{ + gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) { + %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index + gpu.yield %ptr : index + } + %ptr_i64 = arith.index_cast %r : index to i64 + "some_user_op"(%ptr_i64) : (i64) -> () + gpu.return + } +} + + +// ----- +// CHECK-LABEL: gpu.func @vector_transpose( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2x1xf32>, vector<1x2xf32>) { +// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<16x2xf32> +// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<2x16xf32>, vector<16x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = vector.transpose %[[W]]#1, [1, 0] : vector<1x2xf32> to vector<2x1xf32> +gpu.module 
@xevm_module{ + gpu.func @vector_transpose(%arg0: memref<2x16xf32>, %laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) { + %cst = "some_op"() + {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} + : () -> (vector<16x2xf32>) + %transpose = vector.transpose %cst, [1, 0] + { + layout_operand_0 = #xegpu.layout<lane_layout = [16 , 1], lane_data = [1, 1]>, + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + : vector<16x2xf32> to vector<2x16xf32> + gpu.yield %transpose : vector<2x16xf32> + } + "some_user_op"(%r) : (vector<2x1xf32>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @vector_bitcast( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<4x1xi16>, vector<4x2xi8>) { +// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<4x32xi8> +// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<4x16xi16>, vector<4x32xi8> +// CHECK: } +// CHECK: vector.bitcast %[[W]]#1 : vector<4x2xi8> to vector<4x1xi16> +gpu.module @xevm_module{ + gpu.func @vector_bitcast(%arg0: memref<4x16xi16>, %laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) { + %cst = "some_op"() + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} + : () -> (vector<4x32xi8>) + %bitcast = vector.bitcast %cst + { + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + : vector<4x32xi8> to vector<4x16xi16> + gpu.yield %bitcast : vector<4x16xi16> + } + "some_user_op"(%r) : (vector<4x1xi16>) -> () + gpu.return + } +} diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 59fac26..0e1365a 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -1,198 +1,76 @@ // RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -xegpu-subgroup-distribute \ // RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s -// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ -// RUN: -xegpu-subgroup-distribute="enable-sg-reductions=false" -allow-unregistered-dialect \ -// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION - -// CHECK-LABEL: gpu.func @store_nd_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) { -// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -// CHECK: gpu.return -gpu.module @xevm_module{ - gpu.func @store_nd_1d(%arg0: memref<16xf32>) { - %c0 = arith.constant 0 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - xegpu.store_nd %cst, %0 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @store_nd_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc 
%[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) { - %c0 = arith.constant 0 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf16> - %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %cst, %0 [%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - - - -// ----- -// CHECK-LABEL: gpu.func @load_nd_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -gpu.module @xevm_module{ - gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32> - %2 = xegpu.create_nd_tdesc %arg1 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - xegpu.store_nd %1, %2 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @load_nd_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %1, %2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @load_nd_array_length -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, 
%[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16> -// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16> -// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> -// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16> - %2 = vector.extract %1[%c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16> from vector<2x16x16xf16> - %3 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %2, %3[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @load_dpas_store -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> -gpu.module @xevm_module{ - gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> - %3 = 
xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %4, %5[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - - -// ----- // CHECK-LABEL: gpu.func @load_dpas_postop_store -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> -// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> -// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> -// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> +// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> +// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> +// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> +// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> - 
%2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> - %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = math.exp %4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32> - %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> + -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %1 = xegpu.load_nd %0[%c0, %c0] + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : + !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> + + %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> + -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + %3 = xegpu.load_nd %2[%c0, %c0] + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + -> vector<16x16xf16> + + %4 = xegpu.dpas %1, %3 + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// ----- -// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, -// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, -// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %1, 
%2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %5 = math.exp %4 + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : vector<8x16xf32> + + %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> + !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>, + !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return } } // ----- -// TODO: gemm does not use update_nd_offset because of an issue in scf-for distribution. // CHECK-LABEL: gpu.func @gemm -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { -// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x -// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y -// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index -// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> -// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> -// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { -// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> -// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> -// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> -// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> -// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> -// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> -// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> -// CHECK-NEXT: } -// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { +// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x +// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y +// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index +// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> +// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> +// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) +// CHECK-SAME: -> (vector<8x1xf32>) { +// 
CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> +// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> +// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> +// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> +// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] +// CHECK-SAME: : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> +// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> +// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ %c0 = arith.constant 0 : index @@ -203,213 +81,56 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar %block_id_y = gpu.block_id y %0 = arith.muli %block_id_x, %c8 : index %1 = arith.muli %block_id_y, %c16 : index - %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %3 = xegpu.load_nd %2[%0, %1] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32> - %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { - %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> - %7 = xegpu.load_nd %5[%0, %arg3] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16> - %8 = xegpu.load_nd %6[%arg3, %1] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16> - %9 = xegpu.dpas %7, %8, %arg4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> - scf.yield %9 : vector<8x16xf32> - } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return -} -} + %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> + !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %3 = xegpu.load_nd %2[%0, %1] + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32> -// ----- -// 
CHECK-LABEL: gpu.func @prefetch_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.prefetch_nd %0[%c0, %c0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @prefetch_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -gpu.module @xevm_module{ - gpu.func @prefetch_1d(%arg0: memref<256xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0: memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - xegpu.prefetch_nd %0[%c0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - gpu.return - } -} + %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { -// ----- -// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> -// CHECK-NEXT: gpu.barrier -// CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<1xf16>, !xegpu.tensor_desc<16xf16> -gpu.module @xevm_module{ - gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - %1 = xegpu.load_nd %0[%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16> - gpu.barrier - %2 = xegpu.create_nd_tdesc %arg1 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - xegpu.store_nd %1, %2[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - gpu.return - } -} + %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> + -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> + -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -// ----- -// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction -// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> -// CHECK-SAME: (!xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout 
= [1, 16], lane_data = [1, 1]>>, vector<16x2xf32>) { -// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x32xf32> -// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, vector<16x32xf32> -// CHECK-NEXT: } -// CHECK: %[[COL0:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> -// CHECK-NEXT: %[[RED0:.*]] = vector.reduction <add>, %[[CAST0]], %{{.*}} : vector<16xf32> into f32 -// CHECK: %[[COL1:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> -// CHECK-NEXT: %[[RED1:.*]] = vector.reduction <add>, %[[CAST1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> (vector<16x32xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} dense<0.0> : vector<32xf32> - %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] - : vector<16x32xf32> to vector<32xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - : vector<32xf32> to vector<1x32xf32> - xegpu.store_nd %3, %0[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return -} -} + %7 = xegpu.load_nd %5[%0, %arg3] + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16> + %8 = xegpu.load_nd %6[%arg3, %1] + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} + : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16> -// ----- -// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction -// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<2x16xf32, -// CHECK-REDUCTION-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, f32, f32) { -// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[ROW0:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction <add>, %[[ROW0]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: %[[ROW1:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction <add>, %[[ROW1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: gpu.yield 
%4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, f32, f32 -// CHECK-REDUCTION-NEXT: } -// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> (vector<2x16xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} dense<0.0> : vector<2xf32> - %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} - [1] : vector<2x16xf32> to vector<2xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - : vector<2xf32> to vector<2x1xf32> - %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<2x1xf32> to vector<2x16xf32> - xegpu.store_nd %4, %0[%c0, %c0] : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return -} -} + %9 = xegpu.dpas %7, %8, %arg4 + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> -// ----- -// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction -// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%0)[16] -> -// CHECK-SAME: (!xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, vector<2x16xf32>) { -// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> vector<32x16xf32> -// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, vector<32x16xf32> -// CHECK-NEXT: } -// CHECK: %[[ROW0:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> -// CHECK-NEXT: %[[R0:.*]] = vector.reduction <add>, %[[ROW0]], %{{.*}} : vector<16xf32> into f32 -// CHECK: %[[ROW1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> -// CHECK-NEXT: %[[R1:.*]] = vector.reduction <add>, %[[ROW1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> - %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> (vector<32x16xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>} dense<0.0> : vector<32xf32> - %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>} [1] - : vector<32x16xf32> to vector<32xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} - : vector<32xf32> to vector<32x1xf32> - xegpu.store_nd %3, %0[%c0, %c0] : vector<32x1xf32>, 
!xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> - gpu.return -} -} + scf.yield %9 : vector<8x16xf32> + } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} -// ----- -// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction -// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<16x2xf32, -// CHECK-REDUCTION-SAME: #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, f32, f32) { -// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> vector<16x2xf32> -// CHECK-REDUCTION-NEXT: %[[COL0:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-REDUCTION-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> -// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction <add>, %[[CAST0]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: %[[COL1:.*]] = vector.extract_strided_slice %5 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-REDUCTION-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> -// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction <add>, %[[CAST1]], %cst : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<16x2xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, f32, f32 -// CHECK-REDUCTION-NEXT: } -// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<16x2xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> - %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> (vector<16x2xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>} dense<0.0> : vector<2xf32> - %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>} - [0] : vector<16x2xf32> to vector<2xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} - : vector<2xf32> to vector<1x2xf32> - %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : vector<1x2xf32> to vector<16x2xf32> - xegpu.store_nd %4, %0[%c0, %c0] : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> + xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, + !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return } } // ----- -// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) { -// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> -// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, 
vector<1xi1> -gpu.module @xevm_module{ - gpu.func @scatter_ops_chunksize(%src: memref<256xf16>) { - %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1> - %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> - %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { - layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> - } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> - xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}}, -// CHECK-SAME: %[[PREDICATE:.*]]: i1) { -// CHECK: %[[DEFAULT:.*]] = arith.constant dense<1.200000e+01> : vector<8xf16> -// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> -// CHECK: %[[PREDICATED_LOAD:.*]] = scf.if %[[PREDICATE]] -> (vector<8xf16>) { -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// CHECK-NEXT: scf.yield %[[LOADED]] : vector<8xf16> -// CHECK-NEXT: } else { -// CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16> -// CHECK-NEXT: } -// CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// CHECK-LABEL: gpu.func @scatter_ops_scf_yield +// CHECK: (%{{.*}}: memref<256xf16>, %[[PREDICATE:[a-zA-Z0-9]+]]: i1) { +// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.200000e+01> : vector<1x8xf16> +// CHECK-DAG: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> +// CHECK: %[[IF:.*]] = scf.if %[[PREDICATE]] -> (vector<1x8xf16>) { +// CHECK-NEXT: %[[LD:.*]] = xegpu.load %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// CHECK-NEXT: %[[LD_CAST:.*]] = vector.shape_cast %[[LD]] : vector<8xf16> to vector<1x8xf16> +// CHECK-NEXT: scf.yield %[[LD_CAST]] : vector<1x8xf16> +// CHECK-NEXT: } else { +// CHECK-NEXT: scf.yield %[[CST]] : vector<1x8xf16> +// CHECK-NEXT: } +// CHECK-NEXT: %[[IF_CAST:.*]] = vector.shape_cast %[[IF]] : vector<1x8xf16> to vector<8xf16> +// CHECK-NEXT: xegpu.store %[[IF_CAST]], %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> gpu.module @xevm_module{ gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) { %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1> @@ -432,13 +153,15 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) { -// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> -// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1 -// CHECK: scf.if %[[PREDICATE]] { -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, 
vector<1xindex>, vector<1xi1> -// CHECK-NEXT: } +// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> +// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1 +// CHECK: scf.if %[[PREDICATE]] { +// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// CHECK-NEXT: } gpu.module @xevm_module{ gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) { %pred = llvm.mlir.poison : i1 @@ -455,88 +178,13 @@ gpu.module @xevm_module{ } // ----- -// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { -// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> -// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> -// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -gpu.module @xevm_module{ - gpu.func @scatter_ops(%src: memref<256xf16>) { - %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1> - %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> - %3 = xegpu.load %src[%offset], %1 { - layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> - } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> - xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( -// CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index -gpu.module @xevm_module{ - gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) { - %c0 = arith.constant 0 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf16> - %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index - %ptr_i64 = arith.index_cast %ptr : index to i64 - %tdesc = xegpu.create_nd_tdesc %ptr_i64, shape: [16], strides: [16] : i64 - -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - xegpu.store_nd %cst, %tdesc[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - gpu.return - } -} - - -// ----- -// CHECK-LABEL: gpu.func @vector_transpose( -// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> -// CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[DEST]][{{.*}}] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> -gpu.module @xevm_module{ - gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { - %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} dense<1.000000e+00> - : vector<16x2xf32> - %c0 = arith.constant 0 : index - %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - : 
vector<16x2xf32> to vector<2x16xf32> - %0 = xegpu.create_nd_tdesc %arg0 : memref<2x16xf32> - -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %transpose, %0[%c0, %c0] : vector<2x16xf32>, - !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @vector_bitcast( -// CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16> -// CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> -// CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> -// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]][{{.*}}] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> -gpu.module @xevm_module{ - gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { - %cst = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} - : () -> (vector<4x32xi8>) - %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - : vector<4x32xi8> to vector<4x16xi16> - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xi16> - -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %bitcast, %0[%c0, %c0] : vector<4x16xi16>, - !xegpu.tensor_desc<4x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - -// ----- // CHECK-LABEL: gpu.func @mma_transpose_b( // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> // CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> +// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array<i64: 1, 0>}> +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> // CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> // CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> // CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 03c6386..38392fd 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -282,15 +282,20 @@ gpu.module @test_distribution { // CHECK-LABEL: @store_scatter // CHECK-SAME: %[[ARG0:.*]]: memref<256xf16> gpu.func @store_scatter(%dest : memref<256xf16>) { - // CHECK: %[[VAL:.*]] = arith.constant dense<2.550000e+01> : vector<8xf16> - // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex> - // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1> + // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16> + // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex> + // CHECK: 
%[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1> // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> + // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>, + // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>} // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> - %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<25.5> : vector<256xf16> - %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<0> : vector<256xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<1> : vector<256xi1> - xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [32], sg_data = [8]>, l1_hint = #xegpu.cache_hint<cached>} + %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<25.5> : vector<256xf16> + %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<0> : vector<256xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<1> : vector<256xi1> + xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>, + layout_operand_2 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>, + layout_operand_3 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>, + l1_hint = #xegpu.cache_hint<cached>} : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1> gpu.return } diff --git a/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir b/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir new file mode 100644 index 0000000..0d559b6 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir @@ -0,0 +1,94 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope + + +llvm.func @tile_trivial_loop(%baseptr: !llvm.ptr, %tc: i32, %ts: i32) -> () { + %literal_cli = omp.new_cli + omp.canonical_loop(%literal_cli) %iv : i32 in range(%tc) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.tile <- (%literal_cli) sizes(%ts : i32) + llvm.return +} + + +// CHECK-LABEL: define void @tile_trivial_loop( +// CHECK-SAME: ptr %[[TMP0:.+]], i32 %[[TMP1:.+]], i32 %[[TMP2:.+]]) { +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]: +// CHECK-NEXT: %[[TMP4:.+]] = udiv i32 %[[TMP1:.+]], %[[TMP2:.+]] +// CHECK-NEXT: %[[TMP5:.+]] = urem i32 %[[TMP1:.+]], %[[TMP2:.+]] +// CHECK-NEXT: %[[TMP6:.+]] = icmp ne i32 %[[TMP5:.+]], 0 +// CHECK-NEXT: %[[TMP7:.+]] = zext i1 %[[TMP6:.+]] to i32 +// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT:.+]] = add nuw i32 %[[TMP4:.+]], %[[TMP7:.+]] +// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_HEADER]]: +// CHECK-NEXT: %[[OMP_FLOOR0_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR0_PREHEADER:.+]] ], [ %[[OMP_FLOOR0_NEXT:.+]], %[[OMP_FLOOR0_INC:.+]] ] +// 
CHECK-NEXT: br label %[[OMP_FLOOR0_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_COND]]: +// CHECK-NEXT: %[[OMP_FLOOR0_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR0_IV:.+]], %[[OMP_FLOOR0_TRIPCOUNT:.+]] +// CHECK-NEXT: br i1 %[[OMP_FLOOR0_CMP:.+]], label %[[OMP_FLOOR0_BODY:.+]], label %[[OMP_FLOOR0_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_BODY]]: +// CHECK-NEXT: %[[TMP8:.+]] = icmp eq i32 %[[OMP_FLOOR0_IV:.+]], %[[TMP4:.+]] +// CHECK-NEXT: %[[TMP9:.+]] = select i1 %[[TMP8:.+]], i32 %[[TMP5:.+]], i32 %[[TMP2:.+]] +// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_HEADER]]: +// CHECK-NEXT: %[[OMP_TILE0_IV:.+]] = phi i32 [ 0, %[[OMP_TILE0_PREHEADER:.+]] ], [ %[[OMP_TILE0_NEXT:.+]], %[[OMP_TILE0_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_TILE0_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_COND]]: +// CHECK-NEXT: %[[OMP_TILE0_CMP:.+]] = icmp ult i32 %[[OMP_TILE0_IV:.+]], %[[TMP9:.+]] +// CHECK-NEXT: br i1 %[[OMP_TILE0_CMP:.+]], label %[[OMP_TILE0_BODY:.+]], label %[[OMP_TILE0_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_BODY]]: +// CHECK-NEXT: %[[TMP10:.+]] = mul nuw i32 %[[TMP2:.+]], %[[OMP_FLOOR0_IV:.+]] +// CHECK-NEXT: %[[TMP11:.+]] = add nuw i32 %[[TMP10:.+]], %[[OMP_TILE0_IV:.+]] +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_BODY:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION]]: +// CHECK-NEXT: %[[TMP12:.+]] = getelementptr inbounds float, ptr %[[TMP0:.+]], i32 %[[TMP11:.+]] +// CHECK-NEXT: store float 4.200000e+01, ptr %[[TMP12:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT]]: +// CHECK-NEXT: br label %[[OMP_TILE0_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_INC]]: +// CHECK-NEXT: %[[OMP_TILE0_NEXT:.+]] = add nuw i32 %[[OMP_TILE0_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_EXIT]]: +// CHECK-NEXT: br label %[[OMP_TILE0_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_AFTER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_INC]]: +// CHECK-NEXT: %[[OMP_FLOOR0_NEXT:.+]] = add nuw i32 %[[OMP_FLOOR0_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_EXIT]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]: +// CHECK-NEXT: ret void +// CHECK-NEXT: } diff --git a/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir b/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir new file mode 100644 index 0000000..22c2973 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir @@ -0,0 +1,184 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope + + +llvm.func @tile_2d_loop(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32, %ts1: i32, %ts2: i32) -> () { + %literal_outer = omp.new_cli + %literal_inner = omp.new_cli + omp.canonical_loop(%literal_outer) %iv1 : i32 in range(%tc1) { + omp.canonical_loop(%literal_inner) %iv2 : i32 in range(%tc2) { + %idx = llvm.add %iv1, %iv2 : i32 + %ptr = llvm.getelementptr inbounds %baseptr[%idx] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = 
llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.terminator + } + omp.tile <- (%literal_outer, %literal_inner) sizes(%ts1, %ts2 : i32,i32) + llvm.return +} + + +// CHECK-LABEL: define void @tile_2d_loop( +// CHECK-SAME: ptr %[[TMP0:.+]], i32 %[[TMP1:.+]], i32 %[[TMP2:.+]], i32 %[[TMP3:.+]], i32 %[[TMP4:.+]]) { +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]: +// CHECK-NEXT: %[[TMP6:.+]] = udiv i32 %[[TMP1:.+]], %[[TMP3:.+]] +// CHECK-NEXT: %[[TMP7:.+]] = urem i32 %[[TMP1:.+]], %[[TMP3:.+]] +// CHECK-NEXT: %[[TMP8:.+]] = icmp ne i32 %[[TMP7:.+]], 0 +// CHECK-NEXT: %[[TMP9:.+]] = zext i1 %[[TMP8:.+]] to i32 +// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT:.+]] = add nuw i32 %[[TMP6:.+]], %[[TMP9:.+]] +// CHECK-NEXT: %[[TMP10:.+]] = udiv i32 %[[TMP2:.+]], %[[TMP4:.+]] +// CHECK-NEXT: %[[TMP11:.+]] = urem i32 %[[TMP2:.+]], %[[TMP4:.+]] +// CHECK-NEXT: %[[TMP12:.+]] = icmp ne i32 %[[TMP11:.+]], 0 +// CHECK-NEXT: %[[TMP13:.+]] = zext i1 %[[TMP12:.+]] to i32 +// CHECK-NEXT: %[[OMP_FLOOR1_TRIPCOUNT:.+]] = add nuw i32 %[[TMP10:.+]], %[[TMP13:.+]] +// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_HEADER:.+]]: +// CHECK-NEXT: %[[OMP_OMP_LOOP_IV:.+]] = phi i32 [ %[[OMP_OMP_LOOP_NEXT:.+]], %[[OMP_OMP_LOOP_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_COND]]: +// CHECK-NEXT: %[[OMP_OMP_LOOP_CMP:.+]] = icmp ult i32 %[[TMP19:.+]], %[[TMP1:.+]] +// CHECK-NEXT: br i1 %[[OMP_OMP_LOOP_CMP:.+]], label %[[OMP_OMP_LOOP_BODY:.+]], label %[[OMP_OMP_LOOP_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER1:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER1]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_BODY4:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_HEADER]]: +// CHECK-NEXT: %[[OMP_FLOOR0_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR0_PREHEADER:.+]] ], [ %[[OMP_FLOOR0_NEXT:.+]], %[[OMP_FLOOR0_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_FLOOR0_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_COND]]: +// CHECK-NEXT: %[[OMP_FLOOR0_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR0_IV:.+]], %[[OMP_FLOOR0_TRIPCOUNT:.+]] +// CHECK-NEXT: br i1 %[[OMP_FLOOR0_CMP:.+]], label %[[OMP_FLOOR0_BODY:.+]], label %[[OMP_FLOOR0_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_BODY]]: +// CHECK-NEXT: br label %[[OMP_FLOOR1_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR1_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_HEADER]]: +// CHECK-NEXT: %[[OMP_FLOOR1_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR1_PREHEADER:.+]] ], [ %[[OMP_FLOOR1_NEXT:.+]], %[[OMP_FLOOR1_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_FLOOR1_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_COND]]: +// CHECK-NEXT: %[[OMP_FLOOR1_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR1_IV:.+]], %[[OMP_FLOOR1_TRIPCOUNT:.+]] +// CHECK-NEXT: br i1 %[[OMP_FLOOR1_CMP:.+]], label %[[OMP_FLOOR1_BODY:.+]], label %[[OMP_FLOOR1_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_BODY]]: +// CHECK-NEXT: %[[TMP14:.+]] = icmp eq i32 %[[OMP_FLOOR0_IV:.+]], %[[TMP6:.+]] +// CHECK-NEXT: %[[TMP15:.+]] = select i1 
%[[TMP14:.+]], i32 %[[TMP7:.+]], i32 %[[TMP3:.+]] +// CHECK-NEXT: %[[TMP16:.+]] = icmp eq i32 %[[OMP_FLOOR1_IV:.+]], %[[TMP10:.+]] +// CHECK-NEXT: %[[TMP17:.+]] = select i1 %[[TMP16:.+]], i32 %[[TMP11:.+]], i32 %[[TMP4:.+]] +// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_HEADER]]: +// CHECK-NEXT: %[[OMP_TILE0_IV:.+]] = phi i32 [ 0, %[[OMP_TILE0_PREHEADER:.+]] ], [ %[[OMP_TILE0_NEXT:.+]], %[[OMP_TILE0_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_TILE0_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_COND]]: +// CHECK-NEXT: %[[OMP_TILE0_CMP:.+]] = icmp ult i32 %[[OMP_TILE0_IV:.+]], %[[TMP15:.+]] +// CHECK-NEXT: br i1 %[[OMP_TILE0_CMP:.+]], label %[[OMP_TILE0_BODY:.+]], label %[[OMP_TILE0_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_BODY]]: +// CHECK-NEXT: br label %[[OMP_TILE1_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_TILE1_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_HEADER]]: +// CHECK-NEXT: %[[OMP_TILE1_IV:.+]] = phi i32 [ 0, %[[OMP_TILE1_PREHEADER:.+]] ], [ %[[OMP_TILE1_NEXT:.+]], %[[OMP_TILE1_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_TILE1_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_COND]]: +// CHECK-NEXT: %[[OMP_TILE1_CMP:.+]] = icmp ult i32 %[[OMP_TILE1_IV:.+]], %[[TMP17:.+]] +// CHECK-NEXT: br i1 %[[OMP_TILE1_CMP:.+]], label %[[OMP_TILE1_BODY:.+]], label %[[OMP_TILE1_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_BODY]]: +// CHECK-NEXT: %[[TMP18:.+]] = mul nuw i32 %[[TMP3:.+]], %[[OMP_FLOOR0_IV:.+]] +// CHECK-NEXT: %[[TMP19:.+]] = add nuw i32 %[[TMP18:.+]], %[[OMP_TILE0_IV:.+]] +// CHECK-NEXT: %[[TMP20:.+]] = mul nuw i32 %[[TMP4:.+]], %[[OMP_FLOOR1_IV:.+]] +// CHECK-NEXT: %[[TMP21:.+]] = add nuw i32 %[[TMP20:.+]], %[[OMP_TILE1_IV:.+]] +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_BODY:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY4]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION12:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION12]]: +// CHECK-NEXT: %[[TMP22:.+]] = add i32 %[[TMP19:.+]], %[[TMP21:.+]] +// CHECK-NEXT: %[[TMP23:.+]] = getelementptr inbounds float, ptr %[[TMP0:.+]], i32 %[[TMP22:.+]] +// CHECK-NEXT: store float 4.200000e+01, ptr %[[TMP23:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT11:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT11]]: +// CHECK-NEXT: br label %[[OMP_TILE1_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_INC]]: +// CHECK-NEXT: %[[OMP_TILE1_NEXT:.+]] = add nuw i32 %[[OMP_TILE1_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_TILE1_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_EXIT]]: +// CHECK-NEXT: br label %[[OMP_TILE1_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_AFTER]]: +// CHECK-NEXT: br label %[[OMP_TILE0_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_INC]]: +// CHECK-NEXT: %[[OMP_TILE0_NEXT:.+]] = add nuw i32 %[[OMP_TILE0_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_EXIT]]: +// CHECK-NEXT: br label %[[OMP_TILE0_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_AFTER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR1_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_INC]]: +// CHECK-NEXT: %[[OMP_FLOOR1_NEXT:.+]] = add nuw i32 %[[OMP_FLOOR1_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_FLOOR1_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_EXIT]]: +// 
CHECK-NEXT: br label %[[OMP_FLOOR1_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_AFTER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_INC]]: +// CHECK-NEXT: %[[OMP_FLOOR0_NEXT:.+]] = add nuw i32 %[[OMP_FLOOR0_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_EXIT]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT:.+]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_INC]]: +// CHECK-NEXT: %[[OMP_OMP_LOOP_NEXT:.+]] = add nuw i32 %[[TMP19:.+]], 1 +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_EXIT]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]: +// CHECK-NEXT: ret void +// CHECK-NEXT: } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index e043a8c..00ee6b7 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -1340,6 +1340,34 @@ llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) { llvm.return } +// CHECK-LABEL: rocdl.cvt.scalef32.pk8 +// CHECK-SAME:(<8 x float> %[[V8F32:.+]], <8 x half> %[[V8F16:.+]], <8 x bfloat> %[[V8BF16:.+]], float %[[SCALE:.+]]) +llvm.func @rocdl.cvt.scalef32.pk8(%v8xf32: vector<8xf32>, %v8xf16: vector<8xf16>, %v8xbf16: vector<8xbf16>, %scale: f32) { + + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> %[[V8F32]], float %[[SCALE]]) + %0 = rocdl.cvt.scalef32.pk8.fp8.f32 %v8xf32, %scale : vector<2xi32> + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> %[[V8F32]], float %[[SCALE]]) + %1 = rocdl.cvt.scalef32.pk8.bf8.f32 %v8xf32, %scale : vector<2xi32> + // CHECK: call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> %[[V8F32]], float %[[SCALE]]) + %2 = rocdl.cvt.scalef32.pk8.fp4.f32 %v8xf32, %scale : i32 + + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> %[[V8F16]], float %[[SCALE]]) + %3 = rocdl.cvt.scalef32.pk8.fp8.f16 %v8xf16, %scale : vector<2xi32> + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> %[[V8F16]], float %[[SCALE]]) + %4 = rocdl.cvt.scalef32.pk8.bf8.f16 %v8xf16, %scale : vector<2xi32> + // CHECK: call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> %[[V8F16]], float %[[SCALE]]) + %5 = rocdl.cvt.scalef32.pk8.fp4.f16 %v8xf16, %scale : i32 + + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> %[[V8BF16]], float %[[SCALE]]) + %6 = rocdl.cvt.scalef32.pk8.fp8.bf16 %v8xbf16, %scale : vector<2xi32> + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> %[[V8BF16]], float %[[SCALE]]) + %7 = rocdl.cvt.scalef32.pk8.bf8.bf16 %v8xbf16, %scale : vector<2xi32> + // CHECK: call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> %[[V8BF16]], float %[[SCALE]]) + %8 = rocdl.cvt.scalef32.pk8.fp4.bf16 %v8xbf16, %scale : i32 + + llvm.return +} + // CHECK-LABEL: @rocdl.cvt.scale.pk16 // CHECK-SAME:(<3 x i32> %[[SRC0:.+]], i32 %[[SCALE:.+]]) llvm.func @rocdl.cvt.scale.pk16(%v3xi32: vector<3xi32>, %scale:i32) { diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt b/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt index 103bc94..7d32577 100644 --- 
a/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt @@ -12,5 +12,7 @@ add_mlir_library(MLIRTestIRDLToCppDialect mlir_target_link_libraries(MLIRTestIRDLToCppDialect PUBLIC MLIRIR MLIRPass + MLIRSCFDialect MLIRTransforms + MLIRTestDialect ) diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp b/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp index 9550e4c..421db7e 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp @@ -13,6 +13,7 @@ // #include "mlir/IR/Dialect.h" #include "mlir/IR/Region.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/Interfaces/InferTypeOpInterface.h" @@ -54,16 +55,34 @@ struct TestOpConversion : public OpConversionPattern<test_irdl_to_cpp::BeefOp> { } }; +struct TestRegionConversion + : public OpConversionPattern<test_irdl_to_cpp::ConditionalOp> { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(mlir::test_irdl_to_cpp::ConditionalOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Just exercising the C++ API even though these are not enforced in the + // dialect definition + assert(op.getThen().getBlocks().size() == 1); + assert(adaptor.getElse().getBlocks().size() == 1); + auto ifOp = scf::IfOp::create(rewriter, op.getLoc(), op.getInput()); + rewriter.replaceOp(op, ifOp); + return success(); + } +}; + struct ConvertTestDialectToSomethingPass : PassWrapper<ConvertTestDialectToSomethingPass, OperationPass<ModuleOp>> { void runOnOperation() override { MLIRContext *ctx = &getContext(); RewritePatternSet patterns(ctx); - patterns.add<TestOpConversion>(ctx); + patterns.add<TestOpConversion, TestRegionConversion>(ctx); ConversionTarget target(getContext()); - target.addIllegalOp<test_irdl_to_cpp::BeefOp>(); - target.addLegalOp<test_irdl_to_cpp::BarOp>(); - target.addLegalOp<test_irdl_to_cpp::HashOp>(); + target.addIllegalOp<test_irdl_to_cpp::BeefOp, + test_irdl_to_cpp::ConditionalOp>(); + target.addLegalOp<test_irdl_to_cpp::BarOp, test_irdl_to_cpp::HashOp, + scf::IfOp, scf::YieldOp>(); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) signalPassFailure(); @@ -73,6 +92,10 @@ struct ConvertTestDialectToSomethingPass StringRef getDescription() const final { return "Checks the convertability of an irdl dialect"; } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert<scf::SCFDialect>(); + } }; void registerIrdlTestDialect(mlir::DialectRegistry ®istry) { diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir b/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir index f6233ee..1915324 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir @@ -1,15 +1,29 @@ // RUN: mlir-opt %s --pass-pipeline="builtin.module(test-irdl-conversion-check)" | FileCheck %s // CHECK-LABEL: module { module { - // CHECK: func.func @test() { + // CHECK: func.func @test(%[[test_arg:[^ ]*]]: i1) { // CHECK: %[[v0:[^ ]*]] = "test_irdl_to_cpp.bar"() : () -> i32 // CHECK: %[[v1:[^ ]*]] = "test_irdl_to_cpp.bar"() : () -> i32 // CHECK: %[[v2:[^ ]*]] = "test_irdl_to_cpp.hash"(%[[v0]], %[[v0]]) : (i32, i32) -> i32 + // CHECK: scf.if %[[test_arg]] // CHECK: return // CHECK: } - func.func @test() { + 
func.func @test(%test_arg: i1) { %0 = "test_irdl_to_cpp.bar"() : () -> i32 %1 = "test_irdl_to_cpp.beef"(%0, %0) : (i32, i32) -> i32 + "test_irdl_to_cpp.conditional"(%test_arg) ({ + ^cond(%test: i1): + %3 = "test_irdl_to_cpp.bar"() : () -> i32 + "test.terminator"() : ()->() + }, { + ^then(%what: i1, %ever: i32): + %4 = "test_irdl_to_cpp.bar"() : () -> i32 + "test.terminator"() : ()->() + }, { + ^else(): + %5 = "test_irdl_to_cpp.bar"() : () -> i32 + "test.terminator"() : ()->() + }) : (i1) -> () return } diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir index 42e713e..85fb8cb 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir @@ -2,7 +2,7 @@ // CHECK: class TestIrdlToCpp irdl.dialect @test_irdl_to_cpp { - + // CHECK: class FooType irdl.type @foo @@ -32,4 +32,53 @@ irdl.dialect @test_irdl_to_cpp { irdl.operands(lhs: %0, rhs: %0) irdl.results(res: %0) } + + // CHECK: ConditionalOp declarations + // CHECK: ConditionalOpGenericAdaptorBase + // CHECK: ::mlir::Region &getCond() { return *getRegions()[0]; } + // CHECK: ::mlir::Region &getThen() { return *getRegions()[1]; } + // CHECK: ::mlir::Region &getElse() { return *getRegions()[2]; } + // + // CHECK: class ConditionalOp : public ::mlir::Op<ConditionalOp, ::mlir::OpTrait::NRegions<3>::Impl, ::mlir::OpTrait::OpInvariants> + // CHECK: ::mlir::Region &getCond() { return (*this)->getRegion(0); } + // CHECK: ::mlir::Region &getThen() { return (*this)->getRegion(1); } + // CHECK: ::mlir::Region &getElse() { return (*this)->getRegion(2); } + + // CHECK: ConditionalOp definitions + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_cond + // CHECK: if (!(region.getNumArguments() == 1)) { + // CHECK: failed to verify constraint: region with 1 entry block argument(s) + + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_then + // CHECK: if (!(true)) { + + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_else + // CHECK: if (!(region.getNumArguments() == 0)) { + // CHECK: failed to verify constraint: region with 0 entry block argument(s) + + // CHECK: ConditionalOp::build + // CHECK: for (unsigned i = 0; i != 3; ++i) + // CHECK-NEXT: (void)odsState.addRegion(); + + // CHECK: ConditionalOp::verifyInvariantsImpl + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_cond + // CHECK: failure + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_then + // CHECK: failure + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_else + // CHECK: failure + // CHECK: success + irdl.operation @conditional { + %r0 = irdl.region // Unconstrained region + %r1 = irdl.region() // Region with no entry block arguments + + // TODO(#161018): support irdl.is in irdl-to-cpp + // %v0 = irdl.is i1 // Type constraint: i1 (boolean) + %v0 = irdl.any + %r2 = irdl.region(%v0) // Region with one i1 entry block argument + irdl.regions(cond: %r2, then: %r0, else: %r1) + + %0 = irdl.any + irdl.operands(input: %0) + } } diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir index 403b492..cc27456 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir @@ -7,7 +7,7 @@ irdl.dialect 
@test_irdl_to_cpp { irdl.results(res: %1) } } -// ----- +// ----- irdl.dialect @test_irdl_to_cpp { irdl.operation @operands_no_any_of { @@ -42,7 +42,7 @@ irdl.dialect @test_irdl_to_cpp { irdl.dialect @test_irdl_to_cpp { irdl.type @ty { - %0 = irdl.any + %0 = irdl.any // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.parameters operation}} irdl.parameters(ty: %0) } @@ -51,29 +51,8 @@ irdl.dialect @test_irdl_to_cpp { // ----- irdl.dialect @test_irdl_to_cpp { - irdl.operation @test_op { - // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.region operation}} - %0 = irdl.region() - irdl.regions(reg: %0) - } - -} - -// ----- - -irdl.dialect @test_irdl_to_cpp { - irdl.operation @test_op { - // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.regions operation}} - irdl.regions() - } - -} - -// ----- - -irdl.dialect @test_irdl_to_cpp { irdl.type @test_derived { // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.base operation}} %0 = irdl.base "!builtin.integer" - } + } } diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 094ef0a..6ba7a00 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -173,8 +173,6 @@ struct TestXeGPUUnrollingPatterns #undef DEBUG_TYPE #define DEBUG_TYPE "test-xegpu-layout-interface" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") // Test pattern for distributing vector::StepOp from workgroup to subgroup. // Validates DistributeLayoutAttr interfaces for offset computation @@ -220,6 +218,35 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> { } }; +struct TestXeGPUSGDistribute + : public PassWrapper<TestXeGPUSGDistribute, + OperationPass<gpu::GPUModuleOp>> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUSGDistribute) + + StringRef getArgument() const final { return "test-xegpu-sg-distribute"; } + + StringRef getDescription() const final { + return "Test the implementation of XeGPU Subgroup Distribution"; + } + + void getDependentDialects(::mlir::DialectRegistry ®istry) const override { + registry.insert<arith::ArithDialect>(); + registry.insert<memref::MemRefDialect>(); + registry.insert<xegpu::XeGPUDialect>(); + registry.insert<vector::VectorDialect>(); + registry.insert<index::IndexDialect>(); + } + + TestXeGPUSGDistribute() = default; + TestXeGPUSGDistribute(const TestXeGPUSGDistribute &pass) = default; + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + xegpu::populateXeGPUSubgroupDistributePatterns(patterns); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); + } +}; + struct TestXeGPULayoutInterface : public PassWrapper<TestXeGPULayoutInterface, OperationPass<gpu::GPUModuleOp>> { @@ -284,6 +311,7 @@ namespace test { void registerTestXeGPULowerings() { PassRegistration<TestXeGPUUnrollingPatterns>(); PassRegistration<TestXeGPULayoutInterface>(); + PassRegistration<TestXeGPUSGDistribute>(); } } // namespace test } // namespace mlir diff --git a/mlir/test/mlir-tblgen/op-format-invalid.td b/mlir/test/mlir-tblgen/op-format-invalid.td index 2f29543..0a022ad 100644 --- a/mlir/test/mlir-tblgen/op-format-invalid.td +++ b/mlir/test/mlir-tblgen/op-format-invalid.td @@ -307,7 +307,7 @@ def DirectiveTypeZOperandInvalidI : TestFormat_Op<[{ def LiteralInvalidA : 
TestFormat_Op<[{ `a:` }]>; -// CHECK: error: expected valid literal but got '1': single character literal must be a letter or one of '_:,=<>()[]{}?+*' +// CHECK: error: expected valid literal but got '1': single character literal must be a letter or one of '_:,=<>()[]{}?+-*' def LiteralInvalidB : TestFormat_Op<[{ `1` }]>; diff --git a/mlir/test/mlir-tblgen/op-format-spec.td b/mlir/test/mlir-tblgen/op-format-spec.td index 1541cd0..1ac2311 100644 --- a/mlir/test/mlir-tblgen/op-format-spec.td +++ b/mlir/test/mlir-tblgen/op-format-spec.td @@ -123,7 +123,7 @@ def DirectiveTypeValid : TestFormat_Op<[{ // CHECK-NOT: error def LiteralValid : TestFormat_Op<[{ - `_` `:` `,` `=` `<` `>` `(` `)` `[` `]` `?` `+` `*` ` ` `` `->` `\n` `abc$._` + `_` `:` `,` `=` `<` `>` `(` `)` `[` `]` `?` `+` `-` `*` ` ` `` `->` `\n` `abc$._` attr-dict }]>; diff --git a/mlir/test/python/dialects/transform_tune_ext.py b/mlir/test/python/dialects/transform_tune_ext.py index dfb9359..eb2a083 100644 --- a/mlir/test/python/dialects/transform_tune_ext.py +++ b/mlir/test/python/dialects/transform_tune_ext.py @@ -1,21 +1,21 @@ # RUN: %PYTHON %s | FileCheck %s -from mlir.ir import * +from mlir import ir from mlir.dialects import transform from mlir.dialects.transform import tune, debug def run(f): - print("\nTEST:", f.__name__) - with Context(), Location.unknown(): - module = Module.create() - with InsertionPoint(module.body): + print("\n// TEST:", f.__name__) + with ir.Context(), ir.Location.unknown(): + module = ir.Module.create() + with ir.InsertionPoint(module.body): sequence = transform.SequenceOp( transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get(), ) - with InsertionPoint(sequence.body): + with ir.InsertionPoint(sequence.body): f(sequence.bodyTarget) transform.YieldOp() print(module) @@ -29,10 +29,10 @@ def testKnobOp(target): # CHECK: %[[HEADS_OR_TAILS:.*]] = transform.tune.knob<"coin"> options = [true, false] -> !transform.any_param heads_or_tails = tune.KnobOp( - result=any_param, name=StringAttr.get("coin"), options=[True, False] + result=any_param, name=ir.StringAttr.get("coin"), options=[True, False] ) # CHECK: transform.tune.knob<"animal"> options = ["cat", "dog", unit] -> !transform.any_param - tune.KnobOp(any_param, name="animal", options=["cat", "dog", UnitAttr.get()]) + tune.KnobOp(any_param, name="animal", options=["cat", "dog", ir.UnitAttr.get()]) # CHECK: transform.tune.knob<"tile_size"> options = [2, 4, 8, 16, 24, 32] -> !transform.any_param tune.KnobOp(any_param, "tile_size", [2, 4, 8, 16, 24, 32]) # CHECK: transform.tune.knob<"magic_value"> options = [2.000000e+00, 2.250000e+00, 2.500000e+00, 2.750000e+00, 3.000000e+00] -> !transform.any_param @@ -45,7 +45,10 @@ def testKnobOp(target): heads = tune.KnobOp(any_param, "coin", options=[True, False], selected=True) # CHECK: transform.tune.knob<"animal"> = "dog" from options = ["cat", "dog", unit] -> !transform.any_param tune.KnobOp( - any_param, name="animal", options=["cat", "dog", UnitAttr.get()], selected="dog" + any_param, + name="animal", + options=["cat", "dog", ir.UnitAttr.get()], + selected="dog", ) # CHECK: transform.tune.knob<"tile_size"> = 8 : i64 from options = [2, 4, 8, 16, 24, 32] -> !transform.any_param tune.KnobOp(any_param, "tile_size", [2, 4, 8, 16, 24, 32], selected=8) @@ -57,16 +60,90 @@ def testKnobOp(target): # CHECK: transform.tune.knob<"range_as_a_dict"> = 4 : i64 from options = {start = 2 : i64, step = 2 : i64, stop = 16 : i64} -> !transform.any_param # NB: Membership of `selected` in non-ArrayAttr `options` is 
_not_ verified. - i64 = IntegerType.get_signless(64) + i64 = ir.IntegerType.get_signless(64) tune.knob( any_param, "range_as_a_dict", - DictAttr.get( + ir.DictAttr.get( { - "start": IntegerAttr.get(i64, 2), - "stop": IntegerAttr.get(i64, 16), - "step": IntegerAttr.get(i64, 2), + "start": ir.IntegerAttr.get(i64, 2), + "stop": ir.IntegerAttr.get(i64, 16), + "step": ir.IntegerAttr.get(i64, 2), } ), selected=4, ) + + +# CHECK-LABEL: TEST: testAlternativesOp +@run +def testAlternativesOp(target): + any_param = transform.AnyParamType.get() + + # CHECK: %[[LEFT_OR_RIGHT_OUTCOME:.*]] = transform.tune.alternatives<"left_or_right"> -> !transform.any_param { + left_or_right = tune.AlternativesOp( + [transform.AnyParamType.get()], "left_or_right", 2 + ) + idx_for_left, idx_for_right = 0, 1 + with ir.InsertionPoint(left_or_right.alternatives[idx_for_left].blocks[0]): + # CHECK: %[[C0:.*]] = transform.param.constant 0 + i32_0 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 0) + c0 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_0) + # CHECK: transform.yield %[[C0]] + transform.yield_(c0) + # CHECK-NEXT: }, { + with ir.InsertionPoint(left_or_right.alternatives[idx_for_right].blocks[0]): + # CHECK: %[[C1:.*]] = transform.param.constant 1 + i32_1 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 1) + c1 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_1) + # CHECK: transform.yield %[[C1]] + transform.yield_(c1) + # CHECK-NEXT: } + outcome_of_left_or_right_decision = left_or_right.results[0] + + # CHECK: transform.tune.alternatives<"fork_in_the_road"> selected_region = 0 -> !transform.any_param { + fork_in_the_road = tune.AlternativesOp( + [transform.AnyParamType.get()], "fork_in_the_road", 2, selected_region=0 + ) + with ir.InsertionPoint(fork_in_the_road.alternatives[idx_for_left].blocks[0]): + # CHECK: %[[C0:.*]] = transform.param.constant 0 + i32_0 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 0) + c0 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_0) + # CHECK: transform.yield %[[C0]] + transform.yield_(c0) + # CHECK-NEXT: }, { + with ir.InsertionPoint(fork_in_the_road.alternatives[idx_for_right].blocks[0]): + # CHECK: %[[C1:.*]] = transform.param.constant 1 + i32_1 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 1) + c1 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_1) + # CHECK: transform.yield %[[C1]] + transform.yield_(c1) + # CHECK-NEXT: } + + # CHECK: transform.tune.alternatives<"left_or_right_as_before"> selected_region = %[[LEFT_OR_RIGHT_OUTCOME]] : !transform.any_param { + left_or_right_as_before = tune.AlternativesOp( + [], + "left_or_right_as_before", + 2, + selected_region=outcome_of_left_or_right_decision, + ) + with ir.InsertionPoint( + left_or_right_as_before.alternatives[idx_for_left].blocks[0] + ): + # CHECK: transform.param.constant 1337 + i32_1337 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 1337) + c1337 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_1337) + # CHECK: transform.debug.emit_param_as_remark + debug.emit_param_as_remark(c1337) + transform.yield_([]) + # CHECK-NEXT: }, { + with ir.InsertionPoint( + left_or_right_as_before.alternatives[idx_for_right].blocks[0] + ): + # CHECK: transform.param.constant 42 + i32_42 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 42) + c42 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_42) + # CHECK: transform.debug.emit_param_as_remark + debug.emit_param_as_remark(c42) + transform.yield_([]) + # CHECK-NEXT: } diff --git 
a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py index 4a3625c..cb4cfc8c 100644 --- a/mlir/test/python/ir/operation.py +++ b/mlir/test/python/ir/operation.py @@ -696,6 +696,7 @@ def testOperationPrint(): # CHECK: resource1: "0x08 module.operation.print(large_elements_limit=2) + # CHECK-LABEL: TEST: testKnownOpView @run def testKnownOpView(): @@ -969,6 +970,13 @@ def testOperationLoc(): assert op.location == loc assert op.operation.location == loc + another_loc = Location.name("another_loc") + op.location = another_loc + assert op.location == another_loc + assert op.operation.location == another_loc + # CHECK: loc("another_loc") + print(op.location) + # CHECK-LABEL: TEST: testModuleMerge @run |