Diffstat (limited to 'mlir/test')
-rw-r--r--  mlir/test/Dialect/OpenACC/ops.mlir                      36
-rw-r--r--  mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir  575
-rw-r--r--  mlir/test/Dialect/XeGPU/subgroup-distribute.mlir       570
-rw-r--r--  mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir         127
-rw-r--r--  mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir         262
-rw-r--r--  mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp     30
6 files changed, 938 insertions, 662 deletions
diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir
index cb69058..1484d7e 100644
--- a/mlir/test/Dialect/OpenACC/ops.mlir
+++ b/mlir/test/Dialect/OpenACC/ops.mlir
@@ -358,6 +358,41 @@ func.func @acc_loop_multiple_block() {
// -----
+acc.firstprivate.recipe @firstprivatization_memref_10xf32 : memref<10xf32> init {
+^bb0(%arg0: memref<10xf32>):
+ %0 = memref.alloca() : memref<10xf32>
+ acc.yield %0 : memref<10xf32>
+} copy {
+^bb0(%arg0: memref<10xf32>, %arg1: memref<10xf32>):
+ memref.copy %arg0, %arg1 : memref<10xf32> to memref<10xf32>
+ acc.terminator
+} destroy {
+^bb0(%arg0: memref<10xf32>):
+ acc.terminator
+}
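+// The init region allocates per-iteration stack storage with memref.alloca,
+// the copy region implements the firstprivate copy-in with memref.copy, and
+// the destroy region is empty since alloca storage needs no explicit cleanup.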
+
+func.func @testloopfirstprivate(%a: memref<10xf32>, %b: memref<10xf32>) -> () {
+ %c0 = arith.constant 0 : index
+ %c10 = arith.constant 10 : index
+ %c1 = arith.constant 1 : index
+ %firstprivate = acc.firstprivate varPtr(%a : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32>
+ acc.loop firstprivate(@firstprivatization_memref_10xf32 -> %firstprivate : memref<10xf32>) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
+ "test.openacc_dummy_op"() : () -> ()
+ acc.yield
+ } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+ return
+}
+
+// CHECK-LABEL: func.func @testloopfirstprivate(
+// CHECK-SAME: %[[ARG0:.*]]: memref<10xf32>, %[[ARG1:.*]]: memref<10xf32>)
+// CHECK: %[[FIRSTPRIVATE:.*]] = acc.firstprivate varPtr(%[[ARG0]] : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32>
+// CHECK: acc.loop firstprivate(@firstprivatization_memref_10xf32 -> %[[FIRSTPRIVATE]] : memref<10xf32>) control(%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) {
+// CHECK: "test.openacc_dummy_op"() : () -> ()
+// CHECK: acc.yield
+// CHECK: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+
+// -----
+
acc.private.recipe @privatization_memref_10_f32 : memref<10xf32> init {
^bb0(%arg0: memref<10xf32>):
%0 = memref.alloc() : memref<10xf32>
@@ -535,6 +570,7 @@ acc.firstprivate.recipe @firstprivatization_memref_10xf32 : memref<10xf32> init
acc.yield %0 : memref<10xf32>
} copy {
^bb0(%arg0: memref<10xf32>, %arg1: memref<10xf32>):
+ memref.copy %arg0, %arg1 : memref<10xf32> to memref<10xf32>
acc.terminator
} destroy {
^bb0(%arg0: memref<10xf32>):
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
new file mode 100644
index 0000000..40b66d1
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -0,0 +1,575 @@
+// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -test-xegpu-sg-distribute -allow-unregistered-dialect \
+// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s
+
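+// Each test below wraps the op under test in a gpu.warp_execute_on_lane_0
+// region and checks that the -test-xegpu-sg-distribute patterns rewrite it
+// into its single-lane SIMT form, splitting vectors across the 16 lanes
+// according to the attached #xegpu.layout lane_layout/lane_data attributes.
+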
+// CHECK-LABEL: gpu.func @store_nd_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16]
+// CHECK-SAME: -> (vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) {
+// CHECK: gpu.yield %{{.*}} : vector<16xf32>,
+// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32,
+// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch}
+// CHECK-NEXT: xegpu.store_nd %[[W]]#0, %[[T1]][%[[W]]#2] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+gpu.module @xevm_module{
+ gpu.func @store_nd_1d(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ %cst = "some_op"() : () -> vector<16xf32>
+ xegpu.store_nd %cst, %0 [%c0] {layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ }
+ gpu.return
+ }
+}
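+// Note how the vector<16xf32> store is split so that each lane writes a
+// vector<1xf32> fragment; the unrealized_conversion_cast only reconciles the
+// layout-annotated descriptor type with its distributed SIMT counterpart.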
+
+// -----
+// CHECK-LABEL: gpu.func @store_nd_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16]
+// CHECK-SAME: -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
+// CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[W]]#0 : vector<16x1xf16> to vector<16xf16>
+// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
+// CHECK-NEXT: xegpu.store_nd %[[CAST]], %[[T1]][%[[W]]#2, %[[W]]#3] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @xevm_module{
+ gpu.func @store_nd_2d(%laneid : index) {
+ %c0 = arith.constant 0 : index
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %cst = "some_op"() : () -> vector<16x16xf16>
+ xegpu.store_nd %cst, %0 [%c0, %c0] {layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ }
+ gpu.return
+ }
+}
+
+
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<1xf32>,
+// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) {
+// CHECK: gpu.yield %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32,
+// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32,
+// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch}
+// CHECK-NEXT: xegpu.load_nd %[[T1]][%[[W]]#2] : !xegpu.tensor_desc<16xf32> -> vector<1xf32>
+gpu.module @xevm_module{
+ gpu.func @load_nd_1d(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32>
+ gpu.yield %1 : vector<16xf32>
+ }
+ "some_user_op"(%r) : (vector<1xf32>) -> ()
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
+// CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
+// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: vector.shape_cast %[[T2]] : vector<16xf16> to vector<16x1xf16>
+gpu.module @xevm_module{
+ gpu.func @load_nd_2d(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ gpu.yield %1 : vector<16x16xf16>
+ }
+ "some_user_op"(%r) : (vector<16x1xf16>) -> ()
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_array_length
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<2x16x1xf16>,
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
+// CHECK: gpu.yield %{{.*}} : vector<2x16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<
+// CHECK-SAME: array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16],
+// CHECK-SAME: lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
+// CHECK-NEXT: vector.shape_cast %[[T2]] : vector<32xf16> to vector<2x16x1xf16>
+gpu.module @xevm_module{
+ gpu.func @load_nd_array_length(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16x1xf16>) {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
+ #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
+ #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16>
+ gpu.yield %1 : vector<2x16x16xf16>
+ }
+ "some_user_op"(%r) : (vector<2x16x1xf16>) -> ()
+ gpu.return
+ }
+}
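+// With array_length = 2 the per-lane load yields a flat vector<32xf16> (two
+// 16x1 fragments back to back) that is shape_cast back to the distributed
+// vector<2x16x1xf16> form.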
+
+// -----
+// CHECK-LABEL: gpu.func @dpas
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] ->
+// CHECK-SAME: (vector<8x1xf32>, vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) {
+// CHECK: gpu.yield %{{.*}} : vector<8x16xf32>, vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>
+// CHECK-NEXT: }
+// CHECK-DAG: %[[T1:.*]] = vector.shape_cast %[[W]]#1 : vector<8x1xf16> to vector<8xf16>
+// CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[W]]#2 : vector<16x1xf16> to vector<16xf16>
+// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[W]]#3 : vector<8x1xf32> to vector<8xf32>
+// CHECK-NEXT: %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T2]], %[[T3]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
+// CHECK-NEXT: vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
+gpu.module @xevm_module{
+ gpu.func @dpas(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
+ %0 = "some_op"() : () -> vector<8x16xf16>
+ %1 = "some_op"() : () -> vector<16x16xf16>
+ %2 = "some_op"() : () -> vector<8x16xf32>
+ %3 = xegpu.dpas %0, %1, %2
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_operand_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ gpu.yield %3 : vector<8x16xf32>
+ }
+ "some_user_op"(%r) : (vector<8x1xf32>) -> ()
+ gpu.return
+ }
+}
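+// The dpas operands arrive as per-lane columns (vector<8x1xf16>,
+// vector<16x1xf16>, vector<8x1xf32>); shape_casts flatten them to 1-D vectors
+// for the lane-level dpas and restore the 2-D form for the result.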
+
+
+// -----
+// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG1]])[16] -> (!xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64) {
+// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[W]]#1, shape : [64, 128], strides : [128, 1] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+// CHECK-NEXT: builtin.unrealized_conversion_cast %[[T1]] : !xegpu.tensor_desc<16x16xf16> to !xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> {resolve_simt_type_mismatch}
+gpu.module @xevm_module{
+ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+ %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 ->
+ !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.yield %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ }
+ "some_user_op"(%r)
+ : (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) -> ()
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @prefetch_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
+// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-SAME: , index, index
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16x16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
+// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1, %[[W]]#2]
+// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16>
+gpu.module @xevm_module{
+ gpu.func @prefetch_2d(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %0 = "some_op"() : ()
+ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.prefetch_nd %0[%c0, %c0]
+ <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ }
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @prefetch_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) {
+// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16xf16,
+// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf16> {resolve_simt_type_mismatch}
+// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1] <{l1_hint = #xegpu.cache_hint<cached>,
+// CHECK-SAME: l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16>
+gpu.module @xevm_module{
+ gpu.func @prefetch_1d(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %0 = "some_op"() : ()
+ -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ xegpu.prefetch_nd %0[%c0]
+ <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ }
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) {
+// CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
+// CHECK: gpu.yield %{{.*}}
+// CHECK: }
+// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
+// CHECK: gpu.barrier
+gpu.module @xevm_module{
+ gpu.func @gpu_barrier(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ %1 = xegpu.load_nd %0[%c0]
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16>
+ gpu.barrier
+ gpu.yield %1 : vector<16xf16>
+ }
+ "some_user_op"(%r) : (vector<1xf16>) -> ()
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction
+// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
+// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
+// CHECK-SAME: -> (vector<2xf32>, vector<16x2xf32>, vector<2xf32>) {
+// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x32xf32>
+// CHECK: gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<16x32xf32>, vector<32xf32>
+// CHECK-NEXT: }
+// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
+// CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16x1xf32> to vector<16xf32>
+// CHECK: %[[T3:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32>
+// CHECK: %[[T4:.*]] = vector.reduction <add>, %[[T2]], %[[T3]] : vector<16xf32> into f32
+// CHECK: %[[T5:.*]] = vector.extract_strided_slice %[[W]]#1
+// CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<16x1xf32> to vector<16xf32>
+// CHECK: %[[T7:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32>
+// CHECK: %[[T8:.*]] = vector.reduction <add>, %[[T6]], %[[T7]] : vector<16xf32> into f32
+// CHECK: %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<2xf32>
+gpu.module @xevm_module{
+gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+ %src = "some_def"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> (vector<16x32xf32>)
+ %acc = arith.constant
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
+ dense<0.0> : vector<32xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
+ } [0]
+ : vector<16x32xf32> to vector<32xf32>
+ gpu.yield %1 : vector<32xf32>
+ }
+ "some_user_op"(%r) : (vector<2xf32>) -> ()
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK-NEXT: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32>
+// CHECK-NEXT: %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32>
+// CHECK-NEXT: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %cst : vector<16xf32> into f32
+// CHECK-NEXT: %[[T4:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
+// CHECK-NEXT: %[[T5:.*]] = vector.reduction <add>, %[[T4]], %cst : vector<16xf32> into f32
+// CHECK-NEXT: %[[T6:.*]] = vector.from_elements %[[T3]], %[[T5]] : vector<2xf32>
+// CHECK-NEXT: gpu.yield %[[T6]] : vector<2xf32>
+// CHECK-NEXT: }
+gpu.module @xevm_module{
+gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+ %src = "some_def"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> (vector<2x16xf32>)
+ %acc = arith.constant
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+ dense<0.0> : vector<2xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>,
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
+ }
+ [1] : vector<2x16xf32> to vector<2xf32>
+ gpu.yield %1 : vector<2xf32>
+ }
+ "some_user_op"(%r) : (vector<2xf32>) -> ()
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
+// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
+// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {
+// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<32x16xf32>
+// CHECK: gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<32x16xf32>, vector<32xf32>
+// CHECK: }
+// CHECK: %[[T1:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32>
+// CHECK: %[[T2:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32>
+// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T1]], %[[T2]] : vector<16xf32> into f32
+// CHECK: %[[T4:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32>
+// CHECK: %[[T5:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32>
+// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T4]], %[[T5]] : vector<16xf32> into f32
+// CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32>
+gpu.module @xevm_module{
+gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+ %src = "some_def"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ : () -> (vector<32x16xf32>)
+ %acc = arith.constant
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>}
+ dense<0.0> : vector<32xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>,
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>
+ }
+ [1] : vector<32x16xf32> to vector<32xf32>
+ gpu.yield %1 : vector<32xf32>
+ }
+ "some_user_op"(%r) : (vector<2xf32>) -> ()
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x2xf32>
+// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] {{.*}} : vector<16x1xf32> to vector<16xf32>
+// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32
+// CHECK: %[[T4:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] {{.*}} : vector<16x1xf32> to vector<16xf32>
+// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32
+// CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32>
+// CHECK: gpu.yield %[[T7]] : vector<2xf32>
+// CHECK: }
+gpu.module @xevm_module{
+gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+ %src = "some_def"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ : () -> (vector<16x2xf32>)
+ %acc = arith.constant
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+ dense<0.0> : vector<2xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>,
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
+ }
+ [0] : vector<16x2xf32> to vector<2xf32>
+ gpu.yield %1 : vector<2xf32>
+ }
+ "some_user_op"(%r) : (vector<2xf32>) -> ()
+ gpu.return
+}
+}
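+// The four multi_reduction cases above follow one rule: a reduction along the
+// non-distributed dimension is lane-local, so it sinks out of the warp region
+// and each lane reduces its own fragment; a reduction along the distributed
+// dimension needs data from every lane, so it stays inside the warp region.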
+
+// -----
+// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) {
+// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex>
+// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1>
+// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
+// CHECK-SAME: -> (vector<1x8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) {
+// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] :
+// CHECK-SAME: vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
+// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
+// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+gpu.module @xevm_module{
+ gpu.func @scatter_ops_chunksize(%laneid: index, %src: memref<256xf16>) {
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %1 = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ dense<1>: vector<16xi1>
+ %offset = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ dense<12> : vector<16xindex>
+ %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
+ {
+ layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+ }
+ : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+ xegpu.store %3, %src[%offset], %1 <{chunk_size=8}>
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>,
+ layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+ }
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) {
+// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex>
+// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1>
+// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
+// CHECK-SAME: -> (vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) {
+// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]]
+// CHECK-SAME: : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3
+// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3
+// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+gpu.module @xevm_module{
+ gpu.func @scatter_ops(%src: memref<256xf16>, %laneid: index) {
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %1 = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ dense<1> : vector<16xi1>
+ %offset = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ dense<12> : vector<16xindex>
+ %3 = xegpu.load %src[%offset], %1
+ {
+ layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+ xegpu.store %3, %src[%offset], %1
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+ }
+ gpu.return
+ }
+}
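+// For scatter ops the offset and mask vectors distribute 16 -> 1 per lane;
+// with chunk_size = 8 each lane keeps its whole 8-element chunk, so
+// vector<16x8xf16> becomes vector<1x8xf16> (loaded as a flat vector<8xf16>).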
+
+// -----
+// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index(
+// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (index, memref<256x256xf16>) {
+// CHECK: gpu.yield %{{.*}}, %{{.*}} : index, memref<256x256xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[W]]#1 : memref<256x256xf16> -> index
+// CHECK-NEXT: arith.index_cast %[[INTPTR]] : index to i64
+gpu.module @xevm_module{
+ gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) {
+ %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index
+ gpu.yield %ptr : index
+ }
+ %ptr_i64 = arith.index_cast %r : index to i64
+ "some_user_op"(%ptr_i64) : (i64) -> ()
+ gpu.return
+ }
+}
+
+
+// -----
+// CHECK-LABEL: gpu.func @vector_transpose(
+// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2x1xf32>, vector<1x2xf32>) {
+// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<16x2xf32>
+// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<2x16xf32>, vector<16x2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = vector.transpose %[[W]]#1, [1, 0] : vector<1x2xf32> to vector<2x1xf32>
+gpu.module @xevm_module{
+ gpu.func @vector_transpose(%arg0: memref<2x16xf32>, %laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ : () -> (vector<16x2xf32>)
+ %transpose = vector.transpose %cst, [1, 0]
+ {
+        layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16x2xf32> to vector<2x16xf32>
+ gpu.yield %transpose : vector<2x16xf32>
+ }
+ "some_user_op"(%r) : (vector<2x1xf32>) -> ()
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_bitcast(
+// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<4x1xi16>, vector<4x2xi8>) {
+// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<4x32xi8>
+// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<4x16xi16>, vector<4x32xi8>
+// CHECK: }
+// CHECK: vector.bitcast %[[W]]#1 : vector<4x2xi8> to vector<4x1xi16>
+gpu.module @xevm_module{
+ gpu.func @vector_bitcast(%arg0: memref<4x16xi16>, %laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+ : () -> (vector<4x32xi8>)
+ %bitcast = vector.bitcast %cst
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<4x32xi8> to vector<4x16xi16>
+ gpu.yield %bitcast : vector<4x16xi16>
+ }
+ "some_user_op"(%r) : (vector<4x1xi16>) -> ()
+ gpu.return
+ }
+}
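+// vector.transpose and vector.bitcast distribute by replaying the op on the
+// lane-local fragment: the transpose swaps the distributed dimension
+// (vector<1x2xf32> to vector<2x1xf32> per lane) and the bitcast narrows
+// vector<4x2xi8> to vector<4x1xi16>.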
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 59fac26..0e1365a 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -1,198 +1,76 @@
// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -xegpu-subgroup-distribute \
// RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s
-// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \
-// RUN: -xegpu-subgroup-distribute="enable-sg-reductions=false" -allow-unregistered-dialect \
-// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION
-
-// CHECK-LABEL: gpu.func @store_nd_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
-// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
-// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
-// CHECK: gpu.return
-gpu.module @xevm_module{
- gpu.func @store_nd_1d(%arg0: memref<16xf32>) {
- %c0 = arith.constant 0 : index
- %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf32>
- %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- xegpu.store_nd %cst, %0 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @store_nd_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
-// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16>
-// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.module @xevm_module{
- gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) {
- %c0 = arith.constant 0 : index
- %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf16>
- %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %cst, %0 [%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-
-
-// -----
-// CHECK-LABEL: gpu.func @load_nd_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32>
-// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
-gpu.module @xevm_module{
- gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32>
- %2 = xegpu.create_nd_tdesc %arg1 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- xegpu.store_nd %1, %2 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_nd_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.module @xevm_module{
- gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
- %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %1, %2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_nd_array_length
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
-// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16>
-// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16>
-// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16>
-// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.module @xevm_module{
- gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16>
- %2 = vector.extract %1[%c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16> from vector<2x16x16xf16>
- %3 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %2, %3[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_dpas_store
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
-// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
-gpu.module @xevm_module{
- gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
- %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
- %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %4, %5[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-
-// -----
// CHECK-LABEL: gpu.func @load_dpas_postop_store
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
-// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
-// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32>
-// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32>
+// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
gpu.module @xevm_module{
gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
- %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
- %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- %5 = math.exp %4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>
- %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
+ -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.load_nd %0[%c0, %c0]
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+ !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+
+ %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16>
+ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ %3 = xegpu.load_nd %2[%c0, %c0]
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ -> vector<16x16xf16>
+
+ %4 = xegpu.dpas %1, %3
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// -----
-// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index,
-// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index,
-// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.module @xevm_module{
- gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
- %2 = xegpu.create_nd_tdesc %arg1, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %1, %2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %5 = math.exp %4
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<8x16xf32>
+
+ %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>,
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
}
// -----
-// TODO: gemm does not use update_nd_offset because of an issue in scf-for distribution.
// CHECK-LABEL: gpu.func @gemm
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
-// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x
-// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
-// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
-// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32>
-// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) {
-// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
-// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
-// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
-// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
-// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
+// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x
+// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
+// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
+// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]])
+// CHECK-SAME: -> (vector<8x1xf32>) {
+// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]]
+// CHECK-SAME: : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
+// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
+// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
gpu.module @xevm_module{
gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
%c0 = arith.constant 0 : index
@@ -203,213 +81,56 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
%block_id_y = gpu.block_id y
%0 = arith.muli %block_id_x, %c8 : index
%1 = arith.muli %block_id_y, %c16 : index
- %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %3 = xegpu.load_nd %2[%0, %1] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
- %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
- %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- %7 = xegpu.load_nd %5[%0, %arg3] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
- %8 = xegpu.load_nd %6[%arg3, %1] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
- %9 = xegpu.dpas %7, %8, %arg4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
- scf.yield %9 : vector<8x16xf32>
- } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
-}
-}
-// -----
-// CHECK-LABEL: gpu.func @prefetch_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16>
-gpu.module @xevm_module{
- gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.prefetch_nd %0[%c0, %c0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @prefetch_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
-// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16>
-gpu.module @xevm_module{
- gpu.func @prefetch_1d(%arg0: memref<256xf16>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0: memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- xegpu.prefetch_nd %0[%c0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- gpu.return
- }
-}
-// -----
-// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
-// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
-// CHECK-NEXT: gpu.barrier
-// CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
-// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<1xf16>, !xegpu.tensor_desc<16xf16>
-gpu.module @xevm_module{
- gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %1 = xegpu.load_nd %0[%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16>
- gpu.barrier
- %2 = xegpu.create_nd_tdesc %arg1 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- xegpu.store_nd %1, %2[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- gpu.return
- }
-}
-// -----
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] ->
-// CHECK-SAME: (!xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, vector<16x2xf32>) {
-// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x32xf32>
-// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, vector<16x32xf32>
-// CHECK-NEXT: }
-// CHECK: %[[COL0:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32>
-// CHECK-NEXT: %[[RED0:.*]] = vector.reduction <add>, %[[CAST0]], %{{.*}} : vector<16xf32> into f32
-// CHECK: %[[COL1:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32>
-// CHECK-NEXT: %[[RED1:.*]] = vector.reduction <add>, %[[CAST1]], %{{.*}} : vector<16xf32> into f32
-// CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32>
-gpu.module @xevm_module{
-gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() {
- %c0 = arith.constant 0 : index
- %0 = "some_def"() : () -> !xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> (vector<16x32xf32>)
- %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} dense<0.0> : vector<32xf32>
- %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0]
- : vector<16x32xf32> to vector<32xf32>
- %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<32xf32> to vector<1x32xf32>
- xegpu.store_nd %3, %0[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
-}
-}
-// -----
-// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<2x16xf32,
-// CHECK-REDUCTION-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, f32, f32) {
-// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<2x16xf32>
-// CHECK-REDUCTION-NEXT: %[[ROW0:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32>
-// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction <add>, %[[ROW0]], %{{.*}} : vector<16xf32> into f32
-// CHECK-REDUCTION-NEXT: %[[ROW1:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
-// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction <add>, %[[ROW1]], %{{.*}} : vector<16xf32> into f32
-// CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, f32, f32
-// CHECK-REDUCTION-NEXT: }
-// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32>
-gpu.module @xevm_module{
-gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() {
- %c0 = arith.constant 0 : index
- %0 = "some_def"() : () -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> (vector<2x16xf32>)
- %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} dense<0.0> : vector<2xf32>
- %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
- [1] : vector<2x16xf32> to vector<2xf32>
- %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<2xf32> to vector<2x1xf32>
- %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<2x1xf32> to vector<2x16xf32>
- xegpu.store_nd %4, %0[%c0, %c0] : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
-}
-}
-// -----
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%0)[16] ->
-// CHECK-SAME: (!xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, vector<2x16xf32>) {
-// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> vector<32x16xf32>
-// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, vector<32x16xf32>
-// CHECK-NEXT: }
-// CHECK: %[[ROW0:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT: %[[R0:.*]] = vector.reduction <add>, %[[ROW0]], %{{.*}} : vector<16xf32> into f32
-// CHECK: %[[ROW1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT: %[[R1:.*]] = vector.reduction <add>, %[[ROW1]], %{{.*}} : vector<16xf32> into f32
-// CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32>
-gpu.module @xevm_module{
-gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() {
- %c0 = arith.constant 0 : index
- %0 = "some_def"() : () -> !xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
- %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> (vector<32x16xf32>)
- %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>} dense<0.0> : vector<32xf32>
- %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>} [1]
- : vector<32x16xf32> to vector<32xf32>
- %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- : vector<32xf32> to vector<32x1xf32>
- xegpu.store_nd %3, %0[%c0, %c0] : vector<32x1xf32>, !xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
- gpu.return
-}
-}
-// -----
-// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<16x2xf32,
-// CHECK-REDUCTION-SAME: #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, f32, f32) {
-// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> vector<16x2xf32>
-// CHECK-REDUCTION-NEXT: %[[COL0:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK-REDUCTION-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32>
-// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction <add>, %[[CAST0]], %{{.*}} : vector<16xf32> into f32
-// CHECK-REDUCTION-NEXT: %[[COL1:.*]] = vector.extract_strided_slice %5 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK-REDUCTION-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32>
-// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction <add>, %[[CAST1]], %cst : vector<16xf32> into f32
-// CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<16x2xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, f32, f32
-// CHECK-REDUCTION-NEXT: }
-// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32>
-gpu.module @xevm_module{
-gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() {
- %c0 = arith.constant 0 : index
- %0 = "some_def"() : () -> !xegpu.tensor_desc<16x2xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
- %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> (vector<16x2xf32>)
- %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>} dense<0.0> : vector<2xf32>
- %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
- [0] : vector<16x2xf32> to vector<2xf32>
- %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- : vector<2xf32> to vector<1x2xf32>
- %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : vector<1x2xf32> to vector<16x2xf32>
- xegpu.store_nd %4, %0[%c0, %c0] : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+ %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %3 = xegpu.load_nd %2[%0, %1]
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+ %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
+ %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
+ -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
+ -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ %7 = xegpu.load_nd %5[%0, %arg3]
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+ %8 = xegpu.load_nd %6[%arg3, %1]
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+ %9 = xegpu.dpas %7, %8, %arg4
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+ scf.yield %9 : vector<8x16xf32>
+ } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>,
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
}
// -----
-// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) {
-// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
-// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @xevm_module{
- gpu.func @scatter_ops_chunksize(%src: memref<256xf16>) {
- %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
- %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
- } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
- xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}},
-// CHECK-SAME: %[[PREDICATE:.*]]: i1) {
-// CHECK: %[[DEFAULT:.*]] = arith.constant dense<1.200000e+01> : vector<8xf16>
-// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
-// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK: %[[PREDICATED_LOAD:.*]] = scf.if %[[PREDICATE]] -> (vector<8xf16>) {
-// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK-NEXT: scf.yield %[[LOADED]] : vector<8xf16>
-// CHECK-NEXT: } else {
-// CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16>
-// CHECK-NEXT: }
-// CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+// CHECK-LABEL: gpu.func @scatter_ops_scf_yield
+// CHECK: (%{{.*}}: memref<256xf16>, %[[PREDICATE:[a-zA-Z0-9]+]]: i1) {
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.200000e+01> : vector<1x8xf16>
+// CHECK-DAG: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
+// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
+// CHECK: %[[IF:.*]] = scf.if %[[PREDICATE]] -> (vector<1x8xf16>) {
+// CHECK-NEXT: %[[LD:.*]] = xegpu.load %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: %[[LD_CAST:.*]] = vector.shape_cast %[[LD]] : vector<8xf16> to vector<1x8xf16>
+// CHECK-NEXT: scf.yield %[[LD_CAST]] : vector<1x8xf16>
+// CHECK-NEXT: } else {
+// CHECK-NEXT: scf.yield %[[CST]] : vector<1x8xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[IF_CAST:.*]] = vector.shape_cast %[[IF]] : vector<1x8xf16> to vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[IF_CAST]], %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
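+// Note: each lane carries its distributed fragment through scf.if as vector<1x8xf16>
+// and shape_casts it back to vector<8xf16> before the scatter store.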
gpu.module @xevm_module{
gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) {
%1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
@@ -432,13 +153,15 @@ gpu.module @xevm_module{
// -----
// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) {
-// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
-// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1
-// CHECK: scf.if %[[PREDICATE]] {
-// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-// CHECK-NEXT: }
+// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
+// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
+// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1
+// CHECK: scf.if %[[PREDICATE]] {
+// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+// CHECK-NEXT: }
gpu.module @xevm_module{
gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) {
%pred = llvm.mlir.poison : i1
@@ -455,88 +178,13 @@ gpu.module @xevm_module{
}
// -----
-// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) {
-// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
-// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
-// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @xevm_module{
- gpu.func @scatter_ops(%src: memref<256xf16>) {
- %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
- %3 = xegpu.load %src[%offset], %1 {
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
- xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index(
-// CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index
-gpu.module @xevm_module{
- gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) {
- %c0 = arith.constant 0 : index
- %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf16>
- %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index
- %ptr_i64 = arith.index_cast %ptr : index to i64
- %tdesc = xegpu.create_nd_tdesc %ptr_i64, shape: [16], strides: [16] : i64
- -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- xegpu.store_nd %cst, %tdesc[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- gpu.return
- }
-}
-
-
-// -----
-// CHECK-LABEL: gpu.func @vector_transpose(
-// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32>
-// CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32>
-// CHECK: xegpu.store_nd %[[CST]], %[[DEST]][{{.*}}] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32>
-gpu.module @xevm_module{
- gpu.func @vector_transpose(%arg0: memref<2x16xf32>) {
- %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} dense<1.000000e+00>
- : vector<16x2xf32>
- %c0 = arith.constant 0 : index
- %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<16x2xf32> to vector<2x16xf32>
- %0 = xegpu.create_nd_tdesc %arg0 : memref<2x16xf32>
- -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %transpose, %0[%c0, %c0] : vector<2x16xf32>,
- !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @vector_bitcast(
-// CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16>
-// CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16>
-// CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16>
-// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]][{{.*}}] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16>
-gpu.module @xevm_module{
- gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) {
- %cst = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
- : () -> (vector<4x32xi8>)
- %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<4x32xi8> to vector<4x16xi16>
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xi16>
- -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %bitcast, %0[%c0, %c0] : vector<4x16xi16>,
- !xegpu.tensor_desc<4x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-// -----
// CHECK-LABEL: gpu.func @mma_transpose_b(
// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32>
// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32>
+// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array<i64: 1, 0>}>
+// CHECK-SAME: !xegpu.tensor_desc<16x8xi32> -> vector<8xi32>
// CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32>
// CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16>
// CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16>
diff --git a/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir b/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir
index 4ac4f02..0d559b6 100644
--- a/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
-
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope
llvm.func @tile_trivial_loop(%baseptr: !llvm.ptr, %tc: i32, %ts: i32) -> () {
@@ -15,87 +14,81 @@ llvm.func @tile_trivial_loop(%baseptr: !llvm.ptr, %tc: i32, %ts: i32) -> () {
}
-// CHECK: ; ModuleID = 'LLVMDialectModule'
-// CHECK-NEXT: source_filename = "LLVMDialectModule"
-// CHECK-EMPTY:
-// CHECK-NEXT: define void @tile_trivial_loop(ptr %0, i32 %1, i32 %2) {
-// CHECK-NEXT: br label %omp_omp.loop.preheader
-// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.preheader: ; preds = %3
-// CHECK-NEXT: %4 = udiv i32 %1, %2
-// CHECK-NEXT: %5 = urem i32 %1, %2
-// CHECK-NEXT: %6 = icmp ne i32 %5, 0
-// CHECK-NEXT: %7 = zext i1 %6 to i32
-// CHECK-NEXT: %omp_floor0.tripcount = add nuw i32 %4, %7
-// CHECK-NEXT: br label %omp_floor0.preheader
-// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.preheader: ; preds = %omp_omp.loop.preheader
-// CHECK-NEXT: br label %omp_floor0.header
+// CHECK-LABEL: define void @tile_trivial_loop(
+// CHECK-SAME: ptr %[[TMP0:.+]], i32 %[[TMP1:.+]], i32 %[[TMP2:.+]]) {
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.header: ; preds = %omp_floor0.inc, %omp_floor0.preheader
-// CHECK-NEXT: %omp_floor0.iv = phi i32 [ 0, %omp_floor0.preheader ], [ %omp_floor0.next, %omp_floor0.inc ]
-// CHECK-NEXT: br label %omp_floor0.cond
+// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]:
+// CHECK-NEXT: %[[TMP4:.+]] = udiv i32 %[[TMP1:.+]], %[[TMP2:.+]]
+// CHECK-NEXT: %[[TMP5:.+]] = urem i32 %[[TMP1:.+]], %[[TMP2:.+]]
+// CHECK-NEXT: %[[TMP6:.+]] = icmp ne i32 %[[TMP5:.+]], 0
+// CHECK-NEXT: %[[TMP7:.+]] = zext i1 %[[TMP6:.+]] to i32
+// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT:.+]] = add nuw i32 %[[TMP4:.+]], %[[TMP7:.+]]
+// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.cond: ; preds = %omp_floor0.header
-// CHECK-NEXT: %omp_floor0.cmp = icmp ult i32 %omp_floor0.iv, %omp_floor0.tripcount
-// CHECK-NEXT: br i1 %omp_floor0.cmp, label %omp_floor0.body, label %omp_floor0.exit
+// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.body: ; preds = %omp_floor0.cond
-// CHECK-NEXT: %8 = icmp eq i32 %omp_floor0.iv, %4
-// CHECK-NEXT: %9 = select i1 %8, i32 %5, i32 %2
-// CHECK-NEXT: br label %omp_tile0.preheader
+// CHECK-NEXT: [[OMP_FLOOR0_HEADER]]:
+// CHECK-NEXT: %[[OMP_FLOOR0_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR0_PREHEADER:.+]] ], [ %[[OMP_FLOOR0_NEXT:.+]], %[[OMP_FLOOR0_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_FLOOR0_COND:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.preheader: ; preds = %omp_floor0.body
-// CHECK-NEXT: br label %omp_tile0.header
+// CHECK-NEXT: [[OMP_FLOOR0_COND]]:
+// CHECK-NEXT: %[[OMP_FLOOR0_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR0_IV:.+]], %[[OMP_FLOOR0_TRIPCOUNT:.+]]
+// CHECK-NEXT: br i1 %[[OMP_FLOOR0_CMP:.+]], label %[[OMP_FLOOR0_BODY:.+]], label %[[OMP_FLOOR0_EXIT:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.header: ; preds = %omp_tile0.inc, %omp_tile0.preheader
-// CHECK-NEXT: %omp_tile0.iv = phi i32 [ 0, %omp_tile0.preheader ], [ %omp_tile0.next, %omp_tile0.inc ]
-// CHECK-NEXT: br label %omp_tile0.cond
+// CHECK-NEXT: [[OMP_FLOOR0_BODY]]:
+// CHECK-NEXT: %[[TMP8:.+]] = icmp eq i32 %[[OMP_FLOOR0_IV:.+]], %[[TMP4:.+]]
+// CHECK-NEXT: %[[TMP9:.+]] = select i1 %[[TMP8:.+]], i32 %[[TMP5:.+]], i32 %[[TMP2:.+]]
+// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.cond: ; preds = %omp_tile0.header
-// CHECK-NEXT: %omp_tile0.cmp = icmp ult i32 %omp_tile0.iv, %9
-// CHECK-NEXT: br i1 %omp_tile0.cmp, label %omp_tile0.body, label %omp_tile0.exit
+// CHECK-NEXT: [[OMP_TILE0_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.body: ; preds = %omp_tile0.cond
-// CHECK-NEXT: %10 = mul nuw i32 %2, %omp_floor0.iv
-// CHECK-NEXT: %11 = add nuw i32 %10, %omp_tile0.iv
-// CHECK-NEXT: br label %omp_omp.loop.body
+// CHECK-NEXT: [[OMP_TILE0_HEADER]]:
+// CHECK-NEXT: %[[OMP_TILE0_IV:.+]] = phi i32 [ 0, %[[OMP_TILE0_PREHEADER:.+]] ], [ %[[OMP_TILE0_NEXT:.+]], %[[OMP_TILE0_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_TILE0_COND:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.body: ; preds = %omp_tile0.body
-// CHECK-NEXT: br label %omp.loop.region
+// CHECK-NEXT: [[OMP_TILE0_COND]]:
+// CHECK-NEXT: %[[OMP_TILE0_CMP:.+]] = icmp ult i32 %[[OMP_TILE0_IV:.+]], %[[TMP9:.+]]
+// CHECK-NEXT: br i1 %[[OMP_TILE0_CMP:.+]], label %[[OMP_TILE0_BODY:.+]], label %[[OMP_TILE0_EXIT:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp.loop.region: ; preds = %omp_omp.loop.body
-// CHECK-NEXT: %12 = getelementptr inbounds float, ptr %0, i32 %11
-// CHECK-NEXT: store float 4.200000e+01, ptr %12, align 4
-// CHECK-NEXT: br label %omp.region.cont
+// CHECK-NEXT: [[OMP_TILE0_BODY]]:
+// CHECK-NEXT: %[[TMP10:.+]] = mul nuw i32 %[[TMP2:.+]], %[[OMP_FLOOR0_IV:.+]]
+// CHECK-NEXT: %[[TMP11:.+]] = add nuw i32 %[[TMP10:.+]], %[[OMP_TILE0_IV:.+]]
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_BODY:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp.region.cont: ; preds = %omp.loop.region
-// CHECK-NEXT: br label %omp_tile0.inc
+// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.inc: ; preds = %omp.region.cont
-// CHECK-NEXT: %omp_tile0.next = add nuw i32 %omp_tile0.iv, 1
-// CHECK-NEXT: br label %omp_tile0.header
+// CHECK-NEXT: [[OMP_LOOP_REGION]]:
+// CHECK-NEXT: %[[TMP12:.+]] = getelementptr inbounds float, ptr %[[TMP0:.+]], i32 %[[TMP11:.+]]
+// CHECK-NEXT: store float 4.200000e+01, ptr %[[TMP12:.+]], align 4
+// CHECK-NEXT: br label %[[OMP_REGION_CONT:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.exit: ; preds = %omp_tile0.cond
-// CHECK-NEXT: br label %omp_tile0.after
+// CHECK-NEXT: [[OMP_REGION_CONT]]:
+// CHECK-NEXT: br label %[[OMP_TILE0_INC:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.after: ; preds = %omp_tile0.exit
-// CHECK-NEXT: br label %omp_floor0.inc
+// CHECK-NEXT: [[OMP_TILE0_INC]]:
+// CHECK-NEXT: %[[OMP_TILE0_NEXT:.+]] = add nuw i32 %[[OMP_TILE0_IV:.+]], 1
+// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.inc: ; preds = %omp_tile0.after
-// CHECK-NEXT: %omp_floor0.next = add nuw i32 %omp_floor0.iv, 1
-// CHECK-NEXT: br label %omp_floor0.header
+// CHECK-NEXT: [[OMP_TILE0_EXIT]]:
+// CHECK-NEXT: br label %[[OMP_TILE0_AFTER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.exit: ; preds = %omp_floor0.cond
-// CHECK-NEXT: br label %omp_floor0.after
+// CHECK-NEXT: [[OMP_TILE0_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_FLOOR0_INC:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.after: ; preds = %omp_floor0.exit
-// CHECK-NEXT: br label %omp_omp.loop.after
+// CHECK-NEXT: [[OMP_FLOOR0_INC]]:
+// CHECK-NEXT: %[[OMP_FLOOR0_NEXT:.+]] = add nuw i32 %[[OMP_FLOOR0_IV:.+]], 1
+// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.after: ; preds = %omp_floor0.after
-// CHECK-NEXT: ret void
-// CHECK-NEXT: }
+// CHECK-NEXT: [[OMP_FLOOR0_EXIT]]:
+// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: !llvm.module.flags = !{!0}
+// CHECK-NEXT: [[OMP_FLOOR0_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: !0 = !{i32 2, !"Debug Info Version", i32 3}
+// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]:
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
diff --git a/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir b/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir
index 6fad81c..22c2973 100644
--- a/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope
llvm.func @tile_2d_loop(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32, %ts1: i32, %ts2: i32) -> () {
@@ -19,172 +19,166 @@ llvm.func @tile_2d_loop(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32, %ts1: i32, %t
}
-// CHECK: ; ModuleID = 'LLVMDialectModule'
-// CHECK-NEXT: source_filename = "LLVMDialectModule"
+// CHECK-LABEL: define void @tile_2d_loop(
+// CHECK-SAME: ptr %[[TMP0:.+]], i32 %[[TMP1:.+]], i32 %[[TMP2:.+]], i32 %[[TMP3:.+]], i32 %[[TMP4:.+]]) {
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]:
+// CHECK-NEXT: %[[TMP6:.+]] = udiv i32 %[[TMP1:.+]], %[[TMP3:.+]]
+// CHECK-NEXT: %[[TMP7:.+]] = urem i32 %[[TMP1:.+]], %[[TMP3:.+]]
+// CHECK-NEXT: %[[TMP8:.+]] = icmp ne i32 %[[TMP7:.+]], 0
+// CHECK-NEXT: %[[TMP9:.+]] = zext i1 %[[TMP8:.+]] to i32
+// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT:.+]] = add nuw i32 %[[TMP6:.+]], %[[TMP9:.+]]
+// CHECK-NEXT: %[[TMP10:.+]] = udiv i32 %[[TMP2:.+]], %[[TMP4:.+]]
+// CHECK-NEXT: %[[TMP11:.+]] = urem i32 %[[TMP2:.+]], %[[TMP4:.+]]
+// CHECK-NEXT: %[[TMP12:.+]] = icmp ne i32 %[[TMP11:.+]], 0
+// CHECK-NEXT: %[[TMP13:.+]] = zext i1 %[[TMP12:.+]] to i32
+// CHECK-NEXT: %[[OMP_FLOOR1_TRIPCOUNT:.+]] = add nuw i32 %[[TMP10:.+]], %[[TMP13:.+]]
+// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_HEADER:.+]]:
+// CHECK-NEXT: %[[OMP_OMP_LOOP_IV:.+]] = phi i32 [ %[[OMP_OMP_LOOP_NEXT:.+]], %[[OMP_OMP_LOOP_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_COND]]:
+// CHECK-NEXT: %[[OMP_OMP_LOOP_CMP:.+]] = icmp ult i32 %[[TMP19:.+]], %[[TMP1:.+]]
+// CHECK-NEXT: br i1 %[[OMP_OMP_LOOP_CMP:.+]], label %[[OMP_OMP_LOOP_BODY:.+]], label %[[OMP_OMP_LOOP_EXIT:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_REGION]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER1:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: define void @tile_2d_loop(ptr %0, i32 %1, i32 %2, i32 %3, i32 %4) {
-// CHECK-NEXT: br label %omp_omp.loop.preheader
-// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.preheader: ; preds = %5
-// CHECK-NEXT: %6 = udiv i32 %1, %3
-// CHECK-NEXT: %7 = urem i32 %1, %3
-// CHECK-NEXT: %8 = icmp ne i32 %7, 0
-// CHECK-NEXT: %9 = zext i1 %8 to i32
-// CHECK-NEXT: %omp_floor0.tripcount = add nuw i32 %6, %9
-// CHECK-NEXT: %10 = udiv i32 %2, %4
-// CHECK-NEXT: %11 = urem i32 %2, %4
-// CHECK-NEXT: %12 = icmp ne i32 %11, 0
-// CHECK-NEXT: %13 = zext i1 %12 to i32
-// CHECK-NEXT: %omp_floor1.tripcount = add nuw i32 %10, %13
-// CHECK-NEXT: br label %omp_floor0.preheader
+// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER1]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_BODY4:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.header: ; preds = %omp_omp.loop.inc
-// CHECK-NEXT: %omp_omp.loop.iv = phi i32 [ %omp_omp.loop.next, %omp_omp.loop.inc ]
-// CHECK-NEXT: br label %omp_omp.loop.cond
+// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.cond: ; preds = %omp_omp.loop.header
-// CHECK-NEXT: %omp_omp.loop.cmp = icmp ult i32 %19, %1
-// CHECK-NEXT: br i1 %omp_omp.loop.cmp, label %omp_omp.loop.body, label %omp_omp.loop.exit
+// CHECK-NEXT: [[OMP_FLOOR0_HEADER]]:
+// CHECK-NEXT: %[[OMP_FLOOR0_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR0_PREHEADER:.+]] ], [ %[[OMP_FLOOR0_NEXT:.+]], %[[OMP_FLOOR0_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_FLOOR0_COND:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.body: ; preds = %omp_tile1.body, %omp_omp.loop.cond
-// CHECK-NEXT: br label %omp.loop.region
+// CHECK-NEXT: [[OMP_FLOOR0_COND]]:
+// CHECK-NEXT: %[[OMP_FLOOR0_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR0_IV:.+]], %[[OMP_FLOOR0_TRIPCOUNT:.+]]
+// CHECK-NEXT: br i1 %[[OMP_FLOOR0_CMP:.+]], label %[[OMP_FLOOR0_BODY:.+]], label %[[OMP_FLOOR0_EXIT:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp.loop.region: ; preds = %omp_omp.loop.body
-// CHECK-NEXT: br label %omp_omp.loop.preheader1
+// CHECK-NEXT: [[OMP_FLOOR0_BODY]]:
+// CHECK-NEXT: br label %[[OMP_FLOOR1_PREHEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.preheader1: ; preds = %omp.loop.region
-// CHECK-NEXT: br label %omp_omp.loop.body4
+// CHECK-NEXT: [[OMP_FLOOR1_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_FLOOR1_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.preheader: ; preds = %omp_omp.loop.preheader
-// CHECK-NEXT: br label %omp_floor0.header
+// CHECK-NEXT: [[OMP_FLOOR1_HEADER]]:
+// CHECK-NEXT: %[[OMP_FLOOR1_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR1_PREHEADER:.+]] ], [ %[[OMP_FLOOR1_NEXT:.+]], %[[OMP_FLOOR1_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_FLOOR1_COND:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.header: ; preds = %omp_floor0.inc, %omp_floor0.preheader
-// CHECK-NEXT: %omp_floor0.iv = phi i32 [ 0, %omp_floor0.preheader ], [ %omp_floor0.next, %omp_floor0.inc ]
-// CHECK-NEXT: br label %omp_floor0.cond
+// CHECK-NEXT: [[OMP_FLOOR1_COND]]:
+// CHECK-NEXT: %[[OMP_FLOOR1_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR1_IV:.+]], %[[OMP_FLOOR1_TRIPCOUNT:.+]]
+// CHECK-NEXT: br i1 %[[OMP_FLOOR1_CMP:.+]], label %[[OMP_FLOOR1_BODY:.+]], label %[[OMP_FLOOR1_EXIT:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.cond: ; preds = %omp_floor0.header
-// CHECK-NEXT: %omp_floor0.cmp = icmp ult i32 %omp_floor0.iv, %omp_floor0.tripcount
-// CHECK-NEXT: br i1 %omp_floor0.cmp, label %omp_floor0.body, label %omp_floor0.exit
+// CHECK-NEXT: [[OMP_FLOOR1_BODY]]:
+// CHECK-NEXT: %[[TMP14:.+]] = icmp eq i32 %[[OMP_FLOOR0_IV:.+]], %[[TMP6:.+]]
+// CHECK-NEXT: %[[TMP15:.+]] = select i1 %[[TMP14:.+]], i32 %[[TMP7:.+]], i32 %[[TMP3:.+]]
+// CHECK-NEXT: %[[TMP16:.+]] = icmp eq i32 %[[OMP_FLOOR1_IV:.+]], %[[TMP10:.+]]
+// CHECK-NEXT: %[[TMP17:.+]] = select i1 %[[TMP16:.+]], i32 %[[TMP11:.+]], i32 %[[TMP4:.+]]
+// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.body: ; preds = %omp_floor0.cond
-// CHECK-NEXT: br label %omp_floor1.preheader
+// CHECK-NEXT: [[OMP_TILE0_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor1.preheader: ; preds = %omp_floor0.body
-// CHECK-NEXT: br label %omp_floor1.header
+// CHECK-NEXT: [[OMP_TILE0_HEADER]]:
+// CHECK-NEXT: %[[OMP_TILE0_IV:.+]] = phi i32 [ 0, %[[OMP_TILE0_PREHEADER:.+]] ], [ %[[OMP_TILE0_NEXT:.+]], %[[OMP_TILE0_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_TILE0_COND:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor1.header: ; preds = %omp_floor1.inc, %omp_floor1.preheader
-// CHECK-NEXT: %omp_floor1.iv = phi i32 [ 0, %omp_floor1.preheader ], [ %omp_floor1.next, %omp_floor1.inc ]
-// CHECK-NEXT: br label %omp_floor1.cond
+// CHECK-NEXT: [[OMP_TILE0_COND]]:
+// CHECK-NEXT: %[[OMP_TILE0_CMP:.+]] = icmp ult i32 %[[OMP_TILE0_IV:.+]], %[[TMP15:.+]]
+// CHECK-NEXT: br i1 %[[OMP_TILE0_CMP:.+]], label %[[OMP_TILE0_BODY:.+]], label %[[OMP_TILE0_EXIT:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor1.cond: ; preds = %omp_floor1.header
-// CHECK-NEXT: %omp_floor1.cmp = icmp ult i32 %omp_floor1.iv, %omp_floor1.tripcount
-// CHECK-NEXT: br i1 %omp_floor1.cmp, label %omp_floor1.body, label %omp_floor1.exit
+// CHECK-NEXT: [[OMP_TILE0_BODY]]:
+// CHECK-NEXT: br label %[[OMP_TILE1_PREHEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor1.body: ; preds = %omp_floor1.cond
-// CHECK-NEXT: %14 = icmp eq i32 %omp_floor0.iv, %6
-// CHECK-NEXT: %15 = select i1 %14, i32 %7, i32 %3
-// CHECK-NEXT: %16 = icmp eq i32 %omp_floor1.iv, %10
-// CHECK-NEXT: %17 = select i1 %16, i32 %11, i32 %4
-// CHECK-NEXT: br label %omp_tile0.preheader
+// CHECK-NEXT: [[OMP_TILE1_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_TILE1_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.preheader: ; preds = %omp_floor1.body
-// CHECK-NEXT: br label %omp_tile0.header
+// CHECK-NEXT: [[OMP_TILE1_HEADER]]:
+// CHECK-NEXT: %[[OMP_TILE1_IV:.+]] = phi i32 [ 0, %[[OMP_TILE1_PREHEADER:.+]] ], [ %[[OMP_TILE1_NEXT:.+]], %[[OMP_TILE1_INC:.+]] ]
+// CHECK-NEXT: br label %[[OMP_TILE1_COND:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.header: ; preds = %omp_tile0.inc, %omp_tile0.preheader
-// CHECK-NEXT: %omp_tile0.iv = phi i32 [ 0, %omp_tile0.preheader ], [ %omp_tile0.next, %omp_tile0.inc ]
-// CHECK-NEXT: br label %omp_tile0.cond
+// CHECK-NEXT: [[OMP_TILE1_COND]]:
+// CHECK-NEXT: %[[OMP_TILE1_CMP:.+]] = icmp ult i32 %[[OMP_TILE1_IV:.+]], %[[TMP17:.+]]
+// CHECK-NEXT: br i1 %[[OMP_TILE1_CMP:.+]], label %[[OMP_TILE1_BODY:.+]], label %[[OMP_TILE1_EXIT:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.cond: ; preds = %omp_tile0.header
-// CHECK-NEXT: %omp_tile0.cmp = icmp ult i32 %omp_tile0.iv, %15
-// CHECK-NEXT: br i1 %omp_tile0.cmp, label %omp_tile0.body, label %omp_tile0.exit
+// CHECK-NEXT: [[OMP_TILE1_BODY]]:
+// CHECK-NEXT: %[[TMP18:.+]] = mul nuw i32 %[[TMP3:.+]], %[[OMP_FLOOR0_IV:.+]]
+// CHECK-NEXT: %[[TMP19:.+]] = add nuw i32 %[[TMP18:.+]], %[[OMP_TILE0_IV:.+]]
+// CHECK-NEXT: %[[TMP20:.+]] = mul nuw i32 %[[TMP4:.+]], %[[OMP_FLOOR1_IV:.+]]
+// CHECK-NEXT: %[[TMP21:.+]] = add nuw i32 %[[TMP20:.+]], %[[OMP_TILE1_IV:.+]]
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_BODY:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.body: ; preds = %omp_tile0.cond
-// CHECK-NEXT: br label %omp_tile1.preheader
+// CHECK-NEXT: [[OMP_OMP_LOOP_BODY4]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_REGION12:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile1.preheader: ; preds = %omp_tile0.body
-// CHECK-NEXT: br label %omp_tile1.header
+// CHECK-NEXT: [[OMP_LOOP_REGION12]]:
+// CHECK-NEXT: %[[TMP22:.+]] = add i32 %[[TMP19:.+]], %[[TMP21:.+]]
+// CHECK-NEXT: %[[TMP23:.+]] = getelementptr inbounds float, ptr %[[TMP0:.+]], i32 %[[TMP22:.+]]
+// CHECK-NEXT: store float 4.200000e+01, ptr %[[TMP23:.+]], align 4
+// CHECK-NEXT: br label %[[OMP_REGION_CONT11:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile1.header: ; preds = %omp_tile1.inc, %omp_tile1.preheader
-// CHECK-NEXT: %omp_tile1.iv = phi i32 [ 0, %omp_tile1.preheader ], [ %omp_tile1.next, %omp_tile1.inc ]
-// CHECK-NEXT: br label %omp_tile1.cond
+// CHECK-NEXT: [[OMP_REGION_CONT11]]:
+// CHECK-NEXT: br label %[[OMP_TILE1_INC:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile1.cond: ; preds = %omp_tile1.header
-// CHECK-NEXT: %omp_tile1.cmp = icmp ult i32 %omp_tile1.iv, %17
-// CHECK-NEXT: br i1 %omp_tile1.cmp, label %omp_tile1.body, label %omp_tile1.exit
+// CHECK-NEXT: [[OMP_TILE1_INC]]:
+// CHECK-NEXT: %[[OMP_TILE1_NEXT:.+]] = add nuw i32 %[[OMP_TILE1_IV:.+]], 1
+// CHECK-NEXT: br label %[[OMP_TILE1_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile1.body: ; preds = %omp_tile1.cond
-// CHECK-NEXT: %18 = mul nuw i32 %3, %omp_floor0.iv
-// CHECK-NEXT: %19 = add nuw i32 %18, %omp_tile0.iv
-// CHECK-NEXT: %20 = mul nuw i32 %4, %omp_floor1.iv
-// CHECK-NEXT: %21 = add nuw i32 %20, %omp_tile1.iv
-// CHECK-NEXT: br label %omp_omp.loop.body
+// CHECK-NEXT: [[OMP_TILE1_EXIT]]:
+// CHECK-NEXT: br label %[[OMP_TILE1_AFTER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.body4: ; preds = %omp_omp.loop.preheader1
-// CHECK-NEXT: br label %omp.loop.region12
+// CHECK-NEXT: [[OMP_TILE1_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_TILE0_INC:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp.loop.region12: ; preds = %omp_omp.loop.body4
-// CHECK-NEXT: %22 = add i32 %19, %21
-// CHECK-NEXT: %23 = getelementptr inbounds float, ptr %0, i32 %22
-// CHECK-NEXT: store float 4.200000e+01, ptr %23, align 4
-// CHECK-NEXT: br label %omp.region.cont11
+// CHECK-NEXT: [[OMP_TILE0_INC]]:
+// CHECK-NEXT: %[[OMP_TILE0_NEXT:.+]] = add nuw i32 %[[OMP_TILE0_IV:.+]], 1
+// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp.region.cont11: ; preds = %omp.loop.region12
-// CHECK-NEXT: br label %omp_tile1.inc
+// CHECK-NEXT: [[OMP_TILE0_EXIT]]:
+// CHECK-NEXT: br label %[[OMP_TILE0_AFTER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile1.inc: ; preds = %omp.region.cont11
-// CHECK-NEXT: %omp_tile1.next = add nuw i32 %omp_tile1.iv, 1
-// CHECK-NEXT: br label %omp_tile1.header
+// CHECK-NEXT: [[OMP_TILE0_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_FLOOR1_INC:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile1.exit: ; preds = %omp_tile1.cond
-// CHECK-NEXT: br label %omp_tile1.after
+// CHECK-NEXT: [[OMP_FLOOR1_INC]]:
+// CHECK-NEXT: %[[OMP_FLOOR1_NEXT:.+]] = add nuw i32 %[[OMP_FLOOR1_IV:.+]], 1
+// CHECK-NEXT: br label %[[OMP_FLOOR1_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile1.after: ; preds = %omp_tile1.exit
-// CHECK-NEXT: br label %omp_tile0.inc
+// CHECK-NEXT: [[OMP_FLOOR1_EXIT]]:
+// CHECK-NEXT: br label %[[OMP_FLOOR1_AFTER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.inc: ; preds = %omp_tile1.after
-// CHECK-NEXT: %omp_tile0.next = add nuw i32 %omp_tile0.iv, 1
-// CHECK-NEXT: br label %omp_tile0.header
+// CHECK-NEXT: [[OMP_FLOOR1_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_FLOOR0_INC:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.exit: ; preds = %omp_tile0.cond
-// CHECK-NEXT: br label %omp_tile0.after
+// CHECK-NEXT: [[OMP_FLOOR0_INC]]:
+// CHECK-NEXT: %[[OMP_FLOOR0_NEXT:.+]] = add nuw i32 %[[OMP_FLOOR0_IV:.+]], 1
+// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_tile0.after: ; preds = %omp_tile0.exit
-// CHECK-NEXT: br label %omp_floor1.inc
+// CHECK-NEXT: [[OMP_FLOOR0_EXIT]]:
+// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor1.inc: ; preds = %omp_tile0.after
-// CHECK-NEXT: %omp_floor1.next = add nuw i32 %omp_floor1.iv, 1
-// CHECK-NEXT: br label %omp_floor1.header
+// CHECK-NEXT: [[OMP_FLOOR0_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor1.exit: ; preds = %omp_floor1.cond
-// CHECK-NEXT: br label %omp_floor1.after
+// CHECK-NEXT: [[OMP_REGION_CONT:.+]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_INC:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor1.after: ; preds = %omp_floor1.exit
-// CHECK-NEXT: br label %omp_floor0.inc
+// CHECK-NEXT: [[OMP_OMP_LOOP_INC]]:
+// CHECK-NEXT: %[[OMP_OMP_LOOP_NEXT:.+]] = add nuw i32 %[[TMP19:.+]], 1
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_HEADER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.inc: ; preds = %omp_floor1.after
-// CHECK-NEXT: %omp_floor0.next = add nuw i32 %omp_floor0.iv, 1
-// CHECK-NEXT: br label %omp_floor0.header
+// CHECK-NEXT: [[OMP_OMP_LOOP_EXIT]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]]
// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.exit: ; preds = %omp_floor0.cond
-// CHECK-NEXT: br label %omp_floor0.after
-// CHECK-EMPTY:
-// CHECK-NEXT: omp_floor0.after: ; preds = %omp_floor0.exit
-// CHECK-NEXT: br label %omp_omp.loop.after
-// CHECK-EMPTY:
-// CHECK-NEXT: omp.region.cont: ; No predecessors!
-// CHECK-NEXT: br label %omp_omp.loop.inc
-// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.inc: ; preds = %omp.region.cont
-// CHECK-NEXT: %omp_omp.loop.next = add nuw i32 %19, 1
-// CHECK-NEXT: br label %omp_omp.loop.header
-// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.exit: ; preds = %omp_omp.loop.cond
-// CHECK-NEXT: br label %omp_omp.loop.after
-// CHECK-EMPTY:
-// CHECK-NEXT: omp_omp.loop.after: ; preds = %omp_floor0.after, %omp_omp.loop.exit
-// CHECK-NEXT: ret void
-// CHECK-NEXT: }
-// CHECK-EMPTY:
-// CHECK-NEXT: !llvm.module.flags = !{!0}
-// CHECK-EMPTY:
-// CHECK-NEXT: !0 = !{i32 2, !"Debug Info Version", i32 3}
+// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]:
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index e51cac4..6ba7a00 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -218,6 +218,35 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
}
};
+struct TestXeGPUSGDistribute
+ : public PassWrapper<TestXeGPUSGDistribute,
+ OperationPass<gpu::GPUModuleOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUSGDistribute)
+
+ StringRef getArgument() const final { return "test-xegpu-sg-distribute"; }
+
+ StringRef getDescription() const final {
+ return "Test the implementation of XeGPU Subgroup Distribution";
+ }
+
+ void getDependentDialects(::mlir::DialectRegistry &registry) const override {
+ registry.insert<arith::ArithDialect>();
+ registry.insert<memref::MemRefDialect>();
+ registry.insert<xegpu::XeGPUDialect>();
+ registry.insert<vector::VectorDialect>();
+ registry.insert<index::IndexDialect>();
+ }
+
+ TestXeGPUSGDistribute() = default;
+ TestXeGPUSGDistribute(const TestXeGPUSGDistribute &pass) = default;
+
+ void runOnOperation() override {
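+ // Collect the XeGPU subgroup-distribution rewrite patterns and apply
+ // them greedily over the whole GPU module.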
+ RewritePatternSet patterns(&getContext());
+ xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+ }
+};
+
struct TestXeGPULayoutInterface
: public PassWrapper<TestXeGPULayoutInterface,
OperationPass<gpu::GPUModuleOp>> {
@@ -282,6 +311,7 @@ namespace test {
void registerTestXeGPULowerings() {
PassRegistration<TestXeGPUUnrollingPatterns>();
PassRegistration<TestXeGPULayoutInterface>();
+ PassRegistration<TestXeGPUSGDistribute>();
}
} // namespace test
} // namespace mlir
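
For context, the test pass registered above is exposed through the usual test-pass registration hook, so any mlir-opt-style driver that calls registerTestXeGPULowerings() gains the -test-xegpu-sg-distribute option. Below is a minimal sketch of such a driver, assuming the standard MlirOptMain entry point; the tool name string and file name are illustrative, not part of the patch.

    // xegpu-test-driver.cpp -- hypothetical standalone driver; the upstream
    // mlir-opt binary reaches the same registration via its test hooks.
    #include "mlir/IR/DialectRegistry.h"
    #include "mlir/InitAllDialects.h"
    #include "mlir/Tools/mlir-opt/MlirOptMain.h"

    namespace mlir {
    namespace test {
    void registerTestXeGPULowerings(); // defined in TestXeGPUTransforms.cpp
    } // namespace test
    } // namespace mlir

    int main(int argc, char **argv) {
      mlir::DialectRegistry registry;
      // Register all upstream dialects so the test IR parses.
      mlir::registerAllDialects(registry);
      // Make -test-xegpu-sg-distribute (and the other XeGPU test passes)
      // available on the command line.
      mlir::test::registerTestXeGPULowerings();
      return mlir::asMainReturnCode(
          mlir::MlirOptMain(argc, argv, "XeGPU test driver\n", registry));
    }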