Diffstat (limited to 'mlir/test')
-rw-r--r-- | mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir     | 172
-rw-r--r-- | mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir   |   2
-rw-r--r-- | mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir | 128
-rw-r--r-- | mlir/test/Dialect/XeGPU/propagate-layout.mlir           |  82
-rw-r--r-- | mlir/test/mlir-runner/memref-reshape.mlir               |   8
-rw-r--r-- | mlir/test/mlir-tblgen/op-properties.td                  |  14
6 files changed, 281 insertions, 125 deletions
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir index 87a31ca..1adc418 100644 --- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir @@ -8,11 +8,11 @@ // RUN: mlir-opt --allow-unregistered-dialect \ // RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \ -// RUN: | FileCheck %s --check-prefix=CHECK-GFX9 +// RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX9 // RUN: mlir-opt --allow-unregistered-dialect \ // RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx1030" %s \ -// RUN: | FileCheck %s --check-prefix=CHECK-GFX10 +// RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX10 // CHECK-SUB: gpu.module @kernels { // CHECK-SHFL: gpu.module @kernels { @@ -24,8 +24,7 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>) // // CHECK-SHFL-LABEL: gpu.func @kernel0( - // CHECK-GFX9-LABEL: gpu.func @kernel0( - // CHECK-GFX10-LABEL: gpu.func @kernel0( + // CHECK-GFX-LABEL: gpu.func @kernel0( gpu.func @kernel0(%arg0: vector<5xf16>) kernel { // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16> // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16> @@ -56,8 +55,7 @@ gpu.module @kernels { // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4) // CHECK-SUB: "test.consume" - // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} - // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} + // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} %sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum2) : (vector<5xf16>) -> () @@ -74,8 +72,7 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>) // // CHECK-SHFL-LABEL: gpu.func @kernel1( - // CHECK-GFX9-LABEL: gpu.func @kernel1( - // CHECK-GFX10-LABEL: gpu.func @kernel1( + // CHECK-GFX-LABEL: gpu.func @kernel1( gpu.func @kernel1(%arg0: vector<1xf32>) kernel { // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32> // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32 @@ -100,17 +97,14 @@ gpu.module @kernels { // Note stride is dropped because it is == 1. 
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32 // CHECK-SUB: "test.consume" - // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} quad_perm - // CHECK-GFX9: amdgpu.dpp {{.+}} row_half_mirror - // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} quad_perm - // CHECK-GFX10: amdgpu.dpp {{.+}} row_half_mirror + // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} quad_perm + // CHECK-GFX: amdgpu.dpp {{.+}} row_half_mirror %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum2) : (vector<1xf32>) -> () // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32 // CHECK-SUB: "test.consume" - // CHECK-GFX9-NOT: amdgpu.dpp - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-NOT: amdgpu.dpp // CHECK-GFX10-NOT: rocdl.permlanex16 %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum3) : (vector<1xf32>) -> () @@ -126,11 +120,8 @@ gpu.module @kernels { // // CHECK-SHFL-LABEL: gpu.func @kernel2( // - // CHECK-GFX9-LABEL: gpu.func @kernel2( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel2( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel2( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel { // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8> // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> () @@ -148,8 +139,7 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) - // CHECK-GFX9-LABEL: gpu.func @kernel3( - // CHECK-GFX10-LABEL: gpu.func @kernel3( + // CHECK-GFX-LABEL: gpu.func @kernel3( gpu.func @kernel3(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -169,9 +159,9 @@ gpu.module @kernels { // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32 // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32 // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> () - + // CHECK-GFX9-COUNT-6: amdgpu.dpp - + // CHECK-GFX10-COUNT-4: amdgpu.dpp // CHECK-GFX10: rocdl.permlanex16 // CHECK-GFX10-COUNT-2: rocdl.readlane @@ -185,11 +175,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) // - // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered( - // CHECK-GFX9-SAME: %[[ARG0:.+]]: i32) - // - // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered( - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i32) + // CHECK-GFX-LABEL: gpu.func @kernel3_clustered( + // CHECK-GFX-SAME: %[[ARG0:.+]]: i32) gpu.func @kernel3_clustered(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -204,19 +191,13 @@ gpu.module @kernels { // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> () - // CHECK-GFX9: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX9: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 - // CHECK-GFX9: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX9: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 - // CHECK-GFX9: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : 
i32 - // CHECK-GFX9: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 - - // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 - // CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 - // CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 + // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 + // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-GFX10: "test.consume"(%[[A2]]) : (i32) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32 "test.consume"(%sum0) : (i32) -> () @@ -228,11 +209,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) // - // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered_strided( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered_strided( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel3_clustered_strided( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel3_clustered_strided(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32 @@ -256,11 +234,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) // - // CHECK-GFX9-LABEL: gpu.func @kernel4( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel4( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel4( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel4(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -298,11 +273,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) // - // CHECK-GFX9-LABEL: gpu.func @kernel4_clustered( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel4_clustered( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel4_clustered( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -319,10 +291,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) // - // CHECK-GFX9-LABEL: gpu.func @kernel5( - // - // CHECK-GFX10-LABEL: gpu.func @kernel5( - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16) + // CHECK-GFX-LABEL: gpu.func @kernel5( + // CHECK-GFX-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} 
: i32 @@ -334,7 +304,7 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - + // CHECK-GFX9-COUNT-6: amdgpu.dpp // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 @@ -361,11 +331,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) // - // CHECK-GFX9-LABEL: gpu.func @kernel5_clustered - // CHECK-GFX9-SAME: %[[ARG0:.+]]: i16) - // - // CHECK-GFX10-LABEL: gpu.func @kernel5_clustered - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16) + // CHECK-GFX-LABEL: gpu.func @kernel5_clustered + // CHECK-GFX-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5_clustered(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -378,25 +345,15 @@ gpu.module @kernels { // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - // CHECK-GFX9: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 - // CHECK-GFX9: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 - // CHECK-GFX9: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 - // CHECK-GFX9: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 - // CHECK-GFX9: "test.consume"(%[[VAR7]]) : (i16) -> () - - // CHECK-GFX10: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 - // CHECK-GFX10: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 - // CHECK-GFX10: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 - // CHECK-GFX10: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 - // CHECK-GFX10: "test.consume"(%[[VAR7]]) : (i16) -> () + // CHECK-GFX: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 + // CHECK-GFX: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 + // CHECK-GFX: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 + // CHECK-GFX: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 
+ // CHECK-GFX: "test.consume"(%[[VAR7]]) : (i16) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () @@ -407,11 +364,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel6( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) // - // CHECK-GFX9-LABEL: gpu.func @kernel6( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel6( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel6( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel6(%arg0: vector<3xi8>) kernel { // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8> // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8> @@ -433,6 +387,44 @@ gpu.module @kernels { gpu.return } + // CHECK-GFX-LABEL: gpu.func @kernel7( + // CHECK-GFX-SAME: %[[ARG0:.+]]: f32) + // + // Checks, common to gfx942 and gfx1030, of + // (1) quad_perm, followed by reduction resulting in reduction over 2 consecutive lanes, + // (2) quad_perm, followed by reduction resulting in reduction over 4 consecutive lanes, + // (3) row_half_mirror, followed by reduction resulting in reduction over 8 consecutive lanes, and + // (4) row_mirror, followed by reduction resulting in reduction over 16 consecutive lanes. + // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A0:.+]] = arith.addf %[[ARG0]], %[[D0]] : f32 + // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A1:.+]] = arith.addf %[[A0]], %[[D1]] : f32 + // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A2:.+]] = arith.addf %[[A1]], %[[D2]] : f32 + // CHECK-GFX: %[[D3:.+]] = amdgpu.dpp %[[A2]] %[[A2]] row_mirror(unit) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A3:.+]] = arith.addf %[[A2]], %[[D3]] : f32 + // + // Now, on gfx942: + // (1) Lane 15 gets broadcast to lanes [16, 32) and lane 31 gets broadcast to lanes [48, 64], after which + // the reduction in lanes [16, 32) is over the full cluster of the first 32 lanes, and the reduction in lanes + // [48, 64) is over the full cluster of the last 32 lanes. + // (2) Update the reduction value in lanes [0, 16) and [32, 48) with the final reduction result from + // lanes [16, 32) and [48, 64), respectively. 
+ // CHECK-GFX9: %[[BCAST15:.+]] = amdgpu.dpp %[[A3]] %[[A3]] row_bcast_15(unit) {row_mask = 10 : i32} : f32 + // CHECK-GFX9: %[[SUM:.+]] = arith.addf %[[A3]], %[[BCAST15]] : f32 + // CHECK-GFX9: %[[SWIZ:.+]] = amdgpu.swizzle_bitmode %[[SUM]] 0 31 0 : f32 + // CHECK-GFX9: "test.consume"(%[[SWIZ]]) : (f32) -> () + // + // On gfx1030, the final step is to permute the lanes and perform final reduction: + // CHECK-GFX10: rocdl.permlanex16 + // CHECK-GFX10: arith.addf + // CHECK-GFX10: "test.consume" + gpu.func @kernel7(%arg0: f32) kernel { + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (f32) -> (f32) + "test.consume"(%sum0) : (f32) -> () + gpu.return + } + // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) // diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir index d289d73..2780212 100644 --- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir +++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s gpu.module @test { gpu.func @empty() { diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir new file mode 100644 index 0000000..58461b8 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -0,0 +1,128 @@ +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s + +// CHECK-LABEL: func.func @dpas_f16( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = 
[1, 1]>> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> +gpu.module @test { + +func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + return +} +} + +// ----- +gpu.module @test_kernel { + gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %m = arith.muli %block_id_x, %c32 : index + + %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + + %out:3 = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) + -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { + //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> + %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> + + //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x32xf16> + %c = arith.addf %a, %b : vector<16x32xf16> + + //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>> + xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16> + + //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> + %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc + : !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16> + } + gpu.return + } +} + +// ----- +gpu.module @test_kernel { + gpu.func @elementwise_with_inst_data_12(%A: memref<1024x1024xf16>, %B: 
memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %m = arith.muli %block_id_x, %c32 : index + + %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + + %out:3 = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) + -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { + //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : + //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<12x32xf16> + %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> + %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> + + //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<12x32xf16> + %c = arith.addf %a, %b : vector<12x32xf16> + + //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>>> + xegpu.store_nd %c, %arg2: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16> + + //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>> + %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc + : !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16> + } + gpu.return + } +} + +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_chunksize( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> +// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8], lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_chunksize(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> + : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> + xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> + : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} 
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 30f785d..543e119 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -1,5 +1,6 @@ -// RUN: mlir-opt -xegpu-propagate-layout -split-input-file %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout -split-input-file %s | FileCheck %s +gpu.module @test { // CHECK-LABEL: func.func @dpas_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32> @@ -25,8 +26,10 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { // CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], @@ -37,8 +40,10 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre xegpu.store_nd %0, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : @@ -55,8 +60,10 @@ func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_transpose( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16> to vector<16x16xf16> @@ -73,8 +80,10 @@ func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, % xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @extf_truncf( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, %[[ARG1:[0-9a-zA-Z]+]]: // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>) -> vector<8x16xf32> { @@ -88,8 +97,10 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> return %4 : vector<8x16xf32> } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_gather_with_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} @@ -113,8 +124,10 @@ func.func 
@load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256 xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_gather_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} @@ -132,8 +145,9 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf xegpu.store_nd %1, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @store_scatter_with_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> -> @@ -148,8 +162,9 @@ func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) { xegpu.store %cst, %0, %cst_0 : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @store_scatter_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, @@ -161,8 +176,9 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { xegpu.store %arg0, %0, %cst_0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scatter_ops_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> @@ -179,8 +195,9 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) { : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scatter_ops( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> @@ -195,8 +212,9 @@ func.func @scatter_ops(%src: memref<256xf16>) { xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( // CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16> @@ -219,8 +237,9 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> @@ -239,8 +258,9 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 xegpu.store_nd %6, %7 : 
vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} // CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16> @@ -255,8 +275,9 @@ func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16 xegpu.store_nd %3, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> // CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} @@ -270,9 +291,10 @@ func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %a xegpu.store_nd %3, %1 : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @binary_op_one_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, @@ -291,8 +313,9 @@ func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu. xegpu.store_nd %4, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @binary_op_multiple_uses( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, @@ -312,8 +335,9 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: ! 
xegpu.store_nd %2, %arg3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @for_op( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> @@ -353,8 +377,9 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me xegpu.store_nd %2#2, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @if_single_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, @@ -381,8 +406,9 @@ func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tens xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @if_multiple_uses( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, @@ -411,8 +437,9 @@ func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t xegpu.store_nd %1, %arg4 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_outer_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) { // CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [0] : vector<16x16xf32> to vector<16xf32> @@ -422,8 +449,9 @@ func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_inner_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) { // CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [1] : vector<16x16xf32> to vector<16xf32> @@ -433,8 +461,9 @@ func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @update_nd_offset_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> @@ -448,8 +477,9 @@ func.func @update_nd_offset_1d(%arg0: memref<256xf32>){ xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @update_nd_offset_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) { // CHECK: %[[T0:.*]] = 
xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> @@ -463,8 +493,9 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @prefetch_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> @@ -475,8 +506,9 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @prefetch_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> @@ -487,8 +519,9 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scf_while_and_condition( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) @@ -520,8 +553,9 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32 } return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { @@ -541,8 +575,9 @@ func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { @@ -563,3 +598,4 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } +} diff --git a/mlir/test/mlir-runner/memref-reshape.mlir b/mlir/test/mlir-runner/memref-reshape.mlir index 8c17f1fd..b264e02 100644 --- a/mlir/test/mlir-runner/memref-reshape.mlir +++ b/mlir/test/mlir-runner/memref-reshape.mlir @@ -65,8 +65,8 @@ func.func @reshape_ranked_memref_to_ranked(%input : memref<2x3xf32>, func.func @reshape_unranked_memref_to_ranked(%input : memref<2x3xf32>, %shape : memref<2xindex>) { %unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32> - %output = memref.reshape %input(%shape) - : (memref<2x3xf32>, memref<2xindex>) -> memref<?x?xf32> + %output = memref.reshape %unranked_input(%shape) + : 
(memref<*xf32>, memref<2xindex>) -> memref<?x?xf32> %unranked_output = memref.cast %output : memref<?x?xf32> to memref<*xf32> call @printMemrefF32(%unranked_output) : (memref<*xf32>) -> () @@ -95,8 +95,8 @@ func.func @reshape_unranked_memref_to_unranked(%input : memref<2x3xf32>, %shape : memref<2xindex>) { %unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32> %dyn_size_shape = memref.cast %shape : memref<2xindex> to memref<?xindex> - %output = memref.reshape %input(%dyn_size_shape) - : (memref<2x3xf32>, memref<?xindex>) -> memref<*xf32> + %output = memref.reshape %unranked_input(%dyn_size_shape) + : (memref<*xf32>, memref<?xindex>) -> memref<*xf32> call @printMemrefF32(%output) : (memref<*xf32>) -> () // CHECK: rank = 2 offset = 0 sizes = [3, 2] strides = [2, 1] data = diff --git a/mlir/test/mlir-tblgen/op-properties.td b/mlir/test/mlir-tblgen/op-properties.td index a9c784c..cb9bd3d 100644 --- a/mlir/test/mlir-tblgen/op-properties.td +++ b/mlir/test/mlir-tblgen/op-properties.td @@ -32,7 +32,7 @@ def OpWithProps : NS_Op<"op_with_props"> { ArrayProp<StringProp>:$strings, DefaultValuedProp<I32Prop, "0">:$default_int, OptionalProp<I64Prop>:$optional, - DefaultI64Array:$intArray + DefaultI64Array:$value ); } @@ -94,10 +94,10 @@ def OpWithOptionalPropsAndAttrs : // DECL: ::llvm::ArrayRef<std::string> getStrings() // DECL: using default_intTy = int32_t; // DECL: default_intTy default_int = 0; -// DECL: intArrayTy intArray = ::llvm::SmallVector<int64_t>{}; -// DECL: ::llvm::ArrayRef<int64_t> getIntArray() +// DECL: valueTy value = ::llvm::SmallVector<int64_t>{}; +// DECL: ::llvm::ArrayRef<int64_t> getValue() // DECL: return ::llvm::ArrayRef<int64_t>{propStorage} -// DECL: void setIntArray(::llvm::ArrayRef<int64_t> propValue) +// DECL: void setValue(::llvm::ArrayRef<int64_t> propValue) // DECL: propStorage.assign // DECL-LABEL: class OpWithProps : // DECL: setString(::llvm::StringRef newString) @@ -111,14 +111,14 @@ def OpWithOptionalPropsAndAttrs : // DECL-SAME: ::llvm::ArrayRef<std::string> strings, // DECL-SAME: /*optional*/int32_t default_int = 0, // DECL-SAME: /*optional*/std::optional<int64_t> optional = std::nullopt, -// DECL-SAME: /*optional*/::llvm::ArrayRef<int64_t> intArray = ::llvm::ArrayRef<int64_t>{}); +// DECL-SAME: /*optional*/::llvm::ArrayRef<int64_t> value = ::llvm::ArrayRef<int64_t>{}); // DEFS-LABEL: OpWithProps::computePropertiesHash -// DEFS: hash_intArray +// DEFS: hash_value_ // DEFS: using ::llvm::hash_value; // DEFS-NEXT: return hash_value(::llvm::ArrayRef<int64_t>{propStorage}) // DEFS: hash_value(prop.optional) -// DEFS: hash_intArray(prop.intArray) +// DEFS: hash_value_(prop.value) // ----- |