diff options
Diffstat (limited to 'mlir/test/Dialect')
| -rw-r--r-- | mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir | 172 | ||||
| -rw-r--r-- | mlir/test/Dialect/LLVMIR/nvvm.mlir | 8 | ||||
| -rw-r--r-- | mlir/test/Dialect/Linalg/canonicalize.mlir | 2 | ||||
| -rw-r--r-- | mlir/test/Dialect/Linalg/generalize-named-ops.mlir | 22 | ||||
| -rw-r--r-- | mlir/test/Dialect/Linalg/invalid.mlir | 10 | ||||
| -rw-r--r-- | mlir/test/Dialect/Linalg/one-shot-bufferize.mlir | 2 | ||||
| -rw-r--r-- | mlir/test/Dialect/Linalg/roundtrip.mlir | 18 | ||||
| -rw-r--r-- | mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir | 2 | ||||
| -rw-r--r-- | mlir/test/Dialect/Tensor/bufferize.mlir | 2 | ||||
| -rw-r--r-- | mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir | 2 | ||||
| -rw-r--r-- | mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir | 128 | ||||
| -rw-r--r-- | mlir/test/Dialect/XeGPU/propagate-layout.mlir | 82 |
12 files changed, 306 insertions, 144 deletions
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir index 87a31ca..1adc418 100644 --- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir @@ -8,11 +8,11 @@ // RUN: mlir-opt --allow-unregistered-dialect \ // RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \ -// RUN: | FileCheck %s --check-prefix=CHECK-GFX9 +// RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX9 // RUN: mlir-opt --allow-unregistered-dialect \ // RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx1030" %s \ -// RUN: | FileCheck %s --check-prefix=CHECK-GFX10 +// RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX10 // CHECK-SUB: gpu.module @kernels { // CHECK-SHFL: gpu.module @kernels { @@ -24,8 +24,7 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>) // // CHECK-SHFL-LABEL: gpu.func @kernel0( - // CHECK-GFX9-LABEL: gpu.func @kernel0( - // CHECK-GFX10-LABEL: gpu.func @kernel0( + // CHECK-GFX-LABEL: gpu.func @kernel0( gpu.func @kernel0(%arg0: vector<5xf16>) kernel { // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16> // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16> @@ -56,8 +55,7 @@ gpu.module @kernels { // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4) // CHECK-SUB: "test.consume" - // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} - // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} + // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} %sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum2) : (vector<5xf16>) -> () @@ -74,8 +72,7 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>) // // CHECK-SHFL-LABEL: gpu.func @kernel1( - // CHECK-GFX9-LABEL: gpu.func @kernel1( - // CHECK-GFX10-LABEL: gpu.func @kernel1( + // CHECK-GFX-LABEL: gpu.func @kernel1( gpu.func @kernel1(%arg0: vector<1xf32>) kernel { // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32> // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32 @@ -100,17 +97,14 @@ gpu.module @kernels { // Note stride is dropped because it is == 1. // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32 // CHECK-SUB: "test.consume" - // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} quad_perm - // CHECK-GFX9: amdgpu.dpp {{.+}} row_half_mirror - // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} quad_perm - // CHECK-GFX10: amdgpu.dpp {{.+}} row_half_mirror + // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} quad_perm + // CHECK-GFX: amdgpu.dpp {{.+}} row_half_mirror %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum2) : (vector<1xf32>) -> () // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32 // CHECK-SUB: "test.consume" - // CHECK-GFX9-NOT: amdgpu.dpp - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-NOT: amdgpu.dpp // CHECK-GFX10-NOT: rocdl.permlanex16 %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum3) : (vector<1xf32>) -> () @@ -126,11 +120,8 @@ gpu.module @kernels { // // CHECK-SHFL-LABEL: gpu.func @kernel2( // - // CHECK-GFX9-LABEL: gpu.func @kernel2( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel2( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel2( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel { // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8> // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> () @@ -148,8 +139,7 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) - // CHECK-GFX9-LABEL: gpu.func @kernel3( - // CHECK-GFX10-LABEL: gpu.func @kernel3( + // CHECK-GFX-LABEL: gpu.func @kernel3( gpu.func @kernel3(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -169,9 +159,9 @@ gpu.module @kernels { // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32 // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32 // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> () - + // CHECK-GFX9-COUNT-6: amdgpu.dpp - + // CHECK-GFX10-COUNT-4: amdgpu.dpp // CHECK-GFX10: rocdl.permlanex16 // CHECK-GFX10-COUNT-2: rocdl.readlane @@ -185,11 +175,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) // - // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered( - // CHECK-GFX9-SAME: %[[ARG0:.+]]: i32) - // - // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered( - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i32) + // CHECK-GFX-LABEL: gpu.func @kernel3_clustered( + // CHECK-GFX-SAME: %[[ARG0:.+]]: i32) gpu.func @kernel3_clustered(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -204,19 +191,13 @@ gpu.module @kernels { // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> () - // CHECK-GFX9: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX9: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 - // CHECK-GFX9: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX9: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 - // CHECK-GFX9: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 - // CHECK-GFX9: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 - - // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 - // CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 - // CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 + // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 + // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-GFX10: "test.consume"(%[[A2]]) : (i32) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32 "test.consume"(%sum0) : (i32) -> () @@ -228,11 +209,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) // - // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered_strided( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered_strided( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel3_clustered_strided( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel3_clustered_strided(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32 @@ -256,11 +234,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) // - // CHECK-GFX9-LABEL: gpu.func @kernel4( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel4( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel4( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel4(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -298,11 +273,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) // - // CHECK-GFX9-LABEL: gpu.func @kernel4_clustered( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel4_clustered( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel4_clustered( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -319,10 +291,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) // - // CHECK-GFX9-LABEL: gpu.func @kernel5( - // - // CHECK-GFX10-LABEL: gpu.func @kernel5( - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16) + // CHECK-GFX-LABEL: gpu.func @kernel5( + // CHECK-GFX-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -334,7 +304,7 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - + // CHECK-GFX9-COUNT-6: amdgpu.dpp // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 @@ -361,11 +331,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) // - // CHECK-GFX9-LABEL: gpu.func @kernel5_clustered - // CHECK-GFX9-SAME: %[[ARG0:.+]]: i16) - // - // CHECK-GFX10-LABEL: gpu.func @kernel5_clustered - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16) + // CHECK-GFX-LABEL: gpu.func @kernel5_clustered + // CHECK-GFX-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5_clustered(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -378,25 +345,15 @@ gpu.module @kernels { // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - // CHECK-GFX9: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 - // CHECK-GFX9: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 - // CHECK-GFX9: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 - // CHECK-GFX9: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 - // CHECK-GFX9: "test.consume"(%[[VAR7]]) : (i16) -> () - - // CHECK-GFX10: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 - // CHECK-GFX10: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 - // CHECK-GFX10: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 - // CHECK-GFX10: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 - // CHECK-GFX10: "test.consume"(%[[VAR7]]) : (i16) -> () + // CHECK-GFX: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 + // CHECK-GFX: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 + // CHECK-GFX: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 + // CHECK-GFX: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 + // CHECK-GFX: "test.consume"(%[[VAR7]]) : (i16) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () @@ -407,11 +364,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel6( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) // - // CHECK-GFX9-LABEL: gpu.func @kernel6( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel6( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel6( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel6(%arg0: vector<3xi8>) kernel { // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8> // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8> @@ -433,6 +387,44 @@ gpu.module @kernels { gpu.return } + // CHECK-GFX-LABEL: gpu.func @kernel7( + // CHECK-GFX-SAME: %[[ARG0:.+]]: f32) + // + // Checks, common to gfx942 and gfx1030, of + // (1) quad_perm, followed by reduction resulting in reduction over 2 consecutive lanes, + // (2) quad_perm, followed by reduction resulting in reduction over 4 consecutive lanes, + // (3) row_half_mirror, followed by reduction resulting in reduction over 8 consecutive lanes, and + // (4) row_mirror, followed by reduction resulting in reduction over 16 consecutive lanes. + // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A0:.+]] = arith.addf %[[ARG0]], %[[D0]] : f32 + // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A1:.+]] = arith.addf %[[A0]], %[[D1]] : f32 + // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A2:.+]] = arith.addf %[[A1]], %[[D2]] : f32 + // CHECK-GFX: %[[D3:.+]] = amdgpu.dpp %[[A2]] %[[A2]] row_mirror(unit) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A3:.+]] = arith.addf %[[A2]], %[[D3]] : f32 + // + // Now, on gfx942: + // (1) Lane 15 gets broadcast to lanes [16, 32) and lane 31 gets broadcast to lanes [48, 64], after which + // the reduction in lanes [16, 32) is over the full cluster of the first 32 lanes, and the reduction in lanes + // [48, 64) is over the full cluster of the last 32 lanes. + // (2) Update the reduction value in lanes [0, 16) and [32, 48) with the final reduction result from + // lanes [16, 32) and [48, 64), respectively. + // CHECK-GFX9: %[[BCAST15:.+]] = amdgpu.dpp %[[A3]] %[[A3]] row_bcast_15(unit) {row_mask = 10 : i32} : f32 + // CHECK-GFX9: %[[SUM:.+]] = arith.addf %[[A3]], %[[BCAST15]] : f32 + // CHECK-GFX9: %[[SWIZ:.+]] = amdgpu.swizzle_bitmode %[[SUM]] 0 31 0 : f32 + // CHECK-GFX9: "test.consume"(%[[SWIZ]]) : (f32) -> () + // + // On gfx1030, the final step is to permute the lanes and perform final reduction: + // CHECK-GFX10: rocdl.permlanex16 + // CHECK-GFX10: arith.addf + // CHECK-GFX10: "test.consume" + gpu.func @kernel7(%arg0: f32) kernel { + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (f32) -> (f32) + "test.consume"(%sum0) : (f32) -> () + gpu.return + } + // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) // diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir index 0243f5e..2505e56 100644 --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -419,8 +419,8 @@ llvm.func private @mbarrier_init_generic(%barrier: !llvm.ptr) { llvm.func private @mbarrier_init_shared(%barrier: !llvm.ptr<3>) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.init.shared %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32 - nvvm.mbarrier.init.shared %barrier, %count : !llvm.ptr<3>, i32 + // CHECK: nvvm.mbarrier.init %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32 + nvvm.mbarrier.init %barrier, %count : !llvm.ptr<3>, i32 llvm.return } @@ -433,8 +433,8 @@ llvm.func private @mbarrier_inval_generic(%barrier: !llvm.ptr) { llvm.func private @mbarrier_inval_shared(%barrier: !llvm.ptr<3>) { - // CHECK: nvvm.mbarrier.inval.shared %{{.*}} : !llvm.ptr<3> - nvvm.mbarrier.inval.shared %barrier : !llvm.ptr<3> + // CHECK: nvvm.mbarrier.inval %{{.*}} : !llvm.ptr<3> + nvvm.mbarrier.inval %barrier : !llvm.ptr<3> llvm.return } diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir index 26d2d98..f4020ede 100644 --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -1423,7 +1423,7 @@ func.func @transpose_buffer(%input: memref<?xf32>, func.func @recursive_effect(%arg : tensor<1xf32>) { %init = arith.constant dense<0.0> : tensor<1xf32> %mapped = linalg.map ins(%arg:tensor<1xf32>) outs(%init :tensor<1xf32>) - (%in : f32) { + (%in : f32, %out: f32) { vector.print %in : f32 linalg.yield %in : f32 } diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir index ae07b1b..dcdd6c8 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir @@ -386,18 +386,24 @@ func.func @generalize_batch_reduce_gemm_bf16(%lhs: memref<7x8x9xbf16>, %rhs: mem // ----- -// CHECK-LABEL: generalize_linalg_map -func.func @generalize_linalg_map(%arg0: memref<1x8x8x8xf32>) { +func.func @generalize_linalg_map(%arg0: memref<1x8x8x8xf32>, %arg1: memref<1x8x8x8xf32>, %arg2: memref<1x8x8x8xf32>) { %cst = arith.constant 0.000000e+00 : f32 - // CHECK: linalg.map - // CHECK-NOT: linalg.generic - linalg.map outs(%arg0 : memref<1x8x8x8xf32>) - () { - linalg.yield %cst : f32 - } + linalg.map {arith.addf} ins(%arg0, %arg1: memref<1x8x8x8xf32>, memref<1x8x8x8xf32>) outs(%arg2 : memref<1x8x8x8xf32>) return } +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK: @generalize_linalg_map + +// CHECK: linalg.generic +// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP0]], #[[MAP0]]] +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel"]} +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<1x8x8x8xf32>, memref<1x8x8x8xf32>) outs(%{{.+}} : memref<1x8x8x8xf32> +// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32) +// CHECK: %[[ADD:.+]] = arith.addf %[[BBARG0]], %[[BBARG1]] : f32 +// CHECK: linalg.yield %[[ADD]] : f32 + // ----- func.func @generalize_add(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>, diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index 40bf4d1..fabc8e6 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -681,7 +681,7 @@ func.func @map_binary_wrong_yield_operands( %add = linalg.map ins(%lhs, %rhs : tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 // expected-error @+1{{'linalg.yield' op expected number of yield values (2) to match the number of inits / outs operands of the enclosing LinalgOp (1)}} linalg.yield %0, %0: f32, f32 @@ -694,11 +694,11 @@ func.func @map_binary_wrong_yield_operands( func.func @map_input_mapper_arity_mismatch( %lhs: tensor<64xf32>, %rhs: tensor<64xf32>, %init: tensor<64xf32>) -> tensor<64xf32> { - // expected-error@+1{{'linalg.map' op expects number of operands to match the arity of mapper, but got: 2 and 3}} + // expected-error@+1{{'linalg.map' op expects number of operands to match the arity of mapper, but got: 3 and 4}} %add = linalg.map ins(%lhs, %rhs : tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32, %extra_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32, %extra_elem: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 linalg.yield %0: f32 } @@ -714,7 +714,7 @@ func.func @map_input_mapper_type_mismatch( %add = linalg.map ins(%lhs, %rhs : tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f64, %rhs_elem: f64) { + (%lhs_elem: f64, %rhs_elem: f64, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f64 linalg.yield %0: f64 } @@ -730,7 +730,7 @@ func.func @map_input_output_shape_mismatch( %add = linalg.map ins(%lhs, %rhs : tensor<64x64xf32>, tensor<64x64xf32>) outs(%init:tensor<32xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 linalg.yield %0: f32 } diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir index 1df15e8..85cc1ff 100644 --- a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir @@ -339,7 +339,7 @@ func.func @map_binary(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>, %add = linalg.map ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 linalg.yield %0: f32 } diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir index 563013d..7492892 100644 --- a/mlir/test/Dialect/Linalg/roundtrip.mlir +++ b/mlir/test/Dialect/Linalg/roundtrip.mlir @@ -341,7 +341,7 @@ func.func @mixed_parallel_reduced_results(%arg0 : tensor<?x?x?xf32>, func.func @map_no_inputs(%init: tensor<64xf32>) -> tensor<64xf32> { %add = linalg.map outs(%init:tensor<64xf32>) - () { + (%out: f32) { %0 = arith.constant 0.0: f32 linalg.yield %0: f32 } @@ -349,7 +349,7 @@ func.func @map_no_inputs(%init: tensor<64xf32>) -> tensor<64xf32> { } // CHECK-LABEL: func @map_no_inputs // CHECK: linalg.map outs -// CHECK-NEXT: () { +// CHECK-NEXT: (%[[OUT:.*]]: f32) { // CHECK-NEXT: arith.constant // CHECK-NEXT: linalg.yield // CHECK-NEXT: } @@ -361,7 +361,7 @@ func.func @map_binary(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>, %add = linalg.map ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 linalg.yield %0: f32 } @@ -378,7 +378,7 @@ func.func @map_binary_memref(%lhs: memref<64xf32>, %rhs: memref<64xf32>, linalg.map ins(%lhs, %rhs: memref<64xf32>, memref<64xf32>) outs(%init:memref<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem: f32 linalg.yield %0: f32 } @@ -393,7 +393,7 @@ func.func @map_unary(%input: tensor<64xf32>, %init: tensor<64xf32>) -> tensor<64 %abs = linalg.map ins(%input:tensor<64xf32>) outs(%init:tensor<64xf32>) - (%input_elem: f32) { + (%input_elem: f32, %out: f32) { %0 = math.absf %input_elem: f32 linalg.yield %0: f32 } @@ -408,7 +408,7 @@ func.func @map_unary_memref(%input: memref<64xf32>, %init: memref<64xf32>) { linalg.map ins(%input:memref<64xf32>) outs(%init:memref<64xf32>) - (%input_elem: f32) { + (%input_elem: f32, %out: f32) { %0 = math.absf %input_elem: f32 linalg.yield %0: f32 } @@ -604,7 +604,7 @@ func.func @map_arith_with_attr(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>, %add = linalg.map ins(%lhs, %rhs: tensor<64xf32>, tensor<64xf32>) outs(%init:tensor<64xf32>) - (%lhs_elem: f32, %rhs_elem: f32) { + (%lhs_elem: f32, %rhs_elem: f32, %out: f32) { %0 = arith.addf %lhs_elem, %rhs_elem fastmath<fast> : f32 linalg.yield %0: f32 } @@ -622,7 +622,7 @@ func.func @map_arith_with_attr(%lhs: tensor<64xf32>, %rhs: tensor<64xf32>, func.func @map_not_short_form_compatible(%lhs: tensor<1x32xf32>, %rhs: tensor<1x32xf32>, %init: tensor<1x32xf32>) -> tensor<1x32xf32> { %mapped = linalg.map ins(%lhs, %rhs : tensor<1x32xf32>, tensor<1x32xf32>) outs(%init : tensor<1x32xf32>) - (%in_1: f32, %in_2: f32) { + (%in_1: f32, %in_2: f32, %out: f32) { %1 = arith.maximumf %in_1, %in_2 : f32 linalg.yield %in_1 : f32 } @@ -634,7 +634,7 @@ func.func @map_not_short_form_compatible(%lhs: tensor<1x32xf32>, %rhs: tensor<1x // CHECK-NOT: linalg.map { arith.maximumf } ins(%[[LHS]] : tensor<1x32xf32> // CHECK: linalg.map ins(%[[LHS]], %[[RHS]] : tensor<1x32xf32>, tensor<1x32xf32>) // CHECK-SAME: outs(%[[INIT]] : tensor<1x32xf32>) -// CHECK-NEXT: (%[[IN1:.*]]: f32, %[[IN2:.*]]: f32) { +// CHECK-NEXT: (%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32) { // CHECK-NEXT: %[[MAX_RESULT:.*]] = arith.maximumf %[[IN1]], %[[IN2]] : f32 // CHECK-NEXT: linalg.yield %[[IN1]] : f32 // CHECK-NEXT: } diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir index 93a0336..aa2c1da 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir @@ -356,7 +356,7 @@ func.func @vectorize_map(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) { linalg.map ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%arg2 : memref<64xf32>) - (%in: f32, %in_0: f32) { + (%in: f32, %in_0: f32, %out: f32) { %0 = arith.addf %in, %in_0 : f32 linalg.yield %0 : f32 } diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir index 296ca02..5eb2360 100644 --- a/mlir/test/Dialect/Tensor/bufferize.mlir +++ b/mlir/test/Dialect/Tensor/bufferize.mlir @@ -728,7 +728,7 @@ func.func @tensor.concat_dynamic_nonconcat_dim(%f: tensor<?x?xf32>, %g: tensor<? // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc(%[[M]], %[[N]]) {{.*}} : memref<?x3x?xf32> // CHECK: %[[ALLOC_T:.*]] = bufferization.to_tensor %[[ALLOC]] // CHECK: %[[MAPPED:.*]] = linalg.map outs(%[[ALLOC_T]] : tensor<?x3x?xf32>) -// CHECK: () { +// CHECK: (%[[INIT:.*]]: f32) { // CHECK: linalg.yield %[[F]] : f32 // CHECK: } // CHECK: return %[[MAPPED]] : tensor<?x3x?xf32> diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir index d289d73..2780212 100644 --- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir +++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s gpu.module @test { gpu.func @empty() { diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir new file mode 100644 index 0000000..58461b8 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -0,0 +1,128 @@ +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s + +// CHECK-LABEL: func.func @dpas_f16( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> +gpu.module @test { + +func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + return +} +} + +// ----- +gpu.module @test_kernel { + gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %m = arith.muli %block_id_x, %c32 : index + + %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + + %out:3 = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) + -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { + //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> + %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> + + //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x32xf16> + %c = arith.addf %a, %b : vector<16x32xf16> + + //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>> + xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16> + + //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> + %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc + : !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16> + } + gpu.return + } +} + +// ----- +gpu.module @test_kernel { + gpu.func @elementwise_with_inst_data_12(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %m = arith.muli %block_id_x, %c32 : index + + %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + + %out:3 = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) + -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { + //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : + //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<12x32xf16> + %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> + %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> + + //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<12x32xf16> + %c = arith.addf %a, %b : vector<12x32xf16> + + //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>>> + xegpu.store_nd %c, %arg2: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16> + + //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>> + %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc + : !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16> + } + gpu.return + } +} + +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_chunksize( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> +// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8], lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_chunksize(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> + : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> + xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> + : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 30f785d..543e119 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -1,5 +1,6 @@ -// RUN: mlir-opt -xegpu-propagate-layout -split-input-file %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout -split-input-file %s | FileCheck %s +gpu.module @test { // CHECK-LABEL: func.func @dpas_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32> @@ -25,8 +26,10 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { // CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], @@ -37,8 +40,10 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre xegpu.store_nd %0, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : @@ -55,8 +60,10 @@ func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_transpose( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16> to vector<16x16xf16> @@ -73,8 +80,10 @@ func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, % xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @extf_truncf( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, %[[ARG1:[0-9a-zA-Z]+]]: // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>) -> vector<8x16xf32> { @@ -88,8 +97,10 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> return %4 : vector<8x16xf32> } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_gather_with_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} @@ -113,8 +124,10 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256 xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_gather_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} @@ -132,8 +145,9 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf xegpu.store_nd %1, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @store_scatter_with_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> -> @@ -148,8 +162,9 @@ func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) { xegpu.store %cst, %0, %cst_0 : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @store_scatter_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, @@ -161,8 +176,9 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { xegpu.store %arg0, %0, %cst_0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scatter_ops_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> @@ -179,8 +195,9 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) { : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scatter_ops( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> @@ -195,8 +212,9 @@ func.func @scatter_ops(%src: memref<256xf16>) { xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( // CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16> @@ -219,8 +237,9 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> @@ -239,8 +258,9 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} // CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16> @@ -255,8 +275,9 @@ func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16 xegpu.store_nd %3, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> // CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} @@ -270,9 +291,10 @@ func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %a xegpu.store_nd %3, %1 : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @binary_op_one_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, @@ -291,8 +313,9 @@ func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu. xegpu.store_nd %4, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @binary_op_multiple_uses( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, @@ -312,8 +335,9 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: ! xegpu.store_nd %2, %arg3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @for_op( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> @@ -353,8 +377,9 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me xegpu.store_nd %2#2, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @if_single_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, @@ -381,8 +406,9 @@ func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tens xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @if_multiple_uses( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, @@ -411,8 +437,9 @@ func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t xegpu.store_nd %1, %arg4 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_outer_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) { // CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [0] : vector<16x16xf32> to vector<16xf32> @@ -422,8 +449,9 @@ func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_inner_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) { // CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [1] : vector<16x16xf32> to vector<16xf32> @@ -433,8 +461,9 @@ func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @update_nd_offset_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> @@ -448,8 +477,9 @@ func.func @update_nd_offset_1d(%arg0: memref<256xf32>){ xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @update_nd_offset_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> @@ -463,8 +493,9 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @prefetch_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> @@ -475,8 +506,9 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @prefetch_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> @@ -487,8 +519,9 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scf_while_and_condition( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) @@ -520,8 +553,9 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32 } return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { @@ -541,8 +575,9 @@ func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { @@ -563,3 +598,4 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } +} |
