Diffstat (limited to 'mlir/test')
31 files changed, 2223 insertions, 555 deletions
diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir index 45b1a1f..0cbe064 100644 --- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir @@ -195,6 +195,36 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) { // ----- +// ALL-LABEL: func @distinct_objects +// ALL-SAME: (%[[ARG0:.*]]: memref<?xf16>, %[[ARG1:.*]]: memref<?xf32>, %[[ARG2:.*]]: memref<?xf64>) +func.func @distinct_objects(%arg0: memref<?xf16>, %arg1: memref<?xf32>, %arg2: memref<?xf64>) -> (memref<?xf16>, memref<?xf32>, memref<?xf64>) { +// ALL-DAG: %[[CAST_0:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<?xf16> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL-DAG: %[[CAST_1:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<?xf32> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL-DAG: %[[CAST_2:.*]] = builtin.unrealized_conversion_cast %[[ARG2]] : memref<?xf64> to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL: %[[PTR_0:.*]] = llvm.extractvalue %[[CAST_0]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL: %[[PTR_1:.*]] = llvm.extractvalue %[[CAST_1]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL: %[[PTR_2:.*]] = llvm.extractvalue %[[CAST_2]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL: %[[TRUE:.*]] = llvm.mlir.constant(true) : i1 +// ALL: llvm.intr.assume %[[TRUE]] ["separate_storage"(%[[PTR_0]], %[[PTR_1]] : !llvm.ptr, !llvm.ptr)] : i1 +// ALL: llvm.intr.assume %[[TRUE]] ["separate_storage"(%[[PTR_0]], %[[PTR_2]] : !llvm.ptr, !llvm.ptr)] : i1 +// ALL: llvm.intr.assume %[[TRUE]] ["separate_storage"(%[[PTR_1]], %[[PTR_2]] : !llvm.ptr, !llvm.ptr)] : i1 + %1, %2, %3 = memref.distinct_objects %arg0, %arg1, %arg2 : memref<?xf16>, memref<?xf32>, memref<?xf64> + return %1, %2, %3 : memref<?xf16>, memref<?xf32>, memref<?xf64> +} + +// ----- + +// ALL-LABEL: func @distinct_objects_noop +// ALL-SAME: (%[[ARG0:.*]]: memref<?xf16>) +func.func @distinct_objects_noop(%arg0: memref<?xf16>) -> memref<?xf16> { +// 1-operand version is noop +// ALL-NEXT: return %[[ARG0]] + %1 = memref.distinct_objects %arg0 : memref<?xf16> + return %1 : memref<?xf16> +} + +// ----- + // CHECK-LABEL: func @assume_alignment_w_offset // CHECK-INTERFACE-LABEL: func @assume_alignment_w_offset func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset: ?>>) { diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index ca3de3a..2fe0995 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -2216,6 +2216,18 @@ func.func @test_mulf1(%arg0 : f32, %arg1 : f32) -> (f32) { return %2 : f32 } +// CHECK-LABEL: @test_mulf2( +func.func @test_mulf2(%arg0 : f32) -> (f32, f32) { + // CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32 + // CHECK-DAG: %[[C0n:.+]] = arith.constant -0.000000e+00 : f32 + // CHECK-NEXT: return %[[C0]], %[[C0n]] + %c0 = arith.constant 0.0 : f32 + %c0n = arith.constant -0.0 : f32 + %0 = arith.mulf %c0, %arg0 fastmath<nnan,nsz> : f32 + %1 = arith.mulf %c0n, %arg0 fastmath<nnan,nsz> : f32 + return %0, %1 : f32, f32 +} + // ----- // CHECK-LABEL: @test_divf( diff --git a/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir b/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir index 99790cc..fcd004a 100644 --- 
a/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir +++ b/mlir/test/Dialect/Arith/emulate-unsupported-floats.mlir @@ -85,3 +85,14 @@ func.func @no_expansion(%x: f32) -> f32 { %y = arith.addf %x, %c : f32 func.return %y : f32 } + +// ----- + +func.func @no_promote_select(%c: i1, %x: bf16, %y: bf16) -> bf16 { +// CHECK-LABEL: @no_promote_select +// CHECK-SAME: (%[[C:.+]]: i1, %[[X:.+]]: bf16, %[[Y:.+]]: bf16) +// CHECK: %[[Z:.+]] = arith.select %[[C]], %[[X]], %[[Y]] : bf16 +// CHECK: return %[[Z]] + %z = arith.select %c, %x, %y : bf16 + func.return %z : bf16 +} diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index 0bad151..6134695 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -1068,6 +1068,38 @@ llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) { // ----- +// CHECK-LABEL: rocdl.cvt.scalef32.pk8 +llvm.func @rocdl.cvt.scalef32.pk8(%v8xf32: vector<8xf32>, + %v8xf16: vector<8xf16>, + %v8xbf16: vector<8xbf16>, + %scale: f32) { + + // CHECK: rocdl.cvt.scalef32.pk8.fp8.f32 + %0 = rocdl.cvt.scalef32.pk8.fp8.f32 %v8xf32, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.bf8.f32 + %1 = rocdl.cvt.scalef32.pk8.bf8.f32 %v8xf32, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.fp4.f32 + %2 = rocdl.cvt.scalef32.pk8.fp4.f32 %v8xf32, %scale : i32 + + // CHECK: rocdl.cvt.scalef32.pk8.fp8.f16 + %3 = rocdl.cvt.scalef32.pk8.fp8.f16 %v8xf16, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.bf8.f16 + %4 = rocdl.cvt.scalef32.pk8.bf8.f16 %v8xf16, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.fp4.f16 + %5 = rocdl.cvt.scalef32.pk8.fp4.f16 %v8xf16, %scale : i32 + + // CHECK: rocdl.cvt.scalef32.pk8.fp8.bf16 + %6 = rocdl.cvt.scalef32.pk8.fp8.bf16 %v8xbf16, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.bf8.bf16 + %7 = rocdl.cvt.scalef32.pk8.bf8.bf16 %v8xbf16, %scale : vector<2xi32> + // CHECK: rocdl.cvt.scalef32.pk8.fp4.bf16 + %8 = rocdl.cvt.scalef32.pk8.fp4.bf16 %v8xbf16, %scale : i32 + + llvm.return +} + +// ----- + // CHECK-LABEL: rocdl.cvt.scale.pk16 llvm.func @rocdl.cvt.scale.pk16(%v3xi32: vector<3xi32>, %scale:i32) { diff --git a/mlir/test/Dialect/Math/sincos-fusion.mlir b/mlir/test/Dialect/Math/sincos-fusion.mlir new file mode 100644 index 0000000..29fb9f1 --- /dev/null +++ b/mlir/test/Dialect/Math/sincos-fusion.mlir @@ -0,0 +1,86 @@ +// RUN: mlir-opt -math-sincos-fusion %s | FileCheck %s + +// CHECK-LABEL: func.func @sincos_fusion( +// CHECK-SAME: %[[ARG0:.*]]: f32, +// CHECK-SAME: %[[ARG1:.*]]: f32) -> (f32, f32, f32, f32) { +// CHECK: %[[VAL_0:.*]], %[[VAL_1:.*]] = math.sincos %[[ARG0]] : f32 +// CHECK: %[[VAL_2:.*]], %[[VAL_3:.*]] = math.sincos %[[ARG1]] : f32 +// CHECK: return %[[VAL_0]], %[[VAL_1]], %[[VAL_3]], %[[VAL_2]] : f32, f32, f32, f32 +// CHECK: } +func.func @sincos_fusion(%arg0 : f32, %arg1 : f32) -> (f32, f32, f32, f32) { + %0 = math.sin %arg0 : f32 + %1 = math.cos %arg0 : f32 + + %2 = math.cos %arg1 : f32 + %3 = math.sin %arg1 : f32 + + func.return %0, %1, %2, %3 : f32, f32, f32, f32 +} + +func.func private @sink(%arg0 : f32) + +// CHECK: func.func private @sink(f32) +// CHECK-LABEL: func.func @sincos_ensure_ssa_dominance( +// CHECK-SAME: %[[ARG0:.*]]: f32, +// CHECK-SAME: %[[ARG1:.*]]: f32) -> (f32, f32, f32, f32) { +// CHECK: %[[VAL_0:.*]], %[[VAL_1:.*]] = math.sincos %[[ARG0]] : f32 +// CHECK: call @sink(%[[VAL_0]]) : (f32) -> () +// CHECK: %[[VAL_2:.*]], %[[VAL_3:.*]] = math.sincos %[[ARG1]] : f32 +// CHECK: call 
@sink(%[[VAL_3]]) : (f32) -> () +// CHECK: return %[[VAL_0]], %[[VAL_1]], %[[VAL_3]], %[[VAL_2]] : f32, f32, f32, f32 +// CHECK: } +func.func @sincos_ensure_ssa_dominance(%arg0 : f32, %arg1 : f32) -> (f32, f32, f32, f32) { + %0 = math.sin %arg0 : f32 + func.call @sink(%0) : (f32) -> () + %1 = math.cos %arg0 : f32 + %2 = math.cos %arg1 : f32 + func.call @sink(%2) : (f32) -> () + %3 = math.sin %arg1 : f32 + func.return %0, %1, %2, %3 : f32, f32, f32, f32 +} + +// CHECK-LABEL: func.func @sincos_fusion_no_match_fmf( +// CHECK-SAME: %[[ARG0:.*]]: f32) -> (f32, f32) { +// CHECK: %[[VAL_0:.*]] = math.sin %[[ARG0]] fastmath<contract> : f32 +// CHECK: %[[VAL_1:.*]] = math.cos %[[ARG0]] : f32 +// CHECK: return %[[VAL_0]], %[[VAL_1]] : f32, f32 +// CHECK: } +func.func @sincos_fusion_no_match_fmf(%arg0 : f32) -> (f32, f32) { + %0 = math.sin %arg0 fastmath<contract> : f32 + %1 = math.cos %arg0 : f32 + func.return %0, %1 : f32, f32 +} + +// CHECK-LABEL: func.func @sincos_no_fusion_different_block( +// CHECK-SAME: %[[ARG0:.*]]: f32, +// CHECK-SAME: %[[ARG1:.*]]: i1) -> f32 { +// CHECK: %[[VAL_0:.*]] = scf.if %[[ARG1]] -> (f32) { +// CHECK: %[[VAL_1:.*]] = math.sin %[[ARG0]] : f32 +// CHECK: scf.yield %[[VAL_1]] : f32 +// CHECK: } else { +// CHECK: %[[VAL_2:.*]] = math.cos %[[ARG0]] : f32 +// CHECK: scf.yield %[[VAL_2]] : f32 +// CHECK: } +// CHECK: return %[[VAL_0]] : f32 +// CHECK: } +func.func @sincos_no_fusion_different_block(%arg0 : f32, %flag : i1) -> f32 { + %0 = scf.if %flag -> f32 { + %s = math.sin %arg0 : f32 + scf.yield %s : f32 + } else { + %c = math.cos %arg0 : f32 + scf.yield %c : f32 + } + func.return %0 : f32 +} + +// CHECK-LABEL: func.func @sincos_fusion_preserve_fastmath( +// CHECK-SAME: %[[ARG0:.*]]: f32) -> (f32, f32) { +// CHECK: %[[VAL_0:.*]], %[[VAL_1:.*]] = math.sincos %[[ARG0]] fastmath<contract> : f32 +// CHECK: return %[[VAL_0]], %[[VAL_1]] : f32, f32 +// CHECK: } +func.func @sincos_fusion_preserve_fastmath(%arg0 : f32) -> (f32, f32) { + %0 = math.sin %arg0 fastmath<contract> : f32 + %1 = math.cos %arg0 fastmath<contract> : f32 + func.return %0, %1 : f32, f32 +} diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir index 3f96d90..5ff2920 100644 --- a/mlir/test/Dialect/MemRef/invalid.mlir +++ b/mlir/test/Dialect/MemRef/invalid.mlir @@ -1169,3 +1169,19 @@ func.func @expand_shape_invalid_output_shape( into memref<2x15x20xf32, strided<[60000, 4000, 2], offset: 100>> return } + +// ----- + +func.func @distinct_objects_types_mismatch(%arg0: memref<?xf32>, %arg1: memref<?xi32>) -> (memref<?xi32>, memref<?xf32>) { + // expected-error @+1 {{operand types and result types must match}} + %0, %1 = "memref.distinct_objects"(%arg0, %arg1) : (memref<?xf32>, memref<?xi32>) -> (memref<?xi32>, memref<?xf32>) + return %0, %1 : memref<?xi32>, memref<?xf32> +} + +// ----- + +func.func @distinct_objects_0_operands() { + // expected-error @+1 {{expected at least one operand}} + "memref.distinct_objects"() : () -> () + return +} diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir index 6c2298a..a90c950 100644 --- a/mlir/test/Dialect/MemRef/ops.mlir +++ b/mlir/test/Dialect/MemRef/ops.mlir @@ -302,6 +302,15 @@ func.func @assume_alignment(%0: memref<4x4xf16>) { return } +// CHECK-LABEL: func @distinct_objects +// CHECK-SAME: (%[[ARG0:.*]]: memref<?xf16>, %[[ARG1:.*]]: memref<?xf32>, %[[ARG2:.*]]: memref<?xf64>) +func.func @distinct_objects(%arg0: memref<?xf16>, %arg1: memref<?xf32>, %arg2: memref<?xf64>) -> (memref<?xf16>, memref<?xf32>, 
memref<?xf64>) { + // CHECK: %[[RES:.*]]:3 = memref.distinct_objects %[[ARG0]], %[[ARG1]], %[[ARG2]] : memref<?xf16>, memref<?xf32>, memref<?xf64> + %1, %2, %3 = memref.distinct_objects %arg0, %arg1, %arg2 : memref<?xf16>, memref<?xf32>, memref<?xf64> + // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : memref<?xf16>, memref<?xf32>, memref<?xf64> + return %1, %2, %3 : memref<?xf16>, memref<?xf32>, memref<?xf64> +} + // CHECK-LABEL: func @expand_collapse_shape_static func.func @expand_collapse_shape_static( %arg0: memref<3x4x5xf32>, diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index cb69058..1484d7e 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -358,6 +358,41 @@ func.func @acc_loop_multiple_block() { // ----- +acc.firstprivate.recipe @firstprivatization_memref_10xf32 : memref<10xf32> init { +^bb0(%arg0: memref<10xf32>): + %0 = memref.alloca() : memref<10xf32> + acc.yield %0 : memref<10xf32> +} copy { +^bb0(%arg0: memref<10xf32>, %arg1: memref<10xf32>): + memref.copy %arg0, %arg1 : memref<10xf32> to memref<10xf32> + acc.terminator +} destroy { +^bb0(%arg0: memref<10xf32>): + acc.terminator +} + +func.func @testloopfirstprivate(%a: memref<10xf32>, %b: memref<10xf32>) -> () { + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index + %firstprivate = acc.firstprivate varPtr(%a : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32> + acc.loop firstprivate(@firstprivatization_memref_10xf32 -> %firstprivate : memref<10xf32>) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { + "test.openacc_dummy_op"() : () -> () + acc.yield + } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]} + return +} + +// CHECK-LABEL: func.func @testloopfirstprivate( +// CHECK-SAME: %[[ARG0:.*]]: memref<10xf32>, %[[ARG1:.*]]: memref<10xf32>) +// CHECK: %[[FIRSTPRIVATE:.*]] = acc.firstprivate varPtr(%[[ARG0]] : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32> +// CHECK: acc.loop firstprivate(@firstprivatization_memref_10xf32 -> %[[FIRSTPRIVATE]] : memref<10xf32>) control(%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { +// CHECK: "test.openacc_dummy_op"() : () -> () +// CHECK: acc.yield +// CHECK: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]} + +// ----- + acc.private.recipe @privatization_memref_10_f32 : memref<10xf32> init { ^bb0(%arg0: memref<10xf32>): %0 = memref.alloc() : memref<10xf32> @@ -535,6 +570,7 @@ acc.firstprivate.recipe @firstprivatization_memref_10xf32 : memref<10xf32> init acc.yield %0 : memref<10xf32> } copy { ^bb0(%arg0: memref<10xf32>, %arg1: memref<10xf32>): + memref.copy %arg0, %arg1 : memref<10xf32> to memref<10xf32> acc.terminator } destroy { ^bb0(%arg0: memref<10xf32>): diff --git a/mlir/test/Dialect/OpenMP/cli-canonical_loop.mlir b/mlir/test/Dialect/OpenMP/cli-canonical_loop.mlir index adadb8b..0e9385e 100644 --- a/mlir/test/Dialect/OpenMP/cli-canonical_loop.mlir +++ b/mlir/test/Dialect/OpenMP/cli-canonical_loop.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s | FileCheck %s -// RUN: mlir-opt %s | mlir-opt | FileCheck %s +// RUN: mlir-opt %s | FileCheck %s --enable-var-scope +// RUN: mlir-opt %s | mlir-opt | FileCheck %s --enable-var-scope // CHECK-LABEL: @omp_canonloop_raw( @@ -24,10 +24,10 @@ func.func @omp_canonloop_raw(%tc : i32) -> () { func.func @omp_canonloop_sequential_raw(%tc : i32) -> () { // CHECK-NEXT: %canonloop_s0 = omp.new_cli 
%canonloop_s0 = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc]]) { "omp.canonical_loop" (%tc, %canonloop_s0) ({ ^bb_first(%iv_first: i32): - // CHECK-NEXT: = llvm.add %iv, %iv : i32 + // CHECK-NEXT: = llvm.add %iv_s0, %iv_s0 : i32 %newval = llvm.add %iv_first, %iv_first : i32 // CHECK-NEXT: omp.terminator omp.terminator @@ -36,7 +36,7 @@ func.func @omp_canonloop_sequential_raw(%tc : i32) -> () { // CHECK-NEXT: %canonloop_s1 = omp.new_cli %canonloop_s1 = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv : i32 in range(%[[tc]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc]]) { "omp.canonical_loop" (%tc, %canonloop_s1) ({ ^bb_second(%iv_second: i32): // CHECK: omp.terminator @@ -52,17 +52,17 @@ func.func @omp_canonloop_sequential_raw(%tc : i32) -> () { // CHECK-LABEL: @omp_nested_canonloop_raw( // CHECK-SAME: %[[tc_outer:.+]]: i32, %[[tc_inner:.+]]: i32) func.func @omp_nested_canonloop_raw(%tc_outer : i32, %tc_inner : i32) -> () { - // CHECK-NEXT: %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %canonloop = omp.new_cli %outer = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: %canonloop_s0_s0 = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli %inner = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc_outer]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc_outer]]) { "omp.canonical_loop" (%tc_outer, %outer) ({ ^bb_outer(%iv_outer: i32): - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0_s0) %iv_0 : i32 in range(%[[tc_inner]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc_inner]]) { "omp.canonical_loop" (%tc_inner, %inner) ({ ^bb_inner(%iv_inner: i32): - // CHECK-NEXT: = llvm.add %iv, %iv_0 : i32 + // CHECK-NEXT: = llvm.add %iv, %iv_d1 : i32 %newval = llvm.add %iv_outer, %iv_inner: i32 // CHECK-NEXT: omp.terminator omp.terminator @@ -108,16 +108,24 @@ func.func @omp_canonloop_constant_pretty() -> () { func.func @omp_canonloop_sequential_pretty(%tc : i32) -> () { // CHECK-NEXT: %canonloop_s0 = omp.new_cli %canonloop_s0 = omp.new_cli - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { - omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%tc) { // CHECK-NEXT: omp.terminator omp.terminator } // CHECK: %canonloop_s1 = omp.new_cli %canonloop_s1 = omp.new_cli - // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv : i32 in range(%[[tc]]) { - omp.canonical_loop(%canonloop_s1) %iv_0 : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%tc) { + // CHECK-NEXT: omp.terminator + omp.terminator + } + + // CHECK: %canonloop_s2 = omp.new_cli + %canonloop_s2 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s2) %iv_s2 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s2) %iv_s2 : i32 in range(%tc) { // CHECK-NEXT: omp.terminator omp.terminator } @@ -126,17 +134,17 @@ func.func @omp_canonloop_sequential_pretty(%tc : i32) -> () { } -// CHECK-LABEL: @omp_canonloop_nested_pretty( +// CHECK-LABEL: @omp_canonloop_2d_nested_pretty( // CHECK-SAME: %[[tc:.+]]: 
i32) -func.func @omp_canonloop_nested_pretty(%tc : i32) -> () { - // CHECK-NEXT: %canonloop_s0 = omp.new_cli - %canonloop_s0 = omp.new_cli - // CHECK-NEXT: %canonloop_s0_s0 = omp.new_cli - %canonloop_s0_s0 = omp.new_cli - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { - omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%tc) { - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0_s0) %iv_0 : i32 in range(%[[tc]]) { - omp.canonical_loop(%canonloop_s0_s0) %iv_0 : i32 in range(%tc) { +func.func @omp_canonloop_2d_nested_pretty(%tc : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %canonloop = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli + %canonloop_d1 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%tc) { // CHECK: omp.terminator omp.terminator } @@ -147,6 +155,77 @@ func.func @omp_canonloop_nested_pretty(%tc : i32) -> () { } +// CHECK-LABEL: @omp_canonloop_3d_nested_pretty( +// CHECK-SAME: %[[tc:.+]]: i32) +func.func @omp_canonloop_3d_nested_pretty(%tc : i32) -> () { + // CHECK: %canonloop = omp.new_cli + %canonloop = omp.new_cli + // CHECK: %canonloop_d1 = omp.new_cli + %canonloop_d1 = omp.new_cli + // CHECK: %canonloop_d2 = omp.new_cli + %canonloop_d2 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_d1) %iv_1d : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d2) %iv_d2 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_d2) %iv_d2 : i32 in range(%tc) { + // CHECK-NEXT: omp.terminator + omp.terminator + // CHECK-NEXT: } + } + // CHECK-NEXT: omp.terminator + omp.terminator + // CHECK-NEXT: } + } + // CHECK-NEXT: omp.terminator + omp.terminator + } + + return +} + + +// CHECK-LABEL: @omp_canonloop_sequential_nested_pretty( +// CHECK-SAME: %[[tc:.+]]: i32) +func.func @omp_canonloop_sequential_nested_pretty(%tc : i32) -> () { + // CHECK-NEXT: %canonloop_s0 = omp.new_cli + %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %canonloop_s0_d1 = omp.new_cli + %canonloop_s0_d1 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0_d1) %iv_s0_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s0_d1) %iv_s0_d1 : i32 in range(%tc) { + // CHECK-NEXT: omp.terminator + omp.terminator + // CHECK-NEXT: } + } + // CHECK-NEXT: omp.terminator + omp.terminator + // CHECK-NEXT: } + } + + // CHECK-NEXT: %canonloop_s1 = omp.new_cli + %canonloop_s1 = omp.new_cli + // CHECK-NEXT: %canonloop_s1_d1 = omp.new_cli + %canonloop_s1_d1 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_s1_d1) %iv_s1_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop_s1_d1) %iv_s1d1 : i32 in range(%tc) { + // CHECK-NEXT: omp.terminator + omp.terminator + // CHECK-NEXT: } + } + // CHECK-NEXT: omp.terminator + omp.terminator + } + + return +} + + // CHECK-LABEL: @omp_newcli_unused( // 
CHECK-SAME: ) func.func @omp_newcli_unused() -> () { @@ -155,3 +234,74 @@ func.func @omp_newcli_unused() -> () { // CHECK-NEXT: return return } + + +// CHECK-LABEL: @omp_canonloop_multiregion_isolatedfromabove( +func.func @omp_canonloop_multiregion_isolatedfromabove() -> () { + omp.private {type = firstprivate} @x.privatizer : !llvm.ptr init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %c42_i32 = arith.constant 42: i32 + // CHECK: omp.canonical_loop %iv : i32 in range(%c42_i32) { + omp.canonical_loop %iv1 : i32 in range(%c42_i32) { + omp.terminator + } + // CHECK: omp.yield + omp.yield(%arg0 : !llvm.ptr) + } copy { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %c42_i32 = arith.constant 42: i32 + // CHECK: omp.canonical_loop %iv : i32 in range(%c42_i32) { + omp.canonical_loop %iv : i32 in range(%c42_i32) { + // CHECK: omp.canonical_loop %iv_d1 : i32 in range(%c42_i32) { + omp.canonical_loop %iv_d1 : i32 in range(%c42_i32) { + omp.terminator + } + omp.terminator + } + // CHECK: omp.yield + omp.yield(%arg0 : !llvm.ptr) + } dealloc { + ^bb0(%arg0: !llvm.ptr): + %c42_i32 = arith.constant 42: i32 + // CHECK: omp.canonical_loop %iv_s0 : i32 in range(%c42_i32) { + omp.canonical_loop %iv_s0 : i32 in range(%c42_i32) { + omp.terminator + } + // CHECK: omp.canonical_loop %iv_s1 : i32 in range(%c42_i32) { + omp.canonical_loop %iv_s1 : i32 in range(%c42_i32) { + omp.terminator + } + // CHECK: omp.yield + omp.yield + } + + // CHECK: return + return +} + + +// CHECK-LABEL: @omp_canonloop_multiregion( +func.func @omp_canonloop_multiregion(%c : i1) -> () { + %c42_i32 = arith.constant 42: i32 + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + %canonloop3 = omp.new_cli + scf.if %c { + // CHECK: omp.canonical_loop(%canonloop_r0) %iv_r0 : i32 in range(%c42_i32) { + omp.canonical_loop(%canonloop1) %iv1 : i32 in range(%c42_i32) { + omp.terminator + } + } else { + // CHECK: omp.canonical_loop(%canonloop_r1_s0) %iv_r1_s0 : i32 in range(%c42_i32) { + omp.canonical_loop(%canonloop2) %iv2 : i32 in range(%c42_i32) { + omp.terminator + } + // CHECK: omp.canonical_loop(%canonloop_r1_s1) %iv_r1_s1 : i32 in range(%c42_i32) { + omp.canonical_loop(%canonloop3) %iv3 : i32 in range(%c42_i32) { + omp.terminator + } + } + + // CHECK: return + return +} diff --git a/mlir/test/Dialect/OpenMP/cli-tile.mlir b/mlir/test/Dialect/OpenMP/cli-tile.mlir new file mode 100644 index 0000000..73d5478 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/cli-tile.mlir @@ -0,0 +1,138 @@ +// RUN: mlir-opt %s | FileCheck %s --enable-var-scope +// RUN: mlir-opt %s | mlir-opt | FileCheck %s --enable-var-scope + + +// Raw syntax check (MLIR output is always pretty-printed) +// CHECK-LABEL: @omp_tile_raw( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_raw(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %canonloop = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: %grid1 = omp.new_cli + %grid = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: %intratile1 = omp.new_cli + %intratile = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + "omp.canonical_loop" (%tc, %canonloop) ({ + ^bb0(%iv: i32): + // CHECK: omp.terminator + omp.terminator + }) : (i32, !omp.cli) -> () + // CHECK: omp.tile (%grid1, %intratile1) <- (%canonloop) sizes(%[[ts]] : i32) + "omp.tile"(%grid, %intratile, %canonloop, %ts) <{operandSegmentSizes = array<i32: 2, 1, 1>}> : (!omp.cli, !omp.cli, !omp.cli, i32) -> () + //"omp.tile" (%canonloop) : (!omp.cli) -> () + 
return +} + + +// Pretty syntax check +// CHECK-LABEL: @omp_tile_pretty( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_pretty(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %grid = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %intratile = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile (%grid1, %intratile1) <- (%canonloop) sizes(%[[ts]] : i32) + omp.tile(%grid, %intratile) <- (%canonloop) sizes(%ts : i32) + return +} + + +// Specifying the generatees for omp.tile is optional +// CHECK-LABEL: @omp_tile_optionalgen_pretty( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_optionalgen_pretty(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %canonloop = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile <- (%canonloop) sizes(%[[ts]] : i32) + omp.tile <- (%canonloop) sizes(%ts : i32) + return +} + + +// Two-dimensional tiling +// CHECK-LABEL: @omp_tile_2d_pretty( +// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32, %[[ts1:.+]]: i32, %[[ts2:.+]]: i32) { +func.func @omp_tile_2d_pretty(%tc1 : i32, %tc2 : i32, %ts1 : i32, %ts2 : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %cli_outer = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli + %cli_inner = omp.new_cli + // CHECK-NEXT: %grid1 = omp.new_cli + %grid1 = omp.new_cli + // CHECK-NEXT: %grid2 = omp.new_cli + %grid2 = omp.new_cli + // CHECK-NEXT: %intratile1 = omp.new_cli + %intratile1 = omp.new_cli + // CHECK-NEXT: %intratile2 = omp.new_cli + %intratile2 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc1]]) { + omp.canonical_loop(%cli_outer) %iv_outer : i32 in range(%tc1) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc2]]) { + omp.canonical_loop(%cli_inner) %iv_inner : i32 in range(%tc2) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile (%grid1, %grid2, %intratile1, %intratile2) <- (%canonloop, %canonloop_d1) sizes(%[[ts1]], %[[ts2]] : i32, i32) + omp.tile (%grid1, %grid2, %intratile1, %intratile2) <- (%cli_outer, %cli_inner) sizes(%ts1, %ts2 : i32, i32) + return +} + + +// Three-dimensional tiling +// CHECK-LABEL: @omp_tile_3d_pretty( +// CHECK-SAME: %[[tc:.+]]: i32, %[[ts:.+]]: i32) { +func.func @omp_tile_3d_pretty(%tc : i32, %ts : i32) -> () { + // CHECK-NEXT: %canonloop = omp.new_cli + %cli_outer = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli + %cli_middle = omp.new_cli + // CHECK-NEXT: %canonloop_d2 = omp.new_cli + %cli_inner = omp.new_cli + // CHECK-NEXT: %grid1 = omp.new_cli + %grid1 = omp.new_cli + // CHECK-NEXT: %grid2 = omp.new_cli + %grid2 = omp.new_cli + // CHECK-NEXT: %grid3 = omp.new_cli + %grid3 = omp.new_cli + // CHECK-NEXT: %intratile1 = omp.new_cli + %intratile1 = omp.new_cli + // CHECK-NEXT: %intratile2 = omp.new_cli + %intratile2 = omp.new_cli + // CHECK-NEXT: %intratile3 = omp.new_cli + %intratile3 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { + omp.canonical_loop(%cli_outer) %iv_outer 
: i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc]]) { + omp.canonical_loop(%cli_middle) %iv_middle : i32 in range(%tc) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d2) %iv_d2 : i32 in range(%[[tc]]) { + omp.canonical_loop(%cli_inner) %iv_inner : i32 in range(%tc) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.tile (%grid1, %grid2, %grid3, %intratile1, %intratile2, %intratile3) <- (%canonloop, %canonloop_d1, %canonloop_d2) sizes(%[[ts]], %[[ts]], %[[ts]] : i32, i32, i32) + omp.tile (%grid1, %grid2, %grid3, %intratile1, %intratile2, %intratile3) <- (%cli_outer, %cli_middle, %cli_inner) sizes(%ts, %ts, %ts: i32, i32, i32) + return +} diff --git a/mlir/test/Dialect/OpenMP/cli-unroll-heuristic.mlir b/mlir/test/Dialect/OpenMP/cli-unroll-heuristic.mlir index cda7d0b..16884f4 100644 --- a/mlir/test/Dialect/OpenMP/cli-unroll-heuristic.mlir +++ b/mlir/test/Dialect/OpenMP/cli-unroll-heuristic.mlir @@ -1,18 +1,18 @@ -// RUN: mlir-opt %s | FileCheck %s -// RUN: mlir-opt %s | mlir-opt | FileCheck %s +// RUN: mlir-opt %s | FileCheck %s --enable-var-scope +// RUN: mlir-opt %s | mlir-opt | FileCheck %s --enable-var-scope // CHECK-LABEL: @omp_unroll_heuristic_raw( // CHECK-SAME: %[[tc:.+]]: i32) { func.func @omp_unroll_heuristic_raw(%tc : i32) -> () { - // CHECK-NEXT: %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %canonloop = omp.new_cli %canonloop = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { "omp.canonical_loop" (%tc, %canonloop) ({ ^bb0(%iv: i32): omp.terminator }) : (i32, !omp.cli) -> () - // CHECK: omp.unroll_heuristic(%canonloop_s0) + // CHECK: omp.unroll_heuristic(%canonloop) "omp.unroll_heuristic" (%canonloop) : (!omp.cli) -> () return } @@ -22,12 +22,12 @@ func.func @omp_unroll_heuristic_raw(%tc : i32) -> () { // CHECK-SAME: %[[tc:.+]]: i32) { func.func @omp_unroll_heuristic_pretty(%tc : i32) -> () { // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli - %canonloop = "omp.new_cli" () : () -> (!omp.cli) - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { + %canonloop = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { omp.terminator } - // CHECK: omp.unroll_heuristic(%canonloop_s0) + // CHECK: omp.unroll_heuristic(%canonloop) omp.unroll_heuristic(%canonloop) return } @@ -36,13 +36,13 @@ func.func @omp_unroll_heuristic_pretty(%tc : i32) -> () { // CHECK-LABEL: @omp_unroll_heuristic_nested_pretty( // CHECK-SAME: %[[tc:.+]]: i32) { func.func @omp_unroll_heuristic_nested_pretty(%tc : i32) -> () { - // CHECK-NEXT: %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %canonloop = omp.new_cli %cli_outer = omp.new_cli - // CHECK-NEXT: %canonloop_s0_s0 = omp.new_cli + // CHECK-NEXT: %canonloop_d1 = omp.new_cli %cli_inner = omp.new_cli - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%[[tc]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop) %iv : i32 in range(%[[tc]]) { omp.canonical_loop(%cli_outer) %iv_outer : i32 in range(%tc) { - // CHECK-NEXT: omp.canonical_loop(%canonloop_s0_s0) %iv_0 : i32 in range(%[[tc]]) { + // CHECK-NEXT: omp.canonical_loop(%canonloop_d1) %iv_d1 : i32 in range(%[[tc]]) { omp.canonical_loop(%cli_inner) %iv_inner : i32 in range(%tc) { // 
CHECK: omp.terminator omp.terminator @@ -51,9 +51,9 @@ func.func @omp_unroll_heuristic_nested_pretty(%tc : i32) -> () { omp.terminator } - // CHECK: omp.unroll_heuristic(%canonloop_s0) + // CHECK: omp.unroll_heuristic(%canonloop) omp.unroll_heuristic(%cli_outer) - // CHECK-NEXT: omp.unroll_heuristic(%canonloop_s0_s0) + // CHECK-NEXT: omp.unroll_heuristic(%canonloop_d1) omp.unroll_heuristic(%cli_inner) return } diff --git a/mlir/test/Dialect/OpenMP/invalid-tile.mlir b/mlir/test/Dialect/OpenMP/invalid-tile.mlir new file mode 100644 index 0000000..e63a062 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/invalid-tile.mlir @@ -0,0 +1,119 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics %s + + +func.func @missing_sizes(%tc : i32, %ts : i32) { + %canonloop = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op there must be one tile size for each applyee}} + omp.tile <-(%canonloop) + + llvm.return +} + +// ----- + +func.func @no_loop(%tc : i32, %ts : i32) { + // expected-error@+1 {{'omp.tile' op must apply to at least one loop}} + omp.tile <-() + + return +} + +// ----- + +func.func @missing_generator(%tc : i32, %ts : i32) { + // expected-error@+1 {{'omp.new_cli' op CLI has no generator}} + %canonloop = omp.new_cli + + // expected-note@+1 {{see consumer here: "omp.tile"(%0, %arg1) <{operandSegmentSizes = array<i32: 0, 1, 1>}> : (!omp.cli, i32) -> ()}} + omp.tile <-(%canonloop) sizes(%ts : i32) + + return +} + +// ----- + +func.func @insufficient_sizes(%tc : i32, %ts : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc) { + omp.terminator + } + omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op there must be one tile size for each applyee}} + omp.tile <-(%canonloop1, %canonloop2) sizes(%ts : i32) + + llvm.return +} + +// ----- + +func.func @insufficient_applyees(%tc : i32, %ts : i32) { + %canonloop = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{omp.tile' op there must be one tile size for each applyee}} + omp.tile <- (%canonloop) sizes(%ts, %ts : i32, i32) + + return +} + +// ----- + +func.func @insufficient_generatees(%tc : i32, %ts : i32) { + %canonloop = omp.new_cli + %grid = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc) { + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op expecting two times the number of generatees than applyees}} + omp.tile (%grid) <- (%canonloop) sizes(%ts : i32) + + return +} + +// ----- + +func.func @not_perfectly_nested(%tc : i32, %ts : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv1 : i32 in range(%tc) { + %v = arith.constant 42 : i32 + omp.canonical_loop(%canonloop2) %iv2 : i32 in range(%tc) { + omp.terminator + } + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op tiled loop nest must be perfectly nested}} + omp.tile <-(%canonloop1, %canonloop2) sizes(%ts, %ts : i32, i32) + + llvm.return +} + +// ----- + +func.func @non_nectangular(%tc : i32, %ts : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv1 : i32 in range(%tc) { + omp.canonical_loop(%canonloop2) %iv2 : i32 in range(%iv1) { + omp.terminator + } + omp.terminator + } + + // expected-error@+1 {{'omp.tile' op tiled loop nest must be rectangular}} + omp.tile <-(%canonloop1, 
%canonloop2) sizes(%ts, %ts : i32, i32) + + llvm.return +} diff --git a/mlir/test/Dialect/Transform/test-promote-tensors.mlir b/mlir/test/Dialect/Transform/test-promote-tensors.mlir new file mode 100644 index 0000000..bc9a05a --- /dev/null +++ b/mlir/test/Dialect/Transform/test-promote-tensors.mlir @@ -0,0 +1,104 @@ +// RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s + +// CHECK-LABEL: @promote_in0 +// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x42xf32>, %{{.*}}, %{{.*}}) +// CHECK: %[[C0:.+]] = arith.constant 0 +// CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] +// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor(%[[DIM]]) {memory_space = 1 : i64} +// CHECK: %[[MAT:.+]] = bufferization.materialize_in_destination %[[ARG0]] in %[[ALLOC]] +// CHECK: linalg.matmul ins(%[[MAT]], %{{.*}} +func.func @promote_in0(%arg0: tensor<?x42xf32>, %arg1: tensor<42x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { + %0 = linalg.matmul ins(%arg0, %arg1: tensor<?x42xf32>, tensor<42x?xf32>) + outs(%arg2: tensor<?x?xf32>) -> tensor<?x?xf32> + return %0 : tensor<?x?xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + %mm = transform.structured.match ops{["linalg.matmul"]} in %root + : (!transform.any_op) -> !transform.any_op + %op0 = transform.get_operand %mm[0] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %op0 : !transform.any_value + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @promote_out +// CHECK-SAME: (%{{.*}}: tensor<?x42xf32>, %{{.*}}: tensor<?x42xf32>, %[[ARG2:.+]]: tensor<?x?xf32>) +func.func @promote_out(%arg0: tensor<?x42xf32>, %arg1: tensor<?x42xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG2]], %[[C0]] + // CHECK: %[[C1:.+]] = arith.constant 1 + // CHECK: %[[DIM1:.+]] = tensor.dim %[[ARG2]], %[[C1]] + // CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor(%[[DIM0]], %[[DIM1]]) {memory_space = 1 : i64} + // CHECK-NOT: materialize_in_destination + // CHECK: linalg.add {{.*}} outs(%[[ALLOC]] + %0 = linalg.add ins(%arg0, %arg1 : tensor<?x42xf32>, tensor<?x42xf32>) + outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> + return %0 : tensor<?x?xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + %la = transform.structured.match ops{["linalg.add"]} in %root + : (!transform.any_op) -> !transform.any_op + %init = transform.get_operand %la[2] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %init : !transform.any_value + + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @promote_in0_out_bufferize +// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x42xf32>, %{{.*}}: tensor<42x?xf32>, %[[ARG2:.+]]: tensor<?x?xf32>) +func.func @promote_in0_out_bufferize(%arg0: tensor<?x42xf32>, %arg1: tensor<42x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { + // CHECK: %[[IN1:.+]] = bufferization.to_buffer %arg1 : tensor<42x?xf32> to memref<42x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[IN0:.+]] = bufferization.to_buffer %arg0 : tensor<?x42xf32> to memref<?x42xf32, strided<[?, ?], offset: ?>> + // CHECK: %{{.+}} = bufferization.to_buffer %arg0 : tensor<?x42xf32> to memref<?x42xf32, strided<[?, ?], offset: ?>> + // CHECK: %{{.+}} = bufferization.to_buffer %arg2 : tensor<?x?xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %{{.+}} = 
bufferization.to_buffer %arg2 : tensor<?x?xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[C0:.+]] = arith.constant 0 : index + // CHECK: %{{.+}} = memref.dim %{{.+}}, %[[C0]] : memref<?x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[C1:.+]] = arith.constant 1 : index + // CHECK: %{{.+}} = memref.dim %{{.+}}, %[[C1]] : memref<?x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[ALLOC_OUT:.+]] = memref.alloc(%{{.+}}, %{{.+}}) {alignment = 64 : i64} : memref<?x?xf32, 1> + // CHECK: %{{.+}} = arith.constant 0 : index + // CHECK: %{{.+}} = memref.dim %{{.+}}, %{{.+}} : memref<?x42xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[ALLOC_IN:.+]] = memref.alloc(%{{.+}}) {alignment = 64 : i64} : memref<?x42xf32, 1> + // CHECK: memref.copy %[[IN0]], %[[ALLOC_IN]] : memref<?x42xf32, strided<[?, ?], offset: ?>> to memref<?x42xf32, 1> + // CHECK: linalg.add ins(%[[ALLOC_IN]], %[[IN1]] : memref<?x42xf32, 1>, memref<42x?xf32, strided<[?, ?], offset: ?>>) outs(%[[ALLOC_OUT]] : memref<?x?xf32, 1>) + %0 = linalg.add ins(%arg0, %arg1: tensor<?x42xf32>, tensor<42x?xf32>) + outs(%arg2: tensor<?x?xf32>) -> tensor<?x?xf32> + return %0 : tensor<?x?xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + %la = transform.structured.match ops{["linalg.add"]} in %root + : (!transform.any_op) -> !transform.any_op + %op0 = transform.get_operand %la[0] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %op0 : !transform.any_value + + %init = transform.get_operand %la[2] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %init : !transform.any_value + + %func = transform.structured.match ops{["func.func"]} in %root + : (!transform.any_op) -> !transform.any_op + + %bufferized = transform.bufferization.one_shot_bufferize %func + : (!transform.any_op) -> !transform.any_op + + transform.yield + } +} + + + diff --git a/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir b/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir index 2e5f433..efc3890 100644 --- a/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir +++ b/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir @@ -19,3 +19,88 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + // expected-error@below {{'selected_region' attribute specifies region at index 2 while op has only 2 regions}} + transform.tune.alternatives<"bifurcation"> selected_region = 2 { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %singleton_of_c0 = transform.param.constant [0] -> !transform.any_param + // expected-error@below {{param should hold exactly one integer attribute, got: [0]}} + transform.tune.alternatives<"bifurcation"> selected_region = %singleton_of_c0 : !transform.any_param { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %c0 = 
transform.param.constant 0 -> !transform.any_param + %c1 = transform.param.constant 1 -> !transform.any_param + %c0_and_c1 = transform.merge_handles %c0, %c1 : !transform.any_param + // expected-error@below {{param should hold exactly one integer attribute}} + transform.tune.alternatives<"bifurcation"> selected_region = %c0_and_c1 : !transform.any_param { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %c2 = transform.param.constant 2 -> !transform.any_param + // expected-error@below {{'selected_region' attribute/param specifies region at index 2 while op has only 2 regions}} + transform.tune.alternatives<"bifurcation"> selected_region = %c2 : !transform.any_param { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + // expected-error@below {{non-deterministic choice "bifurcation" is only resolved through providing a `selected_region` attr/param}} + transform.tune.alternatives<"bifurcation"> { + transform.yield + }, { + transform.yield + } + transform.yield + } +} diff --git a/mlir/test/Dialect/Transform/test-tune-extension.mlir b/mlir/test/Dialect/Transform/test-tune-extension.mlir index 0a253c6..5da48a2 100644 --- a/mlir/test/Dialect/Transform/test-tune-extension.mlir +++ b/mlir/test/Dialect/Transform/test-tune-extension.mlir @@ -59,3 +59,129 @@ module attributes {transform.with_named_sequence} { transform.yield } } + + +// ----- + +// CHECK-LABEL: schedule_with_two_independent_choices_already_made +func.func @schedule_with_two_independent_choices_already_made( + %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) + -> tensor<128x128xf32> { +// CHECK-NOT: scf.forall +// CHECK: scf.for +// CHECK-NOT: scf.for +// CHECK: scf.forall +// CHECK-NOT: scf.for +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: linalg.matmul +// CHECK: scf.forall.in_parallel +// CHECK: tensor.parallel_insert_slice +// CHECK: tensor.insert_slice +// CHECK: scf.yield + %0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>) + outs(%arg2: tensor<128x128xf32>) -> tensor<128x128xf32> + return %0 : tensor<128x128xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op + + %tiled_matmul = transform.tune.alternatives<"outer_par_or_seq_tiling"> selected_region = 0 -> !transform.any_op + { // First alternative/region, with index = 0 + %contained_matmul, %loop = transform.structured.tile_using_for %matmul tile_sizes [8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + }, { // Second alternative/region, with index = 1 + %contained_matmul, %loop = transform.structured.tile_using_forall %matmul tile_sizes [8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + } + + transform.tune.alternatives<"inner_par_or_seq_tiling"> selected_region = 1 -> 
!transform.any_op { + %contained_matmul, %loop = transform.structured.tile_using_for %tiled_matmul tile_sizes [0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + }, { + %contained_matmul, %loop = transform.structured.tile_using_forall %tiled_matmul tile_sizes [0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + } + + transform.yield + } +} + +// ----- + +// CHECK-LABEL: subschedule_with_choice_resolved_in_main_schedule +func.func @subschedule_with_choice_resolved_in_main_schedule( + %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) + -> tensor<128x128xf32> { +// CHECK-NOT: scf.for +// CHECK: scf.forall +// CHECK-NOT: scf.forall +// CHECK: scf.for +// CHECK-NOT: scf.forall +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: linalg.matmul +// CHECK: tensor.insert_slice +// CHECK: scf.yield +// CHECK: scf.forall.in_parallel +// CHECK: tensor.parallel_insert_slice + %0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>) + outs(%arg2: tensor<128x128xf32>) -> tensor<128x128xf32> + return %0 : tensor<128x128xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @subschedule_with_embedded_choice(%matmul: !transform.any_op {transform.readonly}, + %par_or_seq: !transform.param<i64> {transform.readonly}, + %tile_size: !transform.param<i64> {transform.readonly}) -> !transform.any_op { + %tiled_matmul = transform.tune.alternatives<"par_or_seq_tiling"> selected_region = %par_or_seq : !transform.param<i64> -> !transform.any_op { + %contained_matmul, %loop = transform.structured.tile_using_for %matmul tile_sizes [%tile_size] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + }, { + %contained_matmul, %loop = transform.structured.tile_using_forall %matmul tile_sizes [%tile_size] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + } + transform.yield %tiled_matmul : !transform.any_op + } + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %outer_par = transform.param.constant 1 -> !transform.param<i64> + %outer_tile_size = transform.param.constant 32 -> !transform.param<i64> + %inner_seq = transform.tune.knob<"inner_par_or_seq"> = 0 from options = [0, 1] -> !transform.param<i64> + %inner_tile_size = transform.param.constant 8 -> !transform.param<i64> + %tiled_matmul = transform.include @subschedule_with_embedded_choice failures(propagate) (%matmul, %outer_par, %outer_tile_size) : (!transform.any_op, !transform.param<i64>, !transform.param<i64>) -> !transform.any_op + %tiled_tiled_matmul = transform.include @subschedule_with_embedded_choice failures(propagate) (%tiled_matmul, %inner_seq, %inner_tile_size) : (!transform.any_op, !transform.param<i64>, !transform.param<i64>) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: eeny_meeny_miny_moe +func.func private @eeny_meeny_miny_moe() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %matmul = 
transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op + + %tiled_matmul = transform.tune.alternatives<"4way"> selected_region = 3 -> !transform.any_param + { // First alternative/region, with index = 0 + %out = transform.param.constant "eeny" -> !transform.any_param + transform.yield %out : !transform.any_param + }, { // Second alternative/region, with index = 1 + %out = transform.param.constant "meeny" -> !transform.any_param + transform.yield %out : !transform.any_param + }, { // Third alternative/region, with index = 2 + %out = transform.param.constant "miny" -> !transform.any_param + transform.yield %out : !transform.any_param + }, { // Fourth alternative/region, with index = 3 + %out = transform.param.constant "moe" -> !transform.any_param + transform.yield %out : !transform.any_param + } + transform.yield + } +}
\ No newline at end of file diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir new file mode 100644 index 0000000..40b66d1 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -0,0 +1,575 @@ +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -test-xegpu-sg-distribute -allow-unregistered-dialect \ +// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s + +// CHECK-LABEL: gpu.func @store_nd_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] +// CHECK-SAME: -> (vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) { +// CHECK: gpu.yield %{{.*}} : vector<16xf32>, +// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32, +// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.store_nd %[[W]]#0, %[[T1]][%[[W]]#2] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +gpu.module @xevm_module{ + gpu.func @store_nd_1d(%laneid: index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + %cst = "some_op"() : () -> vector<16xf32> + xegpu.store_nd %cst, %0 [%c0] {layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @store_nd_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] +// CHECK-SAME: -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { +// CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[W]]#0 : vector<16x1xf16> to vector<16xf16> +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.store_nd %[[CAST]], %[[T1]][%[[W]]#2, %[[W]]#3] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +gpu.module @xevm_module{ + gpu.func @store_nd_2d(%laneid : index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %cst = "some_op"() : () -> vector<16x16xf16> + xegpu.store_nd %cst, %0 [%c0, %c0] {layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + } + gpu.return + } +} + + + +// ----- +// CHECK-LABEL: gpu.func @load_nd_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<1xf32>, +// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], 
lane_data = [1]>>, index) { +// CHECK: gpu.yield %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, +// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32, +// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.load_nd %[[T1]][%[[W]]#2] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> +gpu.module @xevm_module{ + gpu.func @load_nd_1d(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : + !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32> + gpu.yield %1 : vector<16xf32> + } + "some_user_op"(%r) : (vector<1xf32>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_nd_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { +// CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: vector.shape_cast %[[T2]] : vector<16xf16> to vector<16x1xf16> +gpu.module @xevm_module{ + gpu.func @load_nd_2d(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> + gpu.yield %1 : vector<16x16xf16> + } + "some_user_op"(%r) : (vector<16x1xf16>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_nd_array_length +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<2x16x1xf16>, +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { +// CHECK: gpu.yield %{{.*}} : vector<2x16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr< +// CHECK-SAME: array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], +// CHECK-SAME: lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16, 
#xegpu.block_tdesc_attr<array_length = 2 : i64>> +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16> +// CHECK-NEXT: vector.shape_cast %[[T2]] : vector<32xf16> to vector<2x16x1xf16> +gpu.module @xevm_module{ + gpu.func @load_nd_array_length(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16x1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, + #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, + #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16> + gpu.yield %1 : vector<2x16x16xf16> + } + "some_user_op"(%r) : (vector<2x16x1xf16>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @dpas +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> +// CHECK-SAME: (vector<8x1xf32>, vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) { +// CHECK: gpu.yield %{{.*}} : vector<8x16xf32>, vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> +// CHECK-NEXT: } +// CHECK-DAG: %[[T1:.*]] = vector.shape_cast %[[W]]#1 : vector<8x1xf16> to vector<8xf16> +// CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[W]]#2 : vector<16x1xf16> to vector<16xf16> +// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[W]]#3 : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T2]], %[[T3]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32> +// CHECK-NEXT: vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> +gpu.module @xevm_module{ + gpu.func @dpas(%laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { + %0 = "some_op"() : () -> vector<8x16xf16> + %1 = "some_op"() : () -> vector<16x16xf16> + %2 = "some_op"() : () -> vector<8x16xf32> + %3 = xegpu.dpas %0, %1, %2 + { + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, + layout_operand_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + gpu.yield %3 : vector<8x16xf32> + } + "some_user_op"(%r) : (vector<8x1xf32>) -> () + gpu.return + } +} + + +// ----- +// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG1]])[16] -> (!xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64) { +// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64 +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[W]]#1, shape : [64, 128], strides : [128, 1] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK-NEXT: builtin.unrealized_conversion_cast %[[T1]] : !xegpu.tensor_desc<16x16xf16> to !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> {resolve_simt_type_mismatch} +gpu.module 
@xevm_module{ + gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { + %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 -> + !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + gpu.yield %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + } + "some_user_op"(%r) + : (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @prefetch_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) { +// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK-SAME: , index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1, %[[W]]#2] +// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> +gpu.module @xevm_module{ + gpu.func @prefetch_2d(%laneid: index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () + -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + xegpu.prefetch_nd %0[%c0, %c0] + <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @prefetch_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) { +// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16xf16, +// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1] <{l1_hint = #xegpu.cache_hint<cached>, +// CHECK-SAME: l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> +gpu.module @xevm_module{ + gpu.func @prefetch_1d(%laneid: index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () + -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + xegpu.prefetch_nd %0[%c0] + <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> + : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { +// CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) { +// CHECK: gpu.yield %{{.*}} +// CHECK: } +// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} : 
!xegpu.tensor_desc<16xf16> -> vector<1xf16> +// CHECK: gpu.barrier +gpu.module @xevm_module{ + gpu.func @gpu_barrier(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> + %1 = xegpu.load_nd %0[%c0] + {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16> + gpu.barrier + gpu.yield %1 : vector<16xf16> + } + "some_user_op"(%r) : (vector<1xf16>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction +// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32> +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: -> (vector<2xf32>, vector<16x2xf32>, vector<2xf32>) { +// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x32xf32> +// CHECK: gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<16x32xf32>, vector<32xf32> +// CHECK-NEXT: } +// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 +// CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T3:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32> +// CHECK: %[[T4:.*]] = vector.reduction <add>, %[[T2]], %[[T3]] : vector<16xf32> into f32 +// CHECK: %[[T5:.*]] = vector.extract_strided_slice %[[W]]#1 +// CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T7:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32> +// CHECK: %[[T8:.*]] = vector.reduction <add>, %[[T6]], %[[T7]] : vector<16xf32> into f32 +// CHECK: %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<2xf32> +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : () -> (vector<16x32xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} + dense<0.0> : vector<32xf32> + %1 = vector.multi_reduction <add>, %src, %acc + { + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, + layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]> + } [0] + : vector<16x32xf32> to vector<32xf32> + gpu.yield %1 : vector<32xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction +// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) { +// CHECK-NEXT: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32> +// CHECK-NEXT: %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32> +// CHECK-NEXT: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %cst : vector<16xf32> into f32 +// CHECK-NEXT: %[[T4:.*]] = 
vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> +// CHECK-NEXT: %[[T5:.*]] = vector.reduction <add>, %[[T4]], %cst : vector<16xf32> into f32 +// CHECK-NEXT: %[[T6:.*]] = vector.from_elements %[[T3]], %[[T5]] : vector<2xf32> +// CHECK-NEXT: gpu.yield %[[T6]] : vector<2xf32> +// CHECK-NEXT: } +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : () -> (vector<2x16xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} + dense<0.0> : vector<2xf32> + %1 = vector.multi_reduction <add>, %src, %acc + { + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>, + layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]> + } + [1] : vector<2x16xf32> to vector<2xf32> + gpu.yield %1 : vector<2xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction +// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32> +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) { +// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<32x16xf32> +// CHECK: gpu.yield %9, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<32x16xf32>, vector<32xf32> +// CHECK: } +// CHECK: %[[T1:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> +// CHECK: %[[T2:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32> +// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T1]], %[[T2]] : vector<16xf32> into f32 +// CHECK: %[[T4:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> +// CHECK: %[[T5:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32> +// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T4]], %[[T5]] : vector<16xf32> into f32 +// CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32> +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} + : () -> (vector<32x16xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>} + dense<0.0> : vector<32xf32> + %1 = vector.multi_reduction <add>, %src, %acc + { + layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>, + layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]> + } + [1] : vector<32x16xf32> to vector<32xf32> + gpu.yield %1 : vector<32xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction +// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) { +// CHECK: %[[SRC:.*]] = 
"some_def"() {{.*}} : () -> vector<16x2xf32> +// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[SRC]] +// CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] {{.*}} : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32 +// CHECK: %[[T4:.*]] = vector.extract_strided_slice %[[SRC]] +// CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] {{.*}} : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32 +// CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32> +// CHECK: gpu.yield %[[T7]] : vector<2xf32> +// CHECK: } +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} + : () -> (vector<16x2xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>} + dense<0.0> : vector<2xf32> + %1 = vector.multi_reduction <add>, %src, %acc + { + layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>, + layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]> + } + [0] : vector<16x2xf32> to vector<2xf32> + gpu.yield %1 : vector<2xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) { +// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex> +// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1> +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: -> (vector<1x8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] : +// CHECK-SAME: vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}> +// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}> +// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +gpu.module @xevm_module{ + gpu.func @scatter_ops_chunksize(%laneid: index, %src: memref<256xf16>) { + gpu.warp_execute_on_lane_0(%laneid)[16] { + %1 = arith.constant + {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + dense<1>: vector<16xi1> + %offset = arith.constant + {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> + { + layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> + } + : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> + xegpu.store %3, 
%src[%offset], %1 <{chunk_size=8}> + { + layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>, + layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]> + } + : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { +// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex> +// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1> +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: -> (vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] +// CHECK-SAME: : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 +// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> +// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 +// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +gpu.module @xevm_module{ + gpu.func @scatter_ops(%src: memref<256xf16>, %laneid: index) { + gpu.warp_execute_on_lane_0(%laneid)[16] { + %1 = arith.constant + {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + dense<1> : vector<16xi1> + %offset = arith.constant + {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} + dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 + { + layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> + } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + xegpu.store %3, %src[%offset], %1 + { + layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]> + } + : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (index, memref<256x256xf16>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}} : index, memref<256x256xf16> +// CHECK-NEXT: } +// CHECK-NEXT: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[W]]#1 : memref<256x256xf16> -> index +// CHECK-NEXT: arith.index_cast %[[INTPTR]] : index to i64 +gpu.module @xevm_module{ + gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) { + %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index + gpu.yield %ptr : index + } + %ptr_i64 = arith.index_cast %r : index to i64 + "some_user_op"(%ptr_i64) : (i64) -> () + gpu.return + } +} + + +// ----- +// CHECK-LABEL: gpu.func @vector_transpose( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2x1xf32>, vector<1x2xf32>) { +// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<16x2xf32> +// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<2x16xf32>, vector<16x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = vector.transpose %[[W]]#1, [1, 0] : vector<1x2xf32> to vector<2x1xf32> +gpu.module 
@xevm_module{ + gpu.func @vector_transpose(%arg0: memref<2x16xf32>, %laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) { + %cst = "some_op"() + {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} + : () -> (vector<16x2xf32>) + %transpose = vector.transpose %cst, [1, 0] + { + layout_operand_0 = #xegpu.layout<lane_layout = [16 , 1], lane_data = [1, 1]>, + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + : vector<16x2xf32> to vector<2x16xf32> + gpu.yield %transpose : vector<2x16xf32> + } + "some_user_op"(%r) : (vector<2x1xf32>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @vector_bitcast( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<4x1xi16>, vector<4x2xi8>) { +// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<4x32xi8> +// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<4x16xi16>, vector<4x32xi8> +// CHECK: } +// CHECK: vector.bitcast %[[W]]#1 : vector<4x2xi8> to vector<4x1xi16> +gpu.module @xevm_module{ + gpu.func @vector_bitcast(%arg0: memref<4x16xi16>, %laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) { + %cst = "some_op"() + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} + : () -> (vector<4x32xi8>) + %bitcast = vector.bitcast %cst + { + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + : vector<4x32xi8> to vector<4x16xi16> + gpu.yield %bitcast : vector<4x16xi16> + } + "some_user_op"(%r) : (vector<4x1xi16>) -> () + gpu.return + } +} diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 59fac26..0e1365a 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -1,198 +1,76 @@ // RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -xegpu-subgroup-distribute \ // RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s -// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ -// RUN: -xegpu-subgroup-distribute="enable-sg-reductions=false" -allow-unregistered-dialect \ -// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION - -// CHECK-LABEL: gpu.func @store_nd_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) { -// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -// CHECK: gpu.return -gpu.module @xevm_module{ - gpu.func @store_nd_1d(%arg0: memref<16xf32>) { - %c0 = arith.constant 0 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - xegpu.store_nd %cst, %0 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @store_nd_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc 
%[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) { - %c0 = arith.constant 0 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf16> - %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %cst, %0 [%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - - - -// ----- -// CHECK-LABEL: gpu.func @load_nd_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -gpu.module @xevm_module{ - gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32> - %2 = xegpu.create_nd_tdesc %arg1 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - xegpu.store_nd %1, %2 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @load_nd_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %1, %2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @load_nd_array_length -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, 
%[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16> -// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16> -// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> -// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16> - %2 = vector.extract %1[%c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16> from vector<2x16x16xf16> - %3 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %2, %3[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @load_dpas_store -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> -gpu.module @xevm_module{ - gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> - %3 = 
xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %4, %5[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - - -// ----- // CHECK-LABEL: gpu.func @load_dpas_postop_store -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> -// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> -// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> -// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> +// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> +// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> +// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> +// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> - 
%2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> - %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = math.exp %4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32> - %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> + -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %1 = xegpu.load_nd %0[%c0, %c0] + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : + !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> + + %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> + -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + %3 = xegpu.load_nd %2[%c0, %c0] + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + -> vector<16x16xf16> + + %4 = xegpu.dpas %1, %3 + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// ----- -// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, -// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, -// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %1, 
%2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %5 = math.exp %4 + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : vector<8x16xf32> + + %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> + !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>, + !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return } } // ----- -// TODO: gemm does not use update_nd_offset because of an issue in scf-for distribution. // CHECK-LABEL: gpu.func @gemm -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { -// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x -// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y -// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index -// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> -// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> -// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { -// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> -// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> -// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> -// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> -// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> -// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> -// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> -// CHECK-NEXT: } -// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { +// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x +// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y +// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index +// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> +// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> +// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) +// CHECK-SAME: -> (vector<8x1xf32>) { +// 
CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> +// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> +// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> +// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> +// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] +// CHECK-SAME: : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> +// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> +// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ %c0 = arith.constant 0 : index @@ -203,213 +81,56 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar %block_id_y = gpu.block_id y %0 = arith.muli %block_id_x, %c8 : index %1 = arith.muli %block_id_y, %c16 : index - %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %3 = xegpu.load_nd %2[%0, %1] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32> - %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { - %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> - %7 = xegpu.load_nd %5[%0, %arg3] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16> - %8 = xegpu.load_nd %6[%arg3, %1] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16> - %9 = xegpu.dpas %7, %8, %arg4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> - scf.yield %9 : vector<8x16xf32> - } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return -} -} + %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> + !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %3 = xegpu.load_nd %2[%0, %1] + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32> -// ----- -// 
CHECK-LABEL: gpu.func @prefetch_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.prefetch_nd %0[%c0, %c0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @prefetch_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -gpu.module @xevm_module{ - gpu.func @prefetch_1d(%arg0: memref<256xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0: memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - xegpu.prefetch_nd %0[%c0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - gpu.return - } -} + %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { -// ----- -// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> -// CHECK-NEXT: gpu.barrier -// CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<1xf16>, !xegpu.tensor_desc<16xf16> -gpu.module @xevm_module{ - gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - %1 = xegpu.load_nd %0[%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16> - gpu.barrier - %2 = xegpu.create_nd_tdesc %arg1 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - xegpu.store_nd %1, %2[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - gpu.return - } -} + %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> + -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> + -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -// ----- -// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction -// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> -// CHECK-SAME: (!xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout 
= [1, 16], lane_data = [1, 1]>>, vector<16x2xf32>) { -// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x32xf32> -// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, vector<16x32xf32> -// CHECK-NEXT: } -// CHECK: %[[COL0:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> -// CHECK-NEXT: %[[RED0:.*]] = vector.reduction <add>, %[[CAST0]], %{{.*}} : vector<16xf32> into f32 -// CHECK: %[[COL1:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> -// CHECK-NEXT: %[[RED1:.*]] = vector.reduction <add>, %[[CAST1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> (vector<16x32xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} dense<0.0> : vector<32xf32> - %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] - : vector<16x32xf32> to vector<32xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - : vector<32xf32> to vector<1x32xf32> - xegpu.store_nd %3, %0[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return -} -} + %7 = xegpu.load_nd %5[%0, %arg3] + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16> + %8 = xegpu.load_nd %6[%arg3, %1] + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} + : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16> -// ----- -// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction -// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<2x16xf32, -// CHECK-REDUCTION-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, f32, f32) { -// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[ROW0:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction <add>, %[[ROW0]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: %[[ROW1:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction <add>, %[[ROW1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: gpu.yield 
%4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, f32, f32 -// CHECK-REDUCTION-NEXT: } -// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> (vector<2x16xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} dense<0.0> : vector<2xf32> - %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} - [1] : vector<2x16xf32> to vector<2xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - : vector<2xf32> to vector<2x1xf32> - %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<2x1xf32> to vector<2x16xf32> - xegpu.store_nd %4, %0[%c0, %c0] : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return -} -} + %9 = xegpu.dpas %7, %8, %arg4 + {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> -// ----- -// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction -// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%0)[16] -> -// CHECK-SAME: (!xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, vector<2x16xf32>) { -// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> vector<32x16xf32> -// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, vector<32x16xf32> -// CHECK-NEXT: } -// CHECK: %[[ROW0:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> -// CHECK-NEXT: %[[R0:.*]] = vector.reduction <add>, %[[ROW0]], %{{.*}} : vector<16xf32> into f32 -// CHECK: %[[ROW1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> -// CHECK-NEXT: %[[R1:.*]] = vector.reduction <add>, %[[ROW1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> - %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> (vector<32x16xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>} dense<0.0> : vector<32xf32> - %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>} [1] - : vector<32x16xf32> to vector<32xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} - : vector<32xf32> to vector<32x1xf32> - xegpu.store_nd %3, %0[%c0, %c0] : vector<32x1xf32>, 
!xegpu.tensor_desc<32x1xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> - gpu.return -} -} + scf.yield %9 : vector<8x16xf32> + } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} -// ----- -// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction -// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<16x2xf32, -// CHECK-REDUCTION-SAME: #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, f32, f32) { -// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> vector<16x2xf32> -// CHECK-REDUCTION-NEXT: %[[COL0:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-REDUCTION-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> -// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction <add>, %[[CAST0]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: %[[COL1:.*]] = vector.extract_strided_slice %5 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-REDUCTION-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> -// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction <add>, %[[CAST1]], %cst : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<16x2xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, f32, f32 -// CHECK-REDUCTION-NEXT: } -// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<16x2xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> - %src = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> (vector<16x2xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>} dense<0.0> : vector<2xf32> - %1 = vector.multi_reduction <add>, %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>} - [0] : vector<16x2xf32> to vector<2xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} - : vector<2xf32> to vector<1x2xf32> - %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : vector<1x2xf32> to vector<16x2xf32> - xegpu.store_nd %4, %0[%c0, %c0] : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> + xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, + !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return } } // ----- -// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) { -// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> -// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, 
vector<1xi1> -gpu.module @xevm_module{ - gpu.func @scatter_ops_chunksize(%src: memref<256xf16>) { - %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1> - %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> - %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { - layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> - } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> - xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}}, -// CHECK-SAME: %[[PREDICATE:.*]]: i1) { -// CHECK: %[[DEFAULT:.*]] = arith.constant dense<1.200000e+01> : vector<8xf16> -// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> -// CHECK: %[[PREDICATED_LOAD:.*]] = scf.if %[[PREDICATE]] -> (vector<8xf16>) { -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// CHECK-NEXT: scf.yield %[[LOADED]] : vector<8xf16> -// CHECK-NEXT: } else { -// CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16> -// CHECK-NEXT: } -// CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// CHECK-LABEL: gpu.func @scatter_ops_scf_yield +// CHECK: (%{{.*}}: memref<256xf16>, %[[PREDICATE:[a-zA-Z0-9]+]]: i1) { +// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.200000e+01> : vector<1x8xf16> +// CHECK-DAG: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> +// CHECK: %[[IF:.*]] = scf.if %[[PREDICATE]] -> (vector<1x8xf16>) { +// CHECK-NEXT: %[[LD:.*]] = xegpu.load %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// CHECK-NEXT: %[[LD_CAST:.*]] = vector.shape_cast %[[LD]] : vector<8xf16> to vector<1x8xf16> +// CHECK-NEXT: scf.yield %[[LD_CAST]] : vector<1x8xf16> +// CHECK-NEXT: } else { +// CHECK-NEXT: scf.yield %[[CST]] : vector<1x8xf16> +// CHECK-NEXT: } +// CHECK-NEXT: %[[IF_CAST:.*]] = vector.shape_cast %[[IF]] : vector<1x8xf16> to vector<8xf16> +// CHECK-NEXT: xegpu.store %[[IF_CAST]], %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> gpu.module @xevm_module{ gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) { %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1> @@ -432,13 +153,15 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) { -// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> -// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1 -// CHECK: scf.if %[[PREDICATE]] { -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, 
vector<1xindex>, vector<1xi1> -// CHECK-NEXT: } +// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> +// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1 +// CHECK: scf.if %[[PREDICATE]] { +// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// CHECK-NEXT: } gpu.module @xevm_module{ gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) { %pred = llvm.mlir.poison : i1 @@ -455,88 +178,13 @@ gpu.module @xevm_module{ } // ----- -// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { -// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1> -// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> -// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -gpu.module @xevm_module{ - gpu.func @scatter_ops(%src: memref<256xf16>) { - %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1> - %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> - %3 = xegpu.load %src[%offset], %1 { - layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> - } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> - xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( -// CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index -gpu.module @xevm_module{ - gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) { - %c0 = arith.constant 0 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf16> - %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index - %ptr_i64 = arith.index_cast %ptr : index to i64 - %tdesc = xegpu.create_nd_tdesc %ptr_i64, shape: [16], strides: [16] : i64 - -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - xegpu.store_nd %cst, %tdesc[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> - gpu.return - } -} - - -// ----- -// CHECK-LABEL: gpu.func @vector_transpose( -// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> -// CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[DEST]][{{.*}}] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> -gpu.module @xevm_module{ - gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { - %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} dense<1.000000e+00> - : vector<16x2xf32> - %c0 = arith.constant 0 : index - %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - : 
vector<16x2xf32> to vector<2x16xf32> - %0 = xegpu.create_nd_tdesc %arg0 : memref<2x16xf32> - -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %transpose, %0[%c0, %c0] : vector<2x16xf32>, - !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @vector_bitcast( -// CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16> -// CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> -// CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> -// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]][{{.*}}] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> -gpu.module @xevm_module{ - gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { - %cst = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} - : () -> (vector<4x32xi8>) - %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - : vector<4x32xi8> to vector<4x16xi16> - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xi16> - -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - xegpu.store_nd %bitcast, %0[%c0, %c0] : vector<4x16xi16>, - !xegpu.tensor_desc<4x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return - } -} - -// ----- // CHECK-LABEL: gpu.func @mma_transpose_b( // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> // CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> +// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array<i64: 1, 0>}> +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> // CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> // CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> // CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 03c6386..38392fd 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -282,15 +282,20 @@ gpu.module @test_distribution { // CHECK-LABEL: @store_scatter // CHECK-SAME: %[[ARG0:.*]]: memref<256xf16> gpu.func @store_scatter(%dest : memref<256xf16>) { - // CHECK: %[[VAL:.*]] = arith.constant dense<2.550000e+01> : vector<8xf16> - // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex> - // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1> + // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16> + // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex> + // CHECK: 
%[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1> // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> + // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>, + // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>} // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> - %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<25.5> : vector<256xf16> - %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<0> : vector<256xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<1> : vector<256xi1> - xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [32], sg_data = [8]>, l1_hint = #xegpu.cache_hint<cached>} + %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<25.5> : vector<256xf16> + %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<0> : vector<256xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<1> : vector<256xi1> + xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>, + layout_operand_2 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>, + layout_operand_3 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>, + l1_hint = #xegpu.cache_hint<cached>} : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1> gpu.return } diff --git a/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir b/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir new file mode 100644 index 0000000..0d559b6 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir @@ -0,0 +1,94 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope + + +llvm.func @tile_trivial_loop(%baseptr: !llvm.ptr, %tc: i32, %ts: i32) -> () { + %literal_cli = omp.new_cli + omp.canonical_loop(%literal_cli) %iv : i32 in range(%tc) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.tile <- (%literal_cli) sizes(%ts : i32) + llvm.return +} + + +// CHECK-LABEL: define void @tile_trivial_loop( +// CHECK-SAME: ptr %[[TMP0:.+]], i32 %[[TMP1:.+]], i32 %[[TMP2:.+]]) { +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]: +// CHECK-NEXT: %[[TMP4:.+]] = udiv i32 %[[TMP1:.+]], %[[TMP2:.+]] +// CHECK-NEXT: %[[TMP5:.+]] = urem i32 %[[TMP1:.+]], %[[TMP2:.+]] +// CHECK-NEXT: %[[TMP6:.+]] = icmp ne i32 %[[TMP5:.+]], 0 +// CHECK-NEXT: %[[TMP7:.+]] = zext i1 %[[TMP6:.+]] to i32 +// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT:.+]] = add nuw i32 %[[TMP4:.+]], %[[TMP7:.+]] +// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_HEADER]]: +// CHECK-NEXT: %[[OMP_FLOOR0_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR0_PREHEADER:.+]] ], [ %[[OMP_FLOOR0_NEXT:.+]], %[[OMP_FLOOR0_INC:.+]] ] +// 
CHECK-NEXT: br label %[[OMP_FLOOR0_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_COND]]: +// CHECK-NEXT: %[[OMP_FLOOR0_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR0_IV:.+]], %[[OMP_FLOOR0_TRIPCOUNT:.+]] +// CHECK-NEXT: br i1 %[[OMP_FLOOR0_CMP:.+]], label %[[OMP_FLOOR0_BODY:.+]], label %[[OMP_FLOOR0_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_BODY]]: +// CHECK-NEXT: %[[TMP8:.+]] = icmp eq i32 %[[OMP_FLOOR0_IV:.+]], %[[TMP4:.+]] +// CHECK-NEXT: %[[TMP9:.+]] = select i1 %[[TMP8:.+]], i32 %[[TMP5:.+]], i32 %[[TMP2:.+]] +// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_HEADER]]: +// CHECK-NEXT: %[[OMP_TILE0_IV:.+]] = phi i32 [ 0, %[[OMP_TILE0_PREHEADER:.+]] ], [ %[[OMP_TILE0_NEXT:.+]], %[[OMP_TILE0_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_TILE0_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_COND]]: +// CHECK-NEXT: %[[OMP_TILE0_CMP:.+]] = icmp ult i32 %[[OMP_TILE0_IV:.+]], %[[TMP9:.+]] +// CHECK-NEXT: br i1 %[[OMP_TILE0_CMP:.+]], label %[[OMP_TILE0_BODY:.+]], label %[[OMP_TILE0_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_BODY]]: +// CHECK-NEXT: %[[TMP10:.+]] = mul nuw i32 %[[TMP2:.+]], %[[OMP_FLOOR0_IV:.+]] +// CHECK-NEXT: %[[TMP11:.+]] = add nuw i32 %[[TMP10:.+]], %[[OMP_TILE0_IV:.+]] +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_BODY:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION]]: +// CHECK-NEXT: %[[TMP12:.+]] = getelementptr inbounds float, ptr %[[TMP0:.+]], i32 %[[TMP11:.+]] +// CHECK-NEXT: store float 4.200000e+01, ptr %[[TMP12:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT]]: +// CHECK-NEXT: br label %[[OMP_TILE0_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_INC]]: +// CHECK-NEXT: %[[OMP_TILE0_NEXT:.+]] = add nuw i32 %[[OMP_TILE0_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_EXIT]]: +// CHECK-NEXT: br label %[[OMP_TILE0_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_AFTER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_INC]]: +// CHECK-NEXT: %[[OMP_FLOOR0_NEXT:.+]] = add nuw i32 %[[OMP_FLOOR0_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_EXIT]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]: +// CHECK-NEXT: ret void +// CHECK-NEXT: } diff --git a/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir b/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir new file mode 100644 index 0000000..22c2973 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir @@ -0,0 +1,184 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope + + +llvm.func @tile_2d_loop(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32, %ts1: i32, %ts2: i32) -> () { + %literal_outer = omp.new_cli + %literal_inner = omp.new_cli + omp.canonical_loop(%literal_outer) %iv1 : i32 in range(%tc1) { + omp.canonical_loop(%literal_inner) %iv2 : i32 in range(%tc2) { + %idx = llvm.add %iv1, %iv2 : i32 + %ptr = llvm.getelementptr inbounds %baseptr[%idx] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = 
llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.terminator + } + omp.tile <- (%literal_outer, %literal_inner) sizes(%ts1, %ts2 : i32,i32) + llvm.return +} + + +// CHECK-LABEL: define void @tile_2d_loop( +// CHECK-SAME: ptr %[[TMP0:.+]], i32 %[[TMP1:.+]], i32 %[[TMP2:.+]], i32 %[[TMP3:.+]], i32 %[[TMP4:.+]]) { +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]: +// CHECK-NEXT: %[[TMP6:.+]] = udiv i32 %[[TMP1:.+]], %[[TMP3:.+]] +// CHECK-NEXT: %[[TMP7:.+]] = urem i32 %[[TMP1:.+]], %[[TMP3:.+]] +// CHECK-NEXT: %[[TMP8:.+]] = icmp ne i32 %[[TMP7:.+]], 0 +// CHECK-NEXT: %[[TMP9:.+]] = zext i1 %[[TMP8:.+]] to i32 +// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT:.+]] = add nuw i32 %[[TMP6:.+]], %[[TMP9:.+]] +// CHECK-NEXT: %[[TMP10:.+]] = udiv i32 %[[TMP2:.+]], %[[TMP4:.+]] +// CHECK-NEXT: %[[TMP11:.+]] = urem i32 %[[TMP2:.+]], %[[TMP4:.+]] +// CHECK-NEXT: %[[TMP12:.+]] = icmp ne i32 %[[TMP11:.+]], 0 +// CHECK-NEXT: %[[TMP13:.+]] = zext i1 %[[TMP12:.+]] to i32 +// CHECK-NEXT: %[[OMP_FLOOR1_TRIPCOUNT:.+]] = add nuw i32 %[[TMP10:.+]], %[[TMP13:.+]] +// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_HEADER:.+]]: +// CHECK-NEXT: %[[OMP_OMP_LOOP_IV:.+]] = phi i32 [ %[[OMP_OMP_LOOP_NEXT:.+]], %[[OMP_OMP_LOOP_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_COND]]: +// CHECK-NEXT: %[[OMP_OMP_LOOP_CMP:.+]] = icmp ult i32 %[[TMP19:.+]], %[[TMP1:.+]] +// CHECK-NEXT: br i1 %[[OMP_OMP_LOOP_CMP:.+]], label %[[OMP_OMP_LOOP_BODY:.+]], label %[[OMP_OMP_LOOP_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER1:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER1]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_BODY4:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_HEADER]]: +// CHECK-NEXT: %[[OMP_FLOOR0_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR0_PREHEADER:.+]] ], [ %[[OMP_FLOOR0_NEXT:.+]], %[[OMP_FLOOR0_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_FLOOR0_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_COND]]: +// CHECK-NEXT: %[[OMP_FLOOR0_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR0_IV:.+]], %[[OMP_FLOOR0_TRIPCOUNT:.+]] +// CHECK-NEXT: br i1 %[[OMP_FLOOR0_CMP:.+]], label %[[OMP_FLOOR0_BODY:.+]], label %[[OMP_FLOOR0_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_BODY]]: +// CHECK-NEXT: br label %[[OMP_FLOOR1_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR1_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_HEADER]]: +// CHECK-NEXT: %[[OMP_FLOOR1_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR1_PREHEADER:.+]] ], [ %[[OMP_FLOOR1_NEXT:.+]], %[[OMP_FLOOR1_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_FLOOR1_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_COND]]: +// CHECK-NEXT: %[[OMP_FLOOR1_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR1_IV:.+]], %[[OMP_FLOOR1_TRIPCOUNT:.+]] +// CHECK-NEXT: br i1 %[[OMP_FLOOR1_CMP:.+]], label %[[OMP_FLOOR1_BODY:.+]], label %[[OMP_FLOOR1_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_BODY]]: +// CHECK-NEXT: %[[TMP14:.+]] = icmp eq i32 %[[OMP_FLOOR0_IV:.+]], %[[TMP6:.+]] +// CHECK-NEXT: %[[TMP15:.+]] = select i1 
%[[TMP14:.+]], i32 %[[TMP7:.+]], i32 %[[TMP3:.+]] +// CHECK-NEXT: %[[TMP16:.+]] = icmp eq i32 %[[OMP_FLOOR1_IV:.+]], %[[TMP10:.+]] +// CHECK-NEXT: %[[TMP17:.+]] = select i1 %[[TMP16:.+]], i32 %[[TMP11:.+]], i32 %[[TMP4:.+]] +// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_HEADER]]: +// CHECK-NEXT: %[[OMP_TILE0_IV:.+]] = phi i32 [ 0, %[[OMP_TILE0_PREHEADER:.+]] ], [ %[[OMP_TILE0_NEXT:.+]], %[[OMP_TILE0_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_TILE0_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_COND]]: +// CHECK-NEXT: %[[OMP_TILE0_CMP:.+]] = icmp ult i32 %[[OMP_TILE0_IV:.+]], %[[TMP15:.+]] +// CHECK-NEXT: br i1 %[[OMP_TILE0_CMP:.+]], label %[[OMP_TILE0_BODY:.+]], label %[[OMP_TILE0_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_BODY]]: +// CHECK-NEXT: br label %[[OMP_TILE1_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_TILE1_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_HEADER]]: +// CHECK-NEXT: %[[OMP_TILE1_IV:.+]] = phi i32 [ 0, %[[OMP_TILE1_PREHEADER:.+]] ], [ %[[OMP_TILE1_NEXT:.+]], %[[OMP_TILE1_INC:.+]] ] +// CHECK-NEXT: br label %[[OMP_TILE1_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_COND]]: +// CHECK-NEXT: %[[OMP_TILE1_CMP:.+]] = icmp ult i32 %[[OMP_TILE1_IV:.+]], %[[TMP17:.+]] +// CHECK-NEXT: br i1 %[[OMP_TILE1_CMP:.+]], label %[[OMP_TILE1_BODY:.+]], label %[[OMP_TILE1_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_BODY]]: +// CHECK-NEXT: %[[TMP18:.+]] = mul nuw i32 %[[TMP3:.+]], %[[OMP_FLOOR0_IV:.+]] +// CHECK-NEXT: %[[TMP19:.+]] = add nuw i32 %[[TMP18:.+]], %[[OMP_TILE0_IV:.+]] +// CHECK-NEXT: %[[TMP20:.+]] = mul nuw i32 %[[TMP4:.+]], %[[OMP_FLOOR1_IV:.+]] +// CHECK-NEXT: %[[TMP21:.+]] = add nuw i32 %[[TMP20:.+]], %[[OMP_TILE1_IV:.+]] +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_BODY:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY4]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION12:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION12]]: +// CHECK-NEXT: %[[TMP22:.+]] = add i32 %[[TMP19:.+]], %[[TMP21:.+]] +// CHECK-NEXT: %[[TMP23:.+]] = getelementptr inbounds float, ptr %[[TMP0:.+]], i32 %[[TMP22:.+]] +// CHECK-NEXT: store float 4.200000e+01, ptr %[[TMP23:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT11:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT11]]: +// CHECK-NEXT: br label %[[OMP_TILE1_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_INC]]: +// CHECK-NEXT: %[[OMP_TILE1_NEXT:.+]] = add nuw i32 %[[OMP_TILE1_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_TILE1_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_EXIT]]: +// CHECK-NEXT: br label %[[OMP_TILE1_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE1_AFTER]]: +// CHECK-NEXT: br label %[[OMP_TILE0_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_INC]]: +// CHECK-NEXT: %[[OMP_TILE0_NEXT:.+]] = add nuw i32 %[[OMP_TILE0_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_EXIT]]: +// CHECK-NEXT: br label %[[OMP_TILE0_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_TILE0_AFTER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR1_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_INC]]: +// CHECK-NEXT: %[[OMP_FLOOR1_NEXT:.+]] = add nuw i32 %[[OMP_FLOOR1_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_FLOOR1_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_EXIT]]: +// 
CHECK-NEXT: br label %[[OMP_FLOOR1_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR1_AFTER]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_INC]]: +// CHECK-NEXT: %[[OMP_FLOOR0_NEXT:.+]] = add nuw i32 %[[OMP_FLOOR0_IV:.+]], 1 +// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_EXIT]]: +// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FLOOR0_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT:.+]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_INC]]: +// CHECK-NEXT: %[[OMP_OMP_LOOP_NEXT:.+]] = add nuw i32 %[[TMP19:.+]], 1 +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_EXIT]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]: +// CHECK-NEXT: ret void +// CHECK-NEXT: } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index e043a8c..00ee6b7 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -1340,6 +1340,34 @@ llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) { llvm.return } +// CHECK-LABEL: rocdl.cvt.scalef32.pk8 +// CHECK-SAME:(<8 x float> %[[V8F32:.+]], <8 x half> %[[V8F16:.+]], <8 x bfloat> %[[V8BF16:.+]], float %[[SCALE:.+]]) +llvm.func @rocdl.cvt.scalef32.pk8(%v8xf32: vector<8xf32>, %v8xf16: vector<8xf16>, %v8xbf16: vector<8xbf16>, %scale: f32) { + + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> %[[V8F32]], float %[[SCALE]]) + %0 = rocdl.cvt.scalef32.pk8.fp8.f32 %v8xf32, %scale : vector<2xi32> + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> %[[V8F32]], float %[[SCALE]]) + %1 = rocdl.cvt.scalef32.pk8.bf8.f32 %v8xf32, %scale : vector<2xi32> + // CHECK: call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> %[[V8F32]], float %[[SCALE]]) + %2 = rocdl.cvt.scalef32.pk8.fp4.f32 %v8xf32, %scale : i32 + + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> %[[V8F16]], float %[[SCALE]]) + %3 = rocdl.cvt.scalef32.pk8.fp8.f16 %v8xf16, %scale : vector<2xi32> + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> %[[V8F16]], float %[[SCALE]]) + %4 = rocdl.cvt.scalef32.pk8.bf8.f16 %v8xf16, %scale : vector<2xi32> + // CHECK: call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> %[[V8F16]], float %[[SCALE]]) + %5 = rocdl.cvt.scalef32.pk8.fp4.f16 %v8xf16, %scale : i32 + + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> %[[V8BF16]], float %[[SCALE]]) + %6 = rocdl.cvt.scalef32.pk8.fp8.bf16 %v8xbf16, %scale : vector<2xi32> + // CHECK: call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> %[[V8BF16]], float %[[SCALE]]) + %7 = rocdl.cvt.scalef32.pk8.bf8.bf16 %v8xbf16, %scale : vector<2xi32> + // CHECK: call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> %[[V8BF16]], float %[[SCALE]]) + %8 = rocdl.cvt.scalef32.pk8.fp4.bf16 %v8xbf16, %scale : i32 + + llvm.return +} + // CHECK-LABEL: @rocdl.cvt.scale.pk16 // CHECK-SAME:(<3 x i32> %[[SRC0:.+]], i32 %[[SCALE:.+]]) llvm.func @rocdl.cvt.scale.pk16(%v3xi32: vector<3xi32>, %scale:i32) { diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt b/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt index 103bc94..7d32577 100644 --- 
a/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/CMakeLists.txt @@ -12,5 +12,7 @@ add_mlir_library(MLIRTestIRDLToCppDialect mlir_target_link_libraries(MLIRTestIRDLToCppDialect PUBLIC MLIRIR MLIRPass + MLIRSCFDialect MLIRTransforms + MLIRTestDialect ) diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp b/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp index 9550e4c..421db7e 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/TestIRDLToCppDialect.cpp @@ -13,6 +13,7 @@ // #include "mlir/IR/Dialect.h" #include "mlir/IR/Region.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/Interfaces/InferTypeOpInterface.h" @@ -54,16 +55,34 @@ struct TestOpConversion : public OpConversionPattern<test_irdl_to_cpp::BeefOp> { } }; +struct TestRegionConversion + : public OpConversionPattern<test_irdl_to_cpp::ConditionalOp> { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(mlir::test_irdl_to_cpp::ConditionalOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Just exercising the C++ API even though these are not enforced in the + // dialect definition + assert(op.getThen().getBlocks().size() == 1); + assert(adaptor.getElse().getBlocks().size() == 1); + auto ifOp = scf::IfOp::create(rewriter, op.getLoc(), op.getInput()); + rewriter.replaceOp(op, ifOp); + return success(); + } +}; + struct ConvertTestDialectToSomethingPass : PassWrapper<ConvertTestDialectToSomethingPass, OperationPass<ModuleOp>> { void runOnOperation() override { MLIRContext *ctx = &getContext(); RewritePatternSet patterns(ctx); - patterns.add<TestOpConversion>(ctx); + patterns.add<TestOpConversion, TestRegionConversion>(ctx); ConversionTarget target(getContext()); - target.addIllegalOp<test_irdl_to_cpp::BeefOp>(); - target.addLegalOp<test_irdl_to_cpp::BarOp>(); - target.addLegalOp<test_irdl_to_cpp::HashOp>(); + target.addIllegalOp<test_irdl_to_cpp::BeefOp, + test_irdl_to_cpp::ConditionalOp>(); + target.addLegalOp<test_irdl_to_cpp::BarOp, test_irdl_to_cpp::HashOp, + scf::IfOp, scf::YieldOp>(); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) signalPassFailure(); @@ -73,6 +92,10 @@ struct ConvertTestDialectToSomethingPass StringRef getDescription() const final { return "Checks the convertability of an irdl dialect"; } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert<scf::SCFDialect>(); + } }; void registerIrdlTestDialect(mlir::DialectRegistry ®istry) { diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir b/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir index f6233ee..1915324 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/test_conversion.testd.mlir @@ -1,15 +1,29 @@ // RUN: mlir-opt %s --pass-pipeline="builtin.module(test-irdl-conversion-check)" | FileCheck %s // CHECK-LABEL: module { module { - // CHECK: func.func @test() { + // CHECK: func.func @test(%[[test_arg:[^ ]*]]: i1) { // CHECK: %[[v0:[^ ]*]] = "test_irdl_to_cpp.bar"() : () -> i32 // CHECK: %[[v1:[^ ]*]] = "test_irdl_to_cpp.bar"() : () -> i32 // CHECK: %[[v2:[^ ]*]] = "test_irdl_to_cpp.hash"(%[[v0]], %[[v0]]) : (i32, i32) -> i32 + // CHECK: scf.if %[[test_arg]] // CHECK: return // CHECK: } - func.func @test() { + 
func.func @test(%test_arg: i1) { %0 = "test_irdl_to_cpp.bar"() : () -> i32 %1 = "test_irdl_to_cpp.beef"(%0, %0) : (i32, i32) -> i32 + "test_irdl_to_cpp.conditional"(%test_arg) ({ + ^cond(%test: i1): + %3 = "test_irdl_to_cpp.bar"() : () -> i32 + "test.terminator"() : ()->() + }, { + ^then(%what: i1, %ever: i32): + %4 = "test_irdl_to_cpp.bar"() : () -> i32 + "test.terminator"() : ()->() + }, { + ^else(): + %5 = "test_irdl_to_cpp.bar"() : () -> i32 + "test.terminator"() : ()->() + }) : (i1) -> () return } diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir index 42e713e..85fb8cb 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp.irdl.mlir @@ -2,7 +2,7 @@ // CHECK: class TestIrdlToCpp irdl.dialect @test_irdl_to_cpp { - + // CHECK: class FooType irdl.type @foo @@ -32,4 +32,53 @@ irdl.dialect @test_irdl_to_cpp { irdl.operands(lhs: %0, rhs: %0) irdl.results(res: %0) } + + // CHECK: ConditionalOp declarations + // CHECK: ConditionalOpGenericAdaptorBase + // CHECK: ::mlir::Region &getCond() { return *getRegions()[0]; } + // CHECK: ::mlir::Region &getThen() { return *getRegions()[1]; } + // CHECK: ::mlir::Region &getElse() { return *getRegions()[2]; } + // + // CHECK: class ConditionalOp : public ::mlir::Op<ConditionalOp, ::mlir::OpTrait::NRegions<3>::Impl, ::mlir::OpTrait::OpInvariants> + // CHECK: ::mlir::Region &getCond() { return (*this)->getRegion(0); } + // CHECK: ::mlir::Region &getThen() { return (*this)->getRegion(1); } + // CHECK: ::mlir::Region &getElse() { return (*this)->getRegion(2); } + + // CHECK: ConditionalOp definitions + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_cond + // CHECK: if (!(region.getNumArguments() == 1)) { + // CHECK: failed to verify constraint: region with 1 entry block argument(s) + + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_then + // CHECK: if (!(true)) { + + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_else + // CHECK: if (!(region.getNumArguments() == 0)) { + // CHECK: failed to verify constraint: region with 0 entry block argument(s) + + // CHECK: ConditionalOp::build + // CHECK: for (unsigned i = 0; i != 3; ++i) + // CHECK-NEXT: (void)odsState.addRegion(); + + // CHECK: ConditionalOp::verifyInvariantsImpl + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_cond + // CHECK: failure + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_then + // CHECK: failure + // CHECK: __mlir_irdl_local_region_constraint_ConditionalOp_else + // CHECK: failure + // CHECK: success + irdl.operation @conditional { + %r0 = irdl.region // Unconstrained region + %r1 = irdl.region() // Region with no entry block arguments + + // TODO(#161018): support irdl.is in irdl-to-cpp + // %v0 = irdl.is i1 // Type constraint: i1 (boolean) + %v0 = irdl.any + %r2 = irdl.region(%v0) // Region with one i1 entry block argument + irdl.regions(cond: %r2, then: %r0, else: %r1) + + %0 = irdl.any + irdl.operands(input: %0) + } } diff --git a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir index 403b492..cc27456 100644 --- a/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir +++ b/mlir/test/lib/Dialect/TestIRDLToCpp/test_irdl_to_cpp_invalid_unsupported_types.irdl.mlir @@ -7,7 +7,7 @@ irdl.dialect 
@test_irdl_to_cpp { irdl.results(res: %1) } } -// ----- +// ----- irdl.dialect @test_irdl_to_cpp { irdl.operation @operands_no_any_of { @@ -42,7 +42,7 @@ irdl.dialect @test_irdl_to_cpp { irdl.dialect @test_irdl_to_cpp { irdl.type @ty { - %0 = irdl.any + %0 = irdl.any // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.parameters operation}} irdl.parameters(ty: %0) } @@ -51,29 +51,8 @@ irdl.dialect @test_irdl_to_cpp { // ----- irdl.dialect @test_irdl_to_cpp { - irdl.operation @test_op { - // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.region operation}} - %0 = irdl.region() - irdl.regions(reg: %0) - } - -} - -// ----- - -irdl.dialect @test_irdl_to_cpp { - irdl.operation @test_op { - // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.regions operation}} - irdl.regions() - } - -} - -// ----- - -irdl.dialect @test_irdl_to_cpp { irdl.type @test_derived { // expected-error@+1 {{IRDL C++ translation does not yet support translation of irdl.base operation}} %0 = irdl.base "!builtin.integer" - } + } } diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 094ef0a..6ba7a00 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -173,8 +173,6 @@ struct TestXeGPUUnrollingPatterns #undef DEBUG_TYPE #define DEBUG_TYPE "test-xegpu-layout-interface" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") // Test pattern for distributing vector::StepOp from workgroup to subgroup. // Validates DistributeLayoutAttr interfaces for offset computation @@ -220,6 +218,35 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> { } }; +struct TestXeGPUSGDistribute + : public PassWrapper<TestXeGPUSGDistribute, + OperationPass<gpu::GPUModuleOp>> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUSGDistribute) + + StringRef getArgument() const final { return "test-xegpu-sg-distribute"; } + + StringRef getDescription() const final { + return "Test the implementation of XeGPU Subgroup Distribution"; + } + + void getDependentDialects(::mlir::DialectRegistry ®istry) const override { + registry.insert<arith::ArithDialect>(); + registry.insert<memref::MemRefDialect>(); + registry.insert<xegpu::XeGPUDialect>(); + registry.insert<vector::VectorDialect>(); + registry.insert<index::IndexDialect>(); + } + + TestXeGPUSGDistribute() = default; + TestXeGPUSGDistribute(const TestXeGPUSGDistribute &pass) = default; + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + xegpu::populateXeGPUSubgroupDistributePatterns(patterns); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); + } +}; + struct TestXeGPULayoutInterface : public PassWrapper<TestXeGPULayoutInterface, OperationPass<gpu::GPUModuleOp>> { @@ -284,6 +311,7 @@ namespace test { void registerTestXeGPULowerings() { PassRegistration<TestXeGPUUnrollingPatterns>(); PassRegistration<TestXeGPULayoutInterface>(); + PassRegistration<TestXeGPUSGDistribute>(); } } // namespace test } // namespace mlir diff --git a/mlir/test/mlir-tblgen/op-format-invalid.td b/mlir/test/mlir-tblgen/op-format-invalid.td index 2f29543..0a022ad 100644 --- a/mlir/test/mlir-tblgen/op-format-invalid.td +++ b/mlir/test/mlir-tblgen/op-format-invalid.td @@ -307,7 +307,7 @@ def DirectiveTypeZOperandInvalidI : TestFormat_Op<[{ def LiteralInvalidA : 
TestFormat_Op<[{ `a:` }]>; -// CHECK: error: expected valid literal but got '1': single character literal must be a letter or one of '_:,=<>()[]{}?+*' +// CHECK: error: expected valid literal but got '1': single character literal must be a letter or one of '_:,=<>()[]{}?+-*' def LiteralInvalidB : TestFormat_Op<[{ `1` }]>; diff --git a/mlir/test/mlir-tblgen/op-format-spec.td b/mlir/test/mlir-tblgen/op-format-spec.td index 1541cd0..1ac2311 100644 --- a/mlir/test/mlir-tblgen/op-format-spec.td +++ b/mlir/test/mlir-tblgen/op-format-spec.td @@ -123,7 +123,7 @@ def DirectiveTypeValid : TestFormat_Op<[{ // CHECK-NOT: error def LiteralValid : TestFormat_Op<[{ - `_` `:` `,` `=` `<` `>` `(` `)` `[` `]` `?` `+` `*` ` ` `` `->` `\n` `abc$._` + `_` `:` `,` `=` `<` `>` `(` `)` `[` `]` `?` `+` `-` `*` ` ` `` `->` `\n` `abc$._` attr-dict }]>; diff --git a/mlir/test/python/dialects/transform_tune_ext.py b/mlir/test/python/dialects/transform_tune_ext.py index dfb9359..eb2a083 100644 --- a/mlir/test/python/dialects/transform_tune_ext.py +++ b/mlir/test/python/dialects/transform_tune_ext.py @@ -1,21 +1,21 @@ # RUN: %PYTHON %s | FileCheck %s -from mlir.ir import * +from mlir import ir from mlir.dialects import transform from mlir.dialects.transform import tune, debug def run(f): - print("\nTEST:", f.__name__) - with Context(), Location.unknown(): - module = Module.create() - with InsertionPoint(module.body): + print("\n// TEST:", f.__name__) + with ir.Context(), ir.Location.unknown(): + module = ir.Module.create() + with ir.InsertionPoint(module.body): sequence = transform.SequenceOp( transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get(), ) - with InsertionPoint(sequence.body): + with ir.InsertionPoint(sequence.body): f(sequence.bodyTarget) transform.YieldOp() print(module) @@ -29,10 +29,10 @@ def testKnobOp(target): # CHECK: %[[HEADS_OR_TAILS:.*]] = transform.tune.knob<"coin"> options = [true, false] -> !transform.any_param heads_or_tails = tune.KnobOp( - result=any_param, name=StringAttr.get("coin"), options=[True, False] + result=any_param, name=ir.StringAttr.get("coin"), options=[True, False] ) # CHECK: transform.tune.knob<"animal"> options = ["cat", "dog", unit] -> !transform.any_param - tune.KnobOp(any_param, name="animal", options=["cat", "dog", UnitAttr.get()]) + tune.KnobOp(any_param, name="animal", options=["cat", "dog", ir.UnitAttr.get()]) # CHECK: transform.tune.knob<"tile_size"> options = [2, 4, 8, 16, 24, 32] -> !transform.any_param tune.KnobOp(any_param, "tile_size", [2, 4, 8, 16, 24, 32]) # CHECK: transform.tune.knob<"magic_value"> options = [2.000000e+00, 2.250000e+00, 2.500000e+00, 2.750000e+00, 3.000000e+00] -> !transform.any_param @@ -45,7 +45,10 @@ def testKnobOp(target): heads = tune.KnobOp(any_param, "coin", options=[True, False], selected=True) # CHECK: transform.tune.knob<"animal"> = "dog" from options = ["cat", "dog", unit] -> !transform.any_param tune.KnobOp( - any_param, name="animal", options=["cat", "dog", UnitAttr.get()], selected="dog" + any_param, + name="animal", + options=["cat", "dog", ir.UnitAttr.get()], + selected="dog", ) # CHECK: transform.tune.knob<"tile_size"> = 8 : i64 from options = [2, 4, 8, 16, 24, 32] -> !transform.any_param tune.KnobOp(any_param, "tile_size", [2, 4, 8, 16, 24, 32], selected=8) @@ -57,16 +60,90 @@ def testKnobOp(target): # CHECK: transform.tune.knob<"range_as_a_dict"> = 4 : i64 from options = {start = 2 : i64, step = 2 : i64, stop = 16 : i64} -> !transform.any_param # NB: Membership of `selected` in non-ArrayAttr `options` is 
_not_ verified. - i64 = IntegerType.get_signless(64) + i64 = ir.IntegerType.get_signless(64) tune.knob( any_param, "range_as_a_dict", - DictAttr.get( + ir.DictAttr.get( { - "start": IntegerAttr.get(i64, 2), - "stop": IntegerAttr.get(i64, 16), - "step": IntegerAttr.get(i64, 2), + "start": ir.IntegerAttr.get(i64, 2), + "stop": ir.IntegerAttr.get(i64, 16), + "step": ir.IntegerAttr.get(i64, 2), } ), selected=4, ) + + +# CHECK-LABEL: TEST: testAlternativesOp +@run +def testAlternativesOp(target): + any_param = transform.AnyParamType.get() + + # CHECK: %[[LEFT_OR_RIGHT_OUTCOME:.*]] = transform.tune.alternatives<"left_or_right"> -> !transform.any_param { + left_or_right = tune.AlternativesOp( + [transform.AnyParamType.get()], "left_or_right", 2 + ) + idx_for_left, idx_for_right = 0, 1 + with ir.InsertionPoint(left_or_right.alternatives[idx_for_left].blocks[0]): + # CHECK: %[[C0:.*]] = transform.param.constant 0 + i32_0 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 0) + c0 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_0) + # CHECK: transform.yield %[[C0]] + transform.yield_(c0) + # CHECK-NEXT: }, { + with ir.InsertionPoint(left_or_right.alternatives[idx_for_right].blocks[0]): + # CHECK: %[[C1:.*]] = transform.param.constant 1 + i32_1 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 1) + c1 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_1) + # CHECK: transform.yield %[[C1]] + transform.yield_(c1) + # CHECK-NEXT: } + outcome_of_left_or_right_decision = left_or_right.results[0] + + # CHECK: transform.tune.alternatives<"fork_in_the_road"> selected_region = 0 -> !transform.any_param { + fork_in_the_road = tune.AlternativesOp( + [transform.AnyParamType.get()], "fork_in_the_road", 2, selected_region=0 + ) + with ir.InsertionPoint(fork_in_the_road.alternatives[idx_for_left].blocks[0]): + # CHECK: %[[C0:.*]] = transform.param.constant 0 + i32_0 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 0) + c0 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_0) + # CHECK: transform.yield %[[C0]] + transform.yield_(c0) + # CHECK-NEXT: }, { + with ir.InsertionPoint(fork_in_the_road.alternatives[idx_for_right].blocks[0]): + # CHECK: %[[C1:.*]] = transform.param.constant 1 + i32_1 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 1) + c1 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_1) + # CHECK: transform.yield %[[C1]] + transform.yield_(c1) + # CHECK-NEXT: } + + # CHECK: transform.tune.alternatives<"left_or_right_as_before"> selected_region = %[[LEFT_OR_RIGHT_OUTCOME]] : !transform.any_param { + left_or_right_as_before = tune.AlternativesOp( + [], + "left_or_right_as_before", + 2, + selected_region=outcome_of_left_or_right_decision, + ) + with ir.InsertionPoint( + left_or_right_as_before.alternatives[idx_for_left].blocks[0] + ): + # CHECK: transform.param.constant 1337 + i32_1337 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 1337) + c1337 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_1337) + # CHECK: transform.debug.emit_param_as_remark + debug.emit_param_as_remark(c1337) + transform.yield_([]) + # CHECK-NEXT: }, { + with ir.InsertionPoint( + left_or_right_as_before.alternatives[idx_for_right].blocks[0] + ): + # CHECK: transform.param.constant 42 + i32_42 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 42) + c42 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_42) + # CHECK: transform.debug.emit_param_as_remark + debug.emit_param_as_remark(c42) + transform.yield_([]) + # CHECK-NEXT: } diff --git 
a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py index 4a3625c..cb4cfc8c 100644 --- a/mlir/test/python/ir/operation.py +++ b/mlir/test/python/ir/operation.py @@ -696,6 +696,7 @@ def testOperationPrint(): # CHECK: resource1: "0x08 module.operation.print(large_elements_limit=2) + # CHECK-LABEL: TEST: testKnownOpView @run def testKnownOpView(): @@ -969,6 +970,13 @@ def testOperationLoc(): assert op.location == loc assert op.operation.location == loc + another_loc = Location.name("another_loc") + op.location = another_loc + assert op.location == another_loc + assert op.operation.location == another_loc + # CHECK: loc("another_loc") + print(op.location) + # CHECK-LABEL: TEST: testModuleMerge @run |