Diffstat (limited to 'mlir/test')
34 files changed, 1360 insertions, 195 deletions
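The first hunk below adds a GPUToLLVMSPV test for gpu.printf lowering. As a reading aid for its CHECK lines: the format string is emitted as a constant global in address space 2, and the callee name is the Itanium mangling of the OpenCL printf builtin, which decodes as:

  _Z6printfPU3AS2Kcz
    6printf   -> the 6-character name "printf"
    PU3AS2Kc  -> pointer to const char in address space 2 ("AS2")
    z         -> variadic ellipsis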
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir new file mode 100644 index 0000000..74017e8 --- /dev/null +++ b/mlir/test/Conversion/GPUToLLVMSPV/printf.mlir @@ -0,0 +1,16 @@ +// RUN: mlir-opt %s -convert-gpu-to-llvm-spv | FileCheck %s + +gpu.module @test_module { + // CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 2 : i32} + // CHECK: llvm.func spir_funccc @_Z6printfPU3AS2Kcz(!llvm.ptr<2>, ...) -> i32 + // CHECK-LABEL: llvm.func spir_funccc @test_printf + // CHECK: (%[[ARG0:.*]]: i32) + gpu.func @test_printf(%arg0: i32) { + // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<2> + // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<2>) -> !llvm.ptr<2>, !llvm.array<11 x i8> + // CHECK-NEXT: %{{.*}} = llvm.call spir_funccc @_Z6printfPU3AS2Kcz(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func<i32 (ptr<2>, ...)>) : (!llvm.ptr<2>, i32) -> i32 + gpu.printf "Hello: %d\n", %arg0 : i32 + gpu.return + } +} + diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir index 574e9f4..efdceed 100644 --- a/mlir/test/Dialect/Affine/unroll.mlir +++ b/mlir/test/Dialect/Affine/unroll.mlir @@ -1,9 +1,9 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-full=true}))" | FileCheck %s --check-prefix UNROLL-FULL -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-full=true unroll-full-threshold=2}))" | FileCheck %s --check-prefix SHORT +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=-1}))" | FileCheck %s --check-prefix UNROLL-FULL +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=-1 unroll-full-threshold=2}))" | FileCheck %s --check-prefix SHORT // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=4}))" | FileCheck %s --check-prefix UNROLL-BY-4 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=1}))" | FileCheck %s --check-prefix UNROLL-BY-1 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=5 cleanup-unroll=true}))" | FileCheck %s --check-prefix UNROLL-CLEANUP-LOOP -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(gpu.module(gpu.func(affine-loop-unroll{unroll-full=true})))" | FileCheck %s --check-prefix GPU-UNROLL-FULL +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(gpu.module(gpu.func(affine-loop-unroll{unroll-factor=-1})))" | FileCheck %s --check-prefix GPU-UNROLL-FULL // UNROLL-FULL-DAG: [[$MAP0:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)> // UNROLL-FULL-DAG: [[$MAP1:#map[0-9]*]] = affine_map<(d0) -> (d0 + 2)> diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir index 7160b52..3130902 100644 --- a/mlir/test/Dialect/MemRef/canonicalize.mlir +++ b/mlir/test/Dialect/MemRef/canonicalize.mlir @@ -901,6 +901,132 @@ func.func @scope_merge_without_terminator() { // ----- +// Check that we simplify extract_strided_metadata of cast +// when the source of the cast is compatible with what +// `extract_strided_metadata`s accept. 
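+//
+// (Illustrative sketch of the rewrite, with the metadata results elided:
+//    ... = memref.extract_strided_metadata (memref.cast %arg ...)
+//  becomes
+//    ... = memref.extract_strided_metadata %arg)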
+// +// When we apply the transformation the resulting offset, sizes and strides +// should come straight from the inputs of the cast. +// Additionally the folder on extract_strided_metadata should propagate the +// static information. +// +// CHECK-LABEL: func @extract_strided_metadata_of_cast +// CHECK-SAME: %[[ARG:.*]]: memref<3x?xi32, strided<[4, ?], offset: ?>>) +// +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK: %[[BASE:.*]], %[[DYN_OFFSET:.*]], %[[DYN_SIZES:.*]]:2, %[[DYN_STRIDES:.*]]:2 = memref.extract_strided_metadata %[[ARG]] +// +// CHECK: return %[[BASE]], %[[DYN_OFFSET]], %[[C3]], %[[DYN_SIZES]]#1, %[[C4]], %[[DYN_STRIDES]]#1 +func.func @extract_strided_metadata_of_cast( + %arg : memref<3x?xi32, strided<[4, ?], offset:?>>) + -> (memref<i32>, index, + index, index, + index, index) { + + %cast = + memref.cast %arg : + memref<3x?xi32, strided<[4, ?], offset: ?>> to + memref<?x?xi32, strided<[?, ?], offset: ?>> + + %base, %base_offset, %sizes:2, %strides:2 = + memref.extract_strided_metadata %cast:memref<?x?xi32, strided<[?, ?], offset: ?>> + -> memref<i32>, index, + index, index, + index, index + + return %base, %base_offset, + %sizes#0, %sizes#1, + %strides#0, %strides#1 : + memref<i32>, index, + index, index, + index, index +} + +// ----- + +// Check that we simplify extract_strided_metadata of cast +// when the source of the cast is compatible with what +// `extract_strided_metadata`s accept. +// +// Same as extract_strided_metadata_of_cast but with constant sizes and strides +// in the destination type. +// +// CHECK-LABEL: func @extract_strided_metadata_of_cast_w_csts +// CHECK-SAME: %[[ARG:.*]]: memref<?x?xi32, strided<[?, ?], offset: ?>>) +// +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C18:.*]] = arith.constant 18 : index +// CHECK-DAG: %[[C25:.*]] = arith.constant 25 : index +// CHECK: %[[BASE:.*]], %[[DYN_OFFSET:.*]], %[[DYN_SIZES:.*]]:2, %[[DYN_STRIDES:.*]]:2 = memref.extract_strided_metadata %[[ARG]] +// +// CHECK: return %[[BASE]], %[[C25]], %[[C4]], %[[DYN_SIZES]]#1, %[[DYN_STRIDES]]#0, %[[C18]] +func.func @extract_strided_metadata_of_cast_w_csts( + %arg : memref<?x?xi32, strided<[?, ?], offset:?>>) + -> (memref<i32>, index, + index, index, + index, index) { + + %cast = + memref.cast %arg : + memref<?x?xi32, strided<[?, ?], offset: ?>> to + memref<4x?xi32, strided<[?, 18], offset: 25>> + + %base, %base_offset, %sizes:2, %strides:2 = + memref.extract_strided_metadata %cast:memref<4x?xi32, strided<[?, 18], offset: 25>> + -> memref<i32>, index, + index, index, + index, index + + return %base, %base_offset, + %sizes#0, %sizes#1, + %strides#0, %strides#1 : + memref<i32>, index, + index, index, + index, index +} + +// ----- + +// Check that we don't simplify extract_strided_metadata of +// cast when the source of the cast is unranked. +// Unranked memrefs cannot feed into extract_strided_metadata operations. +// Note: Technically we could still fold the sizes and strides. 
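+// The cast op itself must stay, however: it is what produces the ranked type
+// that memref.extract_strided_metadata requires, e.g.
+//   memref.cast %arg : memref<*xi32> to memref<?x?xi32, strided<[?, ?], offset: ?>>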
+// +// CHECK-LABEL: func @extract_strided_metadata_of_cast_unranked +// CHECK-SAME: %[[ARG:.*]]: memref<*xi32>) +// +// CHECK: %[[CAST:.*]] = memref.cast %[[ARG]] : +// CHECK: %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[CAST]] +// +// CHECK: return %[[BASE]], %[[OFFSET]], %[[SIZES]]#0, %[[SIZES]]#1, %[[STRIDES]]#0, %[[STRIDES]]#1 +func.func @extract_strided_metadata_of_cast_unranked( + %arg : memref<*xi32>) + -> (memref<i32>, index, + index, index, + index, index) { + + %cast = + memref.cast %arg : + memref<*xi32> to + memref<?x?xi32, strided<[?, ?], offset: ?>> + + %base, %base_offset, %sizes:2, %strides:2 = + memref.extract_strided_metadata %cast:memref<?x?xi32, strided<[?, ?], offset: ?>> + -> memref<i32>, index, + index, index, + index, index + + return %base, %base_offset, + %sizes#0, %sizes#1, + %strides#0, %strides#1 : + memref<i32>, index, + index, index, + index, index +} + +// ----- + // CHECK-LABEL: func @reinterpret_noop // CHECK-SAME: (%[[ARG:.*]]: memref<2x3x4xf32>) // CHECK-NEXT: return %[[ARG]] diff --git a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir index 1e6b011..18cdfb7 100644 --- a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir +++ b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir @@ -1378,133 +1378,6 @@ func.func @extract_strided_metadata_of_get_global_with_offset() // ----- -// Check that we simplify extract_strided_metadata of cast -// when the source of the cast is compatible with what -// `extract_strided_metadata`s accept. -// -// When we apply the transformation the resulting offset, sizes and strides -// should come straight from the inputs of the cast. -// Additionally the folder on extract_strided_metadata should propagate the -// static information. -// -// CHECK-LABEL: func @extract_strided_metadata_of_cast -// CHECK-SAME: %[[ARG:.*]]: memref<3x?xi32, strided<[4, ?], offset: ?>>) -// -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK: %[[BASE:.*]], %[[DYN_OFFSET:.*]], %[[DYN_SIZES:.*]]:2, %[[DYN_STRIDES:.*]]:2 = memref.extract_strided_metadata %[[ARG]] -// -// CHECK: return %[[BASE]], %[[DYN_OFFSET]], %[[C3]], %[[DYN_SIZES]]#1, %[[C4]], %[[DYN_STRIDES]]#1 -func.func @extract_strided_metadata_of_cast( - %arg : memref<3x?xi32, strided<[4, ?], offset:?>>) - -> (memref<i32>, index, - index, index, - index, index) { - - %cast = - memref.cast %arg : - memref<3x?xi32, strided<[4, ?], offset: ?>> to - memref<?x?xi32, strided<[?, ?], offset: ?>> - - %base, %base_offset, %sizes:2, %strides:2 = - memref.extract_strided_metadata %cast:memref<?x?xi32, strided<[?, ?], offset: ?>> - -> memref<i32>, index, - index, index, - index, index - - return %base, %base_offset, - %sizes#0, %sizes#1, - %strides#0, %strides#1 : - memref<i32>, index, - index, index, - index, index -} - -// ----- - -// Check that we simplify extract_strided_metadata of cast -// when the source of the cast is compatible with what -// `extract_strided_metadata`s accept. -// -// Same as extract_strided_metadata_of_cast but with constant sizes and strides -// in the destination type. 
-// -// CHECK-LABEL: func @extract_strided_metadata_of_cast_w_csts -// CHECK-SAME: %[[ARG:.*]]: memref<?x?xi32, strided<[?, ?], offset: ?>>) -// -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C18:.*]] = arith.constant 18 : index -// CHECK-DAG: %[[C25:.*]] = arith.constant 25 : index -// CHECK: %[[BASE:.*]], %[[DYN_OFFSET:.*]], %[[DYN_SIZES:.*]]:2, %[[DYN_STRIDES:.*]]:2 = memref.extract_strided_metadata %[[ARG]] -// -// CHECK: return %[[BASE]], %[[C25]], %[[C4]], %[[DYN_SIZES]]#1, %[[DYN_STRIDES]]#0, %[[C18]] -func.func @extract_strided_metadata_of_cast_w_csts( - %arg : memref<?x?xi32, strided<[?, ?], offset:?>>) - -> (memref<i32>, index, - index, index, - index, index) { - - %cast = - memref.cast %arg : - memref<?x?xi32, strided<[?, ?], offset: ?>> to - memref<4x?xi32, strided<[?, 18], offset: 25>> - - %base, %base_offset, %sizes:2, %strides:2 = - memref.extract_strided_metadata %cast:memref<4x?xi32, strided<[?, 18], offset: 25>> - -> memref<i32>, index, - index, index, - index, index - - return %base, %base_offset, - %sizes#0, %sizes#1, - %strides#0, %strides#1 : - memref<i32>, index, - index, index, - index, index -} - -// ----- - -// Check that we don't simplify extract_strided_metadata of -// cast when the source of the cast is unranked. -// Unranked memrefs cannot feed into extract_strided_metadata operations. -// Note: Technically we could still fold the sizes and strides. -// -// CHECK-LABEL: func @extract_strided_metadata_of_cast_unranked -// CHECK-SAME: %[[ARG:.*]]: memref<*xi32>) -// -// CHECK: %[[CAST:.*]] = memref.cast %[[ARG]] : -// CHECK: %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[CAST]] -// -// CHECK: return %[[BASE]], %[[OFFSET]], %[[SIZES]]#0, %[[SIZES]]#1, %[[STRIDES]]#0, %[[STRIDES]]#1 -func.func @extract_strided_metadata_of_cast_unranked( - %arg : memref<*xi32>) - -> (memref<i32>, index, - index, index, - index, index) { - - %cast = - memref.cast %arg : - memref<*xi32> to - memref<?x?xi32, strided<[?, ?], offset: ?>> - - %base, %base_offset, %sizes:2, %strides:2 = - memref.extract_strided_metadata %cast:memref<?x?xi32, strided<[?, ?], offset: ?>> - -> memref<i32>, index, - index, index, - index, index - - return %base, %base_offset, - %sizes#0, %sizes#1, - %strides#0, %strides#1 : - memref<i32>, index, - index, index, - index, index -} - - -// ----- - memref.global "private" @dynamicShmem : memref<0xf16,3> // CHECK-LABEL: func @zero_sized_memred diff --git a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir index 8bd7ae8..ac1f22b 100644 --- a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir +++ b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir @@ -63,6 +63,20 @@ func.func @memref_dim_all_positive(%m: memref<?xf32>, %x: index) { // ----- +// CHECK-LABEL: func @memref_expand( +// CHECK-SAME: %[[m:[a-zA-Z0-9]+]]: memref<?xf32> +// CHECK-SAME: %[[sz:[a-zA-Z0-9]+]]: index +// CHECK: %[[c4:.*]] = arith.constant 4 : index +// CHECK: return %[[sz]], %[[c4]] +func.func @memref_expand(%m: memref<?xf32>, %sz: index) -> (index, index) { + %0 = memref.expand_shape %m [[0, 1]] output_shape [%sz, 4]: memref<?xf32> into memref<?x4xf32> + %1 = "test.reify_bound"(%0) {dim = 0} : (memref<?x4xf32>) -> (index) + %2 = "test.reify_bound"(%0) {dim = 1} : (memref<?x4xf32>) -> (index) + return %1, %2 : index, index +} + +// ----- + // CHECK-LABEL: func @memref_get_global( // CHECK: %[[c4:.*]] = arith.constant 4 : 
index // CHECK: return %[[c4]] diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 8713689..77d18da 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -2200,3 +2200,46 @@ acc.private.recipe @privatization_memref_slice : memref<10x10xf32> init { acc.yield %result : memref<10x10xf32> } + +// ----- + +func.func @test_firstprivate_map(%arg0: memref<10xf32>) { + // Map the function argument using firstprivate_map to enable + // moving to accelerator but prevent any present counter updates. + %mapped = acc.firstprivate_map varPtr(%arg0 : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32> + + acc.parallel { + // Allocate a local variable inside the parallel region to represent + // materialized privatization. + %local = memref.alloca() : memref<10xf32> + + // Initialize the local variable with the mapped firstprivate value + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index + + scf.for %i = %c0 to %c10 step %c1 { + %val = memref.load %mapped[%i] : memref<10xf32> + memref.store %val, %local[%i] : memref<10xf32> + } + + acc.yield + } + + return +} + +// CHECK-LABEL: func @test_firstprivate_map +// CHECK-NEXT: %[[MAPPED:.*]] = acc.firstprivate_map varPtr(%{{.*}} : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32> +// CHECK-NEXT: acc.parallel { +// CHECK-NEXT: %[[LOCAL:.*]] = memref.alloca() : memref<10xf32> +// CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index +// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index +// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index +// CHECK-NEXT: scf.for %{{.*}} = %[[C0]] to %[[C10]] step %[[C1]] { +// CHECK-NEXT: %{{.*}} = memref.load %[[MAPPED]][%{{.*}}] : memref<10xf32> +// CHECK-NEXT: memref.store %{{.*}}, %[[LOCAL]][%{{.*}}] : memref<10xf32> +// CHECK-NEXT: } +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// CHECK-NEXT: return diff --git a/mlir/test/Dialect/OpenACC/support-analysis-varname.mlir b/mlir/test/Dialect/OpenACC/support-analysis-varname.mlir new file mode 100644 index 0000000..af52bef --- /dev/null +++ b/mlir/test/Dialect/OpenACC/support-analysis-varname.mlir @@ -0,0 +1,88 @@ +// RUN: mlir-opt %s -split-input-file -test-acc-support | FileCheck %s + +// Test with direct variable names +func.func @test_direct_var_name() { + // Create a memref with acc.var_name attribute + %0 = memref.alloca() {acc.var_name = #acc.var_name<"my_variable">} : memref<10xi32> + + %1 = memref.cast %0 {test.var_name} : memref<10xi32> to memref<10xi32> + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xi32> to memref<10xi32> + // CHECK-NEXT: getVariableName="my_variable" + + return +} + +// ----- + +// Test through memref.cast +func.func @test_through_cast() { + // Create a 5x2 memref with acc.var_name attribute + %0 = memref.alloca() {acc.var_name = #acc.var_name<"casted_variable">} : memref<5x2xi32> + + // Cast to dynamic dimensions + %1 = memref.cast %0 : memref<5x2xi32> to memref<?x?xi32> + + // Mark with test attribute - should find name through cast + %2 = memref.cast %1 {test.var_name} : memref<?x?xi32> to memref<5x2xi32> + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<?x?xi32> to memref<5x2xi32> + // CHECK-NEXT: getVariableName="casted_variable" + + return +} + +// ----- + +// Test with no variable name +func.func @test_no_var_name() { + // Create a memref without acc.var_name attribute + %0 = memref.alloca() : memref<10xi32> + + // Mark with test attribute - should find empty 
string + %1 = memref.cast %0 {test.var_name} : memref<10xi32> to memref<10xi32> + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xi32> to memref<10xi32> + // CHECK-NEXT: getVariableName="" + + return +} + +// ----- + +// Test through multiple casts +func.func @test_multiple_casts() { + // Create a memref with acc.var_name attribute + %0 = memref.alloca() {acc.var_name = #acc.var_name<"multi_cast">} : memref<10xi32> + + // Multiple casts + %1 = memref.cast %0 : memref<10xi32> to memref<?xi32> + %2 = memref.cast %1 : memref<?xi32> to memref<10xi32> + + // Mark with test attribute - should find name through multiple casts + %3 = memref.cast %2 {test.var_name} : memref<10xi32> to memref<10xi32> + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xi32> to memref<10xi32> + // CHECK-NEXT: getVariableName="multi_cast" + + return +} + +// ----- + +// Test with acc.copyin operation +func.func @test_copyin_name() { + // Create a memref + %0 = memref.alloca() : memref<10xf32> + + // Create an acc.copyin operation with a name + %1 = acc.copyin varPtr(%0 : memref<10xf32>) -> memref<10xf32> {name = "input_data"} + + // Mark with test attribute - should find name from copyin operation + %2 = memref.cast %1 {test.var_name} : memref<10xf32> to memref<?xf32> + + // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xf32> to memref<?xf32> + // CHECK-NEXT: getVariableName="input_data" + + return +} diff --git a/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare-by-value.mlir b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare-by-value.mlir new file mode 100644 index 0000000..8972a08 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare-by-value.mlir @@ -0,0 +1,157 @@ +// RUN: mlir-opt --mlir-disable-threading -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array<i32: 8, 16, 32, 64>, "dlti.stack_alignment" = 128 : i64>} { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + + omp.private {type = firstprivate} @private_eye : i32 copy { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> i32 + llvm.store %0, %arg1 : i32, !llvm.ptr + omp.yield(%arg1 : !llvm.ptr) + } + omp.private {type = firstprivate} @boxchar_firstprivate : !llvm.struct<(ptr, i64)> init { + ^bb0(%arg0: !llvm.struct<(ptr, i64)>, %arg1: !llvm.struct<(ptr, i64)>): + %0 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)> + %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)> + %8 = llvm.call @malloc(%1) {bindc_name = "", uniq_name = ""} : (i64) -> !llvm.ptr + %9 = llvm.mlir.undef : !llvm.struct<(ptr, i64)> + %10 = llvm.insertvalue %8, %9[0] : !llvm.struct<(ptr, i64)> + %11 = llvm.insertvalue %1, %10[1] : !llvm.struct<(ptr, i64)> + omp.yield(%11 : !llvm.struct<(ptr, i64)>) + } copy { + ^bb0(%arg0: !llvm.struct<(ptr, i64)>, %arg1: 
!llvm.struct<(ptr, i64)>): + %3 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)> + %4 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)> + %5 = llvm.extractvalue %arg1[0] : !llvm.struct<(ptr, i64)> + %6 = llvm.extractvalue %arg1[1] : !llvm.struct<(ptr, i64)> + %7 = llvm.icmp "slt" %6, %4 : i64 + %8 = llvm.select %7, %6, %4 : i1, i64 + "llvm.intr.memmove"(%5, %3, %8) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () + omp.yield(%arg1 : !llvm.struct<(ptr, i64)>) + } dealloc { + ^bb0(%arg0: !llvm.struct<(ptr, i64)>): + %0 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)> + %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)> + llvm.call @free(%0) : (!llvm.ptr) -> () + omp.yield + } + + llvm.func @target_boxchar_(%arg0: !llvm.ptr {fir.bindc_name = "l"}) attributes {fir.internal_name = "_QPtarget_boxchar", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64)> : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : index) : i64 + %5 = llvm.mlir.constant(0 : index) : i64 + %6 = llvm.mlir.constant(0 : i32) : i32 + %7 = llvm.mlir.constant(1 : i64) : i64 + %8 = llvm.mlir.constant(1 : i64) : i64 + %9 = llvm.load %arg0 : !llvm.ptr -> i32 + %10 = llvm.icmp "sgt" %9, %6 : i32 + %11 = llvm.select %10, %9, %6 : i1, i32 + %12 = llvm.mlir.constant(1 : i64) : i64 + %13 = llvm.sext %11 : i32 to i64 + %14 = llvm.alloca %13 x i8 {bindc_name = "char_var"} : (i64) -> !llvm.ptr + %15 = llvm.mlir.undef : !llvm.struct<(ptr, i64)> + %16 = llvm.sext %11 : i32 to i64 + %17 = llvm.insertvalue %14, %15[0] : !llvm.struct<(ptr, i64)> + %18 = llvm.insertvalue %16, %17[1] : !llvm.struct<(ptr, i64)> + llvm.store %18, %3 : !llvm.struct<(ptr, i64)>, !llvm.ptr + %19 = llvm.load %3 : !llvm.ptr -> !llvm.struct<(ptr, i64)> + %20 = llvm.extractvalue %19[0] : !llvm.struct<(ptr, i64)> + %21 = llvm.extractvalue %19[1] : !llvm.struct<(ptr, i64)> + %22 = llvm.sub %21, %4 : i64 + %23 = omp.map.bounds lower_bound(%5 : i64) upper_bound(%22 : i64) extent(%21 : i64) stride(%4 : i64) start_idx(%5 : i64) {stride_in_bytes = true} + %24 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64)> + %25 = omp.map.info var_ptr(%3 : !llvm.ptr, i8) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%24 : !llvm.ptr) bounds(%23) -> !llvm.ptr + %26 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.struct<(ptr, i64)>) map_clauses(to) capture(ByRef) members(%25 : [0] : !llvm.ptr) -> !llvm.ptr + %27 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr + omp.target nowait map_entries(%26 -> %arg1, %27 -> %arg2, %25 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@boxchar_firstprivate %18 -> %arg4 [map_idx=0], @private_eye %1 -> %arg5 [map_idx=1] : !llvm.struct<(ptr, i64)>, !llvm.ptr) { + omp.terminator + } + llvm.return + } +} +// CHECK-LABEL: llvm.func @target_boxchar_( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr {fir.bindc_name = "l"}) attributes {fir.internal_name = "_QPtarget_boxchar", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} { +// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr +// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(16 : i64) : i64 +// CHECK: %[[HEAP0:.*]] = 
llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr +// CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_2]] x !llvm.struct<(ptr, i64)> : (i64) -> !llvm.ptr +// CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[VAL_8:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_9:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_11:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> i32 +// CHECK: %[[VAL_12:.*]] = llvm.icmp "sgt" %[[VAL_11]], %[[VAL_8]] : i32 +// CHECK: %[[VAL_13:.*]] = llvm.select %[[VAL_12]], %[[VAL_11]], %[[VAL_8]] : i1, i32 +// CHECK: %[[VAL_14:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_15:.*]] = llvm.sext %[[VAL_13]] : i32 to i64 +// CHECK: %[[VAL_16:.*]] = llvm.alloca %[[VAL_15]] x i8 {bindc_name = "char_var"} : (i64) -> !llvm.ptr +// CHECK: %[[VAL_17:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_18:.*]] = llvm.sext %[[VAL_13]] : i32 to i64 +// CHECK: %[[VAL_19:.*]] = llvm.insertvalue %[[VAL_16]], %[[VAL_17]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_20:.*]] = llvm.insertvalue %[[VAL_18]], %[[VAL_19]][1] : !llvm.struct<(ptr, i64)> +// CHECK: llvm.store %[[VAL_20]], %[[VAL_5]] : !llvm.struct<(ptr, i64)>, !llvm.ptr +// CHECK: %[[VAL_21:.*]] = llvm.load %[[VAL_5]] : !llvm.ptr -> !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_22:.*]] = llvm.extractvalue %[[VAL_21]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_23:.*]] = llvm.extractvalue %[[VAL_21]][1] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_24:.*]] = llvm.sub %[[VAL_23]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_25:.*]] = omp.map.bounds lower_bound(%[[VAL_7]] : i64) upper_bound(%[[VAL_24]] : i64) extent(%[[VAL_23]] : i64) stride(%[[VAL_6]] : i64) start_idx(%[[VAL_7]] : i64) {stride_in_bytes = true} +// CHECK: %[[VAL_26:.*]] = llvm.load %[[VAL_5]] : !llvm.ptr -> !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_27:.*]] = llvm.load %[[HEAP0]] : !llvm.ptr -> !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_28:.*]] = llvm.call @boxchar_firstprivate_init(%[[VAL_26]], %[[VAL_27]]) : (!llvm.struct<(ptr, i64)>, !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_29:.*]] = llvm.call @boxchar_firstprivate_copy(%[[VAL_26]], %[[VAL_28]]) : (!llvm.struct<(ptr, i64)>, !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> +// CHECK: llvm.store %[[VAL_29]], %[[HEAP0]] : !llvm.struct<(ptr, i64)>, !llvm.ptr +// CHECK: %[[VAL_30:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr +// CHECK: %[[VAL_31:.*]] = llvm.getelementptr %[[HEAP0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_32:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, i8) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%[[VAL_31]] : !llvm.ptr) bounds(%[[VAL_25]]) -> !llvm.ptr +// CHECK: %[[VAL_33:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, !llvm.struct<(ptr, i64)>) map_clauses(to) capture(ByRef) members(%[[VAL_32]] : [0] : !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_34:.*]] = llvm.load %[[HEAP0]] : !llvm.ptr -> !llvm.struct<(ptr, i64)> +// CHECK: omp.target depend(taskdependout -> %[[HEAP0]] : !llvm.ptr) nowait map_entries(%[[VAL_33]] -> %[[VAL_35:.*]], %[[VAL_30]] -> %[[VAL_36:.*]], %[[VAL_32]] -> %[[VAL_37:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@boxchar_firstprivate %[[VAL_34]] -> %[[VAL_38:.*]] [map_idx=0], @private_eye %[[VAL_1]] -> %[[VAL_39:.*]] [map_idx=1] : !llvm.struct<(ptr, i64)>, 
!llvm.ptr) { +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.task depend(taskdependin -> %[[HEAP0]] : !llvm.ptr) { +// CHECK: llvm.call @boxchar_firstprivate_dealloc(%[[VAL_29]]) : (!llvm.struct<(ptr, i64)>) -> () +// CHECK: llvm.call @free(%[[HEAP0]]) : (!llvm.ptr) -> () +// CHECK: omp.terminator +// CHECK: } +// CHECK: llvm.return +// CHECK: } + +// CHECK-LABEL: llvm.func @boxchar_firstprivate_init( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>, +// CHECK-SAME: %[[ARG1:.*]]: !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> attributes {always_inline} { +// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_2:.*]] = llvm.call @malloc(%[[VAL_1]]) {bindc_name = "", uniq_name = ""} : (i64) -> !llvm.ptr +// CHECK: %[[VAL_3:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_4:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_3]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_1]], %[[VAL_4]][1] : !llvm.struct<(ptr, i64)> +// CHECK: llvm.return %[[VAL_5]] : !llvm.struct<(ptr, i64)> +// CHECK: } + +// CHECK-LABEL: llvm.func @boxchar_firstprivate_copy( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>, +// CHECK-SAME: %[[ARG1:.*]]: !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> attributes {always_inline} { +// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[ARG1]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[ARG1]][1] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_4:.*]] = llvm.icmp "slt" %[[VAL_3]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_5:.*]] = llvm.select %[[VAL_4]], %[[VAL_3]], %[[VAL_1]] : i1, i64 +// CHECK: "llvm.intr.memmove"(%[[VAL_2]], %[[VAL_0]], %[[VAL_5]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () +// CHECK: llvm.return %[[ARG1]] : !llvm.struct<(ptr, i64)> +// CHECK: } + +// CHECK-LABEL: llvm.func @boxchar_firstprivate_dealloc( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>) attributes {always_inline} { +// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)> +// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)> +// CHECK: llvm.call @free(%[[VAL_0]]) : (!llvm.ptr) -> () +// CHECK: llvm.return +// CHECK: } diff --git a/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir new file mode 100644 index 0000000..0377d49 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir @@ -0,0 +1,201 @@ +// RUN: mlir-opt --mlir-disable-threading -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", 
"dlti.legal_int_widths" = array<i32: 8, 16, 32, 64>, "dlti.stack_alignment" = 128 : i64>} { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + + omp.private {type = firstprivate} @firstprivatizer : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(48 : i64) : i64 + %1 = llvm.call @malloc(%0) : (i64) -> !llvm.ptr + %2 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.store %1, %2 : !llvm.ptr, !llvm.ptr + omp.yield(%arg1 : !llvm.ptr) + } copy { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(48 : i32) : i32 + "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + omp.yield(%arg1 : !llvm.ptr) + } dealloc { + ^bb0(%arg0: !llvm.ptr): + llvm.call @free(%arg0) : (!llvm.ptr) -> () + omp.yield + } + omp.private {type = firstprivate} @firstprivatizer_1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(48 : i64) : i64 + %1 = llvm.call @malloc(%0) : (i64) -> !llvm.ptr + %2 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.store %1, %2 : !llvm.ptr, !llvm.ptr + omp.yield(%arg1 : !llvm.ptr) + } copy { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(48 : i32) : i32 + "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + omp.yield(%arg1 : !llvm.ptr) + } dealloc { + ^bb0(%arg0: !llvm.ptr): + llvm.call @free(%arg0) : (!llvm.ptr) -> () + omp.yield + } + + llvm.func internal @firstprivate_test(%arg0: !llvm.ptr {fir.bindc_name = "ptr0"}, %arg1: !llvm.ptr {fir.bindc_name = "ptr1"}) { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.mlir.constant(0 : index) : i64 + %5 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %19 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr + %20 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "glocal"} : (i32) -> !llvm.ptr + %21 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr + %33 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.store %33, %19 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr + llvm.store %33, %20 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr + llvm.store %0, %21 : i32, !llvm.ptr + %124 = omp.map.info var_ptr(%21 : !llvm.ptr, i32) map_clauses(implicit) capture(ByCopy) -> !llvm.ptr {name = "i"} + %150 = llvm.getelementptr %19[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %151 = llvm.load %150 : !llvm.ptr -> i64 + %152 = llvm.getelementptr %19[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %153 = llvm.load %152 : !llvm.ptr -> i64 + %154 = llvm.getelementptr %19[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %155 = llvm.load %154 : !llvm.ptr -> i64 + %156 = 
llvm.sub %153, %1 : i64
+    %157 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%156 : i64) extent(%153 : i64) stride(%155 : i64) start_idx(%151 : i64) {stride_in_bytes = true}
+    %158 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %159 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%158 : !llvm.ptr) bounds(%157) -> !llvm.ptr {name = ""}
+    %160 = omp.map.info var_ptr(%19 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%159 : [0] : !llvm.ptr) -> !llvm.ptr
+    %1501 = llvm.getelementptr %20[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %1511 = llvm.load %1501 : !llvm.ptr -> i64
+    %1521 = llvm.getelementptr %20[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %1531 = llvm.load %1521 : !llvm.ptr -> i64
+    %1541 = llvm.getelementptr %20[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %1551 = llvm.load %1541 : !llvm.ptr -> i64
+    %1561 = llvm.sub %1531, %1 : i64
+    %1571 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%1561 : i64) extent(%1531 : i64) stride(%1551 : i64) start_idx(%1511 : i64) {stride_in_bytes = true}
+    %1581 = llvm.getelementptr %20[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %1591 = omp.map.info var_ptr(%20 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%1581 : !llvm.ptr) bounds(%1571) -> !llvm.ptr {name = ""}
+    %1601 = omp.map.info var_ptr(%20 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%1591 : [0] : !llvm.ptr) -> !llvm.ptr
+
+    // Test with two firstprivate variables to verify that, even when multiple
+    // variables need to be cleaned up, only one cleanup omp.task is generated.
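+    // The prepare pass is expected to outline each privatizer's init and copy
+    // regions into functions called before this op, move the descriptors into
+    // malloc'ed storage, and chain a single cleanup omp.task after it via
+    // depend clauses (see the CHECK lines below).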
+ omp.target nowait map_entries(%124 -> %arg2, %160 -> %arg5, %159 -> %arg8, %1601 -> %arg9, %1591 -> %arg10 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %19 -> %arg11 [map_idx=1], @firstprivatizer_1 %20 -> %arg12 [map_idx=3] : !llvm.ptr, !llvm.ptr) { + omp.terminator + } + %166 = llvm.mlir.constant(48 : i32) : i32 + %167 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %168 = llvm.load %167 : !llvm.ptr -> !llvm.ptr + llvm.call @free(%168) : (!llvm.ptr) -> () + llvm.return + } + +} +// CHECK-LABEL: llvm.func @free(!llvm.ptr) +// CHECK: llvm.func @malloc(i64) -> !llvm.ptr + + +// CHECK-LABEL: llvm.func internal @firstprivate_test( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr {fir.bindc_name = "ptr0"}, +// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr {fir.bindc_name = "ptr1"}) { +// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i32) : i32 +// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[VAL_2:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr +// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(48 : i64) : i64 +// CHECK: %[[HEAP0:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr +// CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr +// CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(48 : i64) : i64 +// CHECK: %[[HEAP1:.*]] = llvm.call @malloc(%[[VAL_6]]) : (i64) -> !llvm.ptr +// CHECK: %[[VAL_8:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "glocal"} : (i32) -> !llvm.ptr +// CHECK: %[[VAL_9:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr +// CHECK: %[[VAL_10:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_10]], %[[VAL_5]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr +// CHECK: llvm.store %[[VAL_10]], %[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr +// CHECK: llvm.store %[[VAL_0]], %[[VAL_9]] : i32, !llvm.ptr +// CHECK: %[[VAL_11:.*]] = omp.map.info var_ptr(%[[VAL_9]] : !llvm.ptr, i32) map_clauses(implicit) capture(ByCopy) -> !llvm.ptr {name = "i"} +// CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[VAL_5]][0, 7, %[[VAL_1]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[VAL_5]][0, 7, %[[VAL_1]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_15:.*]] = llvm.load %[[VAL_14]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_16:.*]] = llvm.getelementptr %[[VAL_5]][0, 7, %[[VAL_1]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_17:.*]] = llvm.load %[[VAL_16]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_18:.*]] = llvm.sub %[[VAL_15]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_19:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_18]] : i64) extent(%[[VAL_15]] : i64) stride(%[[VAL_17]] : i64) start_idx(%[[VAL_13]] : i64) {stride_in_bytes = true} +// CHECK: %[[VAL_20:.*]] 
= llvm.call @firstprivatizer_init(%[[VAL_5]], %[[HEAP0]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_21:.*]] = llvm.call @firstprivatizer_copy(%[[VAL_5]], %[[VAL_20]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_22:.*]] = llvm.getelementptr %[[VAL_8]][0, 7, %[[VAL_1]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_23:.*]] = llvm.load %[[VAL_22]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_24:.*]] = llvm.getelementptr %[[VAL_8]][0, 7, %[[VAL_1]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_25:.*]] = llvm.load %[[VAL_24]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_26:.*]] = llvm.getelementptr %[[VAL_8]][0, 7, %[[VAL_1]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_27:.*]] = llvm.load %[[VAL_26]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_28:.*]] = llvm.sub %[[VAL_25]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_29:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_28]] : i64) extent(%[[VAL_25]] : i64) stride(%[[VAL_27]] : i64) start_idx(%[[VAL_23]] : i64) {stride_in_bytes = true} +// CHECK: %[[VAL_30:.*]] = llvm.call @firstprivatizer_1_init(%[[VAL_8]], %[[HEAP1]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_31:.*]] = llvm.call @firstprivatizer_1_copy(%[[VAL_8]], %[[VAL_30]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_32:.*]] = llvm.getelementptr %[[HEAP0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_33:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, i32) map_clauses({{.*}}to{{.*}}) capture(ByRef) var_ptr_ptr(%[[VAL_32]] : !llvm.ptr) bounds(%[[VAL_19]]) -> !llvm.ptr {name = ""} +// CHECK: %[[VAL_34:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always,{{.*}}to) capture(ByRef) members(%[[VAL_33]] : [0] : !llvm.ptr) -> !llvm.ptr +// CHECK: %[[VAL_35:.*]] = llvm.getelementptr %[[HEAP1]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_36:.*]] = omp.map.info var_ptr(%[[HEAP1]] : !llvm.ptr, i32) map_clauses({{.*}}to{{.*}}) capture(ByRef) var_ptr_ptr(%[[VAL_35]] : !llvm.ptr) bounds(%[[VAL_29]]) -> !llvm.ptr {name = ""} +// CHECK: %[[VAL_37:.*]] = omp.map.info var_ptr(%[[HEAP1]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always,{{.*}}to) capture(ByRef) members(%[[VAL_36]] : [0] : !llvm.ptr) -> !llvm.ptr +// CHECK: omp.target depend(taskdependout -> %[[HEAP0]] : !llvm.ptr) nowait map_entries(%[[VAL_11]] -> %[[VAL_38:.*]], %[[VAL_34]] -> %[[VAL_39:.*]], %[[VAL_33]] -> %[[VAL_40:.*]], %[[VAL_37]] -> %[[VAL_41:.*]], %[[VAL_36]] -> %[[VAL_42:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %[[HEAP0]] -> %[[VAL_43:.*]] [map_idx=1], @firstprivatizer_1 %[[HEAP1]] -> %[[VAL_44:.*]] [map_idx=3] : !llvm.ptr, !llvm.ptr) { +// CHECK: omp.terminator +// CHECK: } +// CHECK: omp.task depend(taskdependin -> %[[HEAP0]] : !llvm.ptr) { +// CHECK: llvm.call @firstprivatizer_1_dealloc(%[[VAL_31]]) : (!llvm.ptr) -> () +// CHECK: llvm.call @free(%[[HEAP1]]) : (!llvm.ptr) -> () +// CHECK: llvm.call @firstprivatizer_dealloc(%[[VAL_21]]) : (!llvm.ptr) -> () +// CHECK: llvm.call @free(%[[HEAP0]]) : 
(!llvm.ptr) -> ()
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: %[[VAL_45:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK: %[[VAL_46:.*]] = llvm.getelementptr %[[VAL_5]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_47:.*]] = llvm.load %[[VAL_46]] : !llvm.ptr -> !llvm.ptr
+// CHECK: llvm.call @free(%[[VAL_47]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.return
+// CHECK: }
+
+// CHECK-LABEL: llvm.func @firstprivatizer_init(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr,
+// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i64) : i64
+// CHECK: %[[VAL_1:.*]] = llvm.call @malloc(%[[VAL_0]]) : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_2:.*]] = llvm.getelementptr %[[ARG1]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_1]], %[[VAL_2]] : !llvm.ptr, !llvm.ptr
+// CHECK: llvm.return %[[ARG1]] : !llvm.ptr
+// CHECK: }
+
+// CHECK-LABEL: llvm.func @firstprivatizer_copy(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr,
+// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK: "llvm.intr.memcpy"(%[[ARG1]], %[[ARG0]], %[[VAL_0]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK: llvm.return %[[ARG1]] : !llvm.ptr
+// CHECK: }
+
+// CHECK-LABEL: llvm.func @firstprivatizer_dealloc(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr) attributes {always_inline} {
+// CHECK: llvm.call @free(%[[ARG0]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.return
+// CHECK: }
+
+// CHECK-LABEL: llvm.func @firstprivatizer_1_init(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr,
+// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i64) : i64
+// CHECK: %[[VAL_1:.*]] = llvm.call @malloc(%[[VAL_0]]) : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_2:.*]] = llvm.getelementptr %[[ARG1]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_1]], %[[VAL_2]] : !llvm.ptr, !llvm.ptr
+// CHECK: llvm.return %[[ARG1]] : !llvm.ptr
+// CHECK: }
+
+// CHECK-LABEL: llvm.func @firstprivatizer_1_copy(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr,
+// CHECK-SAME: %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK: "llvm.intr.memcpy"(%[[ARG1]], %[[ARG0]], %[[VAL_0]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK: llvm.return %[[ARG1]] : !llvm.ptr
+// CHECK: }
+
+// CHECK-LABEL: llvm.func @firstprivatizer_1_dealloc(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr) attributes {always_inline} {
+// CHECK: llvm.call @free(%[[ARG0]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.return
+// CHECK: }
diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index 2bec636..084c3fc 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -1604,6 +1604,148 @@ func.func @func_execute_region_inline_multi_yield() {
 // -----
+// Test case with a single scf.yield op inside execute_region whose operand is defined outside the execute_region op.
+// Make scf.execute_region return no results.
+
+// CHECK: scf.execute_region no_inline {
+// CHECK: func.call @foo() : () -> ()
+// CHECK: scf.yield
+// CHECK: }
+
+module {
+func.func private @foo()->()
+func.func private @execute_region_yielding_external_value() -> memref<1x60xui8> {
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+  %1 = scf.execute_region -> memref<1x60xui8> no_inline {
+    func.call @foo():()->()
+    scf.yield %alloc: memref<1x60xui8>
+  }
+  return %1 : memref<1x60xui8>
+}
+}
+
+// -----
+
+// Test case with a scf.yield op inside execute_region with multiple operands.
+// One of the operands is defined outside the execute_region op.
+// Remove just this operand from the op results.
+
+// CHECK: %[[VAL_1:.*]] = scf.execute_region -> memref<1x120xui8> no_inline {
+// CHECK: %[[VAL_2:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x120xui8>
+// CHECK: func.call @foo() : () -> ()
+// CHECK: scf.yield %[[VAL_2]] : memref<1x120xui8>
+// CHECK: }
+module {
+func.func private @foo()->()
+func.func private @execute_region_yielding_external_and_local_values() -> (memref<1x60xui8>, memref<1x120xui8>) {
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+  %1, %2 = scf.execute_region -> (memref<1x60xui8>, memref<1x120xui8>) no_inline {
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x120xui8>
+    func.call @foo():()->()
+    scf.yield %alloc, %alloc_1: memref<1x60xui8>, memref<1x120xui8>
+  }
+  return %1, %2 : memref<1x60xui8>, memref<1x120xui8>
+}
+}
+
+// -----
+
+// Test case with multiple scf.yield ops inside execute_region that all have the same operands, where those operands are defined outside the execute_region op.
+// Make scf.execute_region return no results.
+// scf.yield must remain, because scf.execute_region can't be empty.
+
+// CHECK: scf.execute_region no_inline {
+// CHECK: %[[VAL_3:.*]] = "test.cmp"() : () -> i1
+// CHECK: cf.cond_br %[[VAL_3]], ^bb1, ^bb2
+// CHECK: ^bb1:
+// CHECK: scf.yield
+// CHECK: ^bb2:
+// CHECK: scf.yield
+// CHECK: }
+
+module {
+  func.func private @foo()->()
+  func.func private @execute_region_multiple_yields_same_operands() -> (memref<1x60xui8>, memref<1x120xui8>) {
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x120xui8>
+    %1, %2 = scf.execute_region -> (memref<1x60xui8>, memref<1x120xui8>) no_inline {
+      %c = "test.cmp"() : () -> i1
+      cf.cond_br %c, ^bb2, ^bb3
+    ^bb2:
+      func.call @foo():()->()
+      scf.yield %alloc, %alloc_1 : memref<1x60xui8>, memref<1x120xui8>
+    ^bb3:
+      func.call @foo():()->()
+      scf.yield %alloc, %alloc_1 : memref<1x60xui8>, memref<1x120xui8>
+    }
+    return %1, %2 : memref<1x60xui8>, memref<1x120xui8>
+  }
+}
+
+// -----
+
+// Test case with multiple scf.yield ops with at least one different operand; in that case no change is made.
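+// (Each result would then depend on which scf.yield executes, i.e. on control
+// flow inside the region, so it cannot be replaced by a value computed outside.)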
+
+// CHECK: %[[VAL_3:.*]]:2 = scf.execute_region -> (memref<1x60xui8>, memref<1x120xui8>) no_inline {
+// CHECK: ^bb1:
+// CHECK: scf.yield %{{.*}}, %{{.*}} : memref<1x60xui8>, memref<1x120xui8>
+// CHECK: ^bb2:
+// CHECK: scf.yield %{{.*}}, %{{.*}} : memref<1x60xui8>, memref<1x120xui8>
+// CHECK: }
+
+module {
+  func.func private @foo()->()
+  func.func private @execute_region_multiple_yields_different_operands() -> (memref<1x60xui8>, memref<1x120xui8>) {
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x120xui8>
+    %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<1x120xui8>
+    %1, %2 = scf.execute_region -> (memref<1x60xui8>, memref<1x120xui8>) no_inline {
+      %c = "test.cmp"() : () -> i1
+      cf.cond_br %c, ^bb2, ^bb3
+    ^bb2:
+      func.call @foo():()->()
+      scf.yield %alloc, %alloc_1 : memref<1x60xui8>, memref<1x120xui8>
+    ^bb3:
+      func.call @foo():()->()
+      scf.yield %alloc, %alloc_2 : memref<1x60xui8>, memref<1x120xui8>
+    }
+    return %1, %2 : memref<1x60xui8>, memref<1x120xui8>
+  }
+}
+
+// -----
+
+// Test case with multiple scf.yield ops, each with a different operand.
+// In this case scf.execute_region isn't changed.
+
+// CHECK: %[[VAL_2:.*]] = scf.execute_region -> memref<1x60xui8> no_inline {
+// CHECK: ^bb1:
+// CHECK: scf.yield %{{.*}} : memref<1x60xui8>
+// CHECK: ^bb2:
+// CHECK: scf.yield %{{.*}} : memref<1x60xui8>
+// CHECK: }
+
+module {
+func.func private @foo()->()
+func.func private @execute_region_multiple_yields_each_operand_different() -> (memref<1x60xui8>) {
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+  %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+  %1 = scf.execute_region -> (memref<1x60xui8>) no_inline {
+    %c = "test.cmp"() : () -> i1
+    cf.cond_br %c, ^bb2, ^bb3
+  ^bb2:
+    func.call @foo():()->()
+    scf.yield %alloc : memref<1x60xui8>
+  ^bb3:
+    func.call @foo():()->()
+    scf.yield %alloc_1 : memref<1x60xui8>
+  }
+  return %1 : memref<1x60xui8>
+}
+}
+
+// -----
+
 // CHECK-LABEL: func @canonicalize_parallel_insert_slice_indices(
 // CHECK-SAME: %[[arg0:.*]]: tensor<1x5xf32>, %[[arg1:.*]]: tensor<?x?xf32>
 func.func @canonicalize_parallel_insert_slice_indices(
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index e8525a5..5a40f3f 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -9,11 +9,20 @@ func.func @argmax_nofold(%arg0: tensor<?x1xf32>) -> tensor<1xi32> {
 // -----
-// CHECK-LABEL: @pad_wh_avg_pool2d_fold
-func.func @pad_wh_avg_pool2d_fold(%input: tensor<1x10x8x3xf32>) -> tensor<1x6x5x3xf32> {
-  // CHECK-NOT: tosa.pad
+// CHECK-LABEL: @test_argmax_fold_i64_index
+func.func @test_argmax_fold_i64_index(%arg0: tensor<1xi8>) -> tensor<i64> {
+  // CHECK: "tosa.const"() <{values = dense<0> : tensor<i64>}> : () -> tensor<i64>
+  %0 = tosa.argmax %arg0 {axis = 0 : i32} : (tensor<1xi8>) -> tensor<i64>
+  return %0 : tensor<i64>
+}
+
+// -----
+
+// CHECK-LABEL: @pad_wh_avg_pool2d_nofold
+func.func @pad_wh_avg_pool2d_nofold(%input: tensor<1x10x8x3xf32>) -> tensor<1x6x5x3xf32> {
+  // CHECK: tosa.pad
   // CHECK: tosa.avg_pool2d
-  // CHECK-SAME: pad = array<i64: 1, 1, 1, 1>
+  // CHECK-SAME: pad = array<i64: 0, 1, 0, 1>
   %pad_shape = tosa.const_shape { values = dense<[0, 0, 1, 0, 1, 0, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8>
   %pad_const = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : ()-> tensor<1xf32>
   %input_zp = "tosa.const"() <{values = dense<0.0> :
tensor<1xf32>}> : ()-> tensor<1xf32> diff --git a/mlir/test/Dialect/Tosa/invalid_extension.mlir b/mlir/test/Dialect/Tosa/invalid_extension.mlir index e5c9402..005601d 100644 --- a/mlir/test/Dialect/Tosa/invalid_extension.mlir +++ b/mlir/test/Dialect/Tosa/invalid_extension.mlir @@ -538,3 +538,11 @@ func.func @test_avg_pool2d_non_const_output_zp(%arg0: tensor<1x32x32x8xf32>, %ou (tensor<1x32x32x8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x8xf32> return %0 : tensor<1x32x32x8xf32> } + +// ----- + +func.func @test_matmul_t_block_scaled(%arg0: tensor<4x8x32xf8E4M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32xf8E4M3FN>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + // expected-error@+1 {{'tosa.matmul_t_block_scaled' op illegal: requires [mxfp] but not enabled in target}} + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x32xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf8E4M3FN>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir index 8cc357e..8771e6e 100644 --- a/mlir/test/Dialect/Tosa/level_check.mlir +++ b/mlir/test/Dialect/Tosa/level_check.mlir @@ -1622,3 +1622,12 @@ func.func @test_unranked_weight_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor %0 = tosa.conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<*xf32>, tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } + +// ----- + +// CHECK-LABEL: test_matmul_t_block_scaled_invalid_size +func.func @test_matmul_t_block_scaled_invalid_size(%arg0: tensor<4x8x536870912xf4E2M1FN>, %arg1: tensor<4x8x16777216xf8E8M0FNU>, %arg2: tensor<4x16x536870912xf4E2M1FN>, %arg3: tensor<4x16x16777216xf8E8M0FNU>) -> tensor<*xf32> { + // expected-error@+1 {{'tosa.matmul_t_block_scaled' op failed level check: operand tensor size (in bytes) <= (1 << MAX_LOG2_SIZE - 1)}} + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x536870912xf4E2M1FN>, tensor<4x8x16777216xf8E8M0FNU>, tensor<4x16x536870912xf4E2M1FN>, tensor<4x16x16777216xf8E8M0FNU>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index 868b7b7..9bf36b5 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -1226,3 +1226,45 @@ func.func @test_scatter_f8E4M3FN(%arg0: tensor<13x29x3xf8E4M3FN>, %arg1: tensor< %0 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<13x29x3xf8E4M3FN>, tensor<13x26xi32>, tensor<13x26x3xf8E4M3FN>) -> tensor<13x29x3xf8E4M3FN> return %0 : tensor<13x29x3xf8E4M3FN> } + +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_static +func.func @test_matmul_t_block_scaled_static(%arg0: tensor<4x8x32xf8E4M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32xf8E4M3FN>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x32xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf8E4M3FN>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} + +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_unranked +func.func 
@test_matmul_t_block_scaled_unranked(%arg0: tensor<*xf8E4M3FN>, %arg1: tensor<*xf8E8M0FNU>, %arg2: tensor<*xf8E4M3FN>, %arg3: tensor<*xf8E8M0FNU>) -> tensor<*xf32> { + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<*xf8E4M3FN>, tensor<*xf8E8M0FNU>, tensor<*xf8E4M3FN>, tensor<*xf8E8M0FNU>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_fp6e3m2 +func.func @test_matmul_t_block_scaled_fp6e3m2(%arg0: tensor<4x8x32xf6E3M2FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32xf6E3M2FN>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x32xf6E3M2FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf6E3M2FN>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} + +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_fp6e2m3 +func.func @test_matmul_t_block_scaled_fp6e2m3(%arg0: tensor<4x8x32xf6E2M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32xf6E2M3FN>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x32xf6E2M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf6E2M3FN>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} + +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_fp4e2m1 +func.func @test_matmul_t_block_scaled_fp4e2m1(%arg0: tensor<4x8x32xf4E2M1FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32xf4E2M1FN>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x32xf4E2M1FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf4E2M1FN>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} + +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_broadcast +func.func @test_matmul_t_block_scaled_broadcast(%arg0: tensor<?x8x32xf8E4M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<?x16x32xf8E4M3FN>, %arg3: tensor<1x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<?x8x32xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<?x16x32xf8E4M3FN>, tensor<1x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} diff --git a/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir b/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir index 7ff8065..0271d71 100644 --- a/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir +++ b/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir @@ -2,7 +2,7 @@ // Enable all supported extensions to focus the verification of expected profile requirement errors. 
//-------------------------------------------------------------------------------------------------- -// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-attach-target="profiles=pro_int extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,dynamic,doubleround,inexactround" -tosa-validate="strict-op-spec-alignment" +// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-attach-target="specification_version=1.1.draft profiles=pro_int extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,dynamic,doubleround,inexactround,mxfp" -tosa-validate="strict-op-spec-alignment" // ----- func.func @test_const_f16() -> tensor<3x11x11x3xf16> { @@ -325,3 +325,10 @@ func.func @test_resize(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x64x64x8xf32> { %1 = tosa.resize %arg0, %scale, %offset, %border { mode = BILINEAR } : (tensor<1x32x32x8xf32>, !tosa.shape<4>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<1x64x64x8xf32> return %1 : tensor<1x64x64x8xf32> } + +// ----- +func.func @test_matmul_t_block_scaled(%arg0: tensor<4x8x32xf6E3M2FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32xf6E3M2FN>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + // expected-error@+1 {{'tosa.matmul_t_block_scaled' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x32xf6E3M2FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf6E3M2FN>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir index 80f06f1..72479fe 100644 --- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir +++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir @@ -1574,3 +1574,57 @@ func.func @test_mul_scalar(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<*xf %0 = tosa.mul %arg0, %arg1, %shift : (tensor<f32>, tensor<f32>, tensor<1xi8>) -> tensor<*xf32> return %0 : tensor<*xf32> } + +// ----- + +// CHECK-LABEL: test_matmul_t_block_scaled_static +func.func @test_matmul_t_block_scaled_static(%arg0: tensor<4x8x32xf8E4M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<1x16x32xf8E4M3FN>, %arg3: tensor<1x16x1xf8E8M0FNU>) -> tensor<?x?x?xf32> { + // CHECK: -> tensor<4x8x16xf32> + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x32xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<1x16x32xf8E4M3FN>, tensor<1x16x1xf8E8M0FNU>) -> tensor<?x?x?xf32> + return %0 : tensor<?x?x?xf32> +} + +// ----- + +// CHECK-LABEL: test_matmul_t_block_scaled_unranked_a_data +func.func @test_matmul_t_block_scaled_unranked_a_data(%arg0: tensor<*xf8E4M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32xf8E4M3FN>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<?x?x?xf32> { + // CHECK: -> tensor<4x8x16xf32> + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<*xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf8E4M3FN>, tensor<4x16x1xf8E8M0FNU>) -> tensor<?x?x?xf32> + return %0 : tensor<?x?x?xf32> +} + +// ----- + +// CHECK-LABEL: test_matmul_t_block_scaled_unranked_b_data_and_scale +func.func @test_matmul_t_block_scaled_unranked_b_data_and_scale(%arg0: tensor<4x8x32xf8E4M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<*xf8E4M3FN>, %arg3: tensor<*xf8E8M0FNU>) -> tensor<?x?x?xf32> { + // CHECK: -> tensor<4x8x?xf32> + %0 = tosa.matmul_t_block_scaled %arg0, 
%arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<4x8x32xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<*xf8E4M3FN>, tensor<*xf8E8M0FNU>) -> tensor<?x?x?xf32> + return %0 : tensor<?x?x?xf32> +} + +// ----- + +// CHECK-LABEL: test_matmul_t_block_scaled_unranked_all +func.func @test_matmul_t_block_scaled_unranked_all(%arg0: tensor<*xf8E4M3FN>, %arg1: tensor<*xf8E8M0FNU>, %arg2: tensor<*xf8E4M3FN>, %arg3: tensor<*xf8E8M0FNU>) -> tensor<?x?x?xf32> { + // CHECK: -> tensor<?x?x?xf32> + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<*xf8E4M3FN>, tensor<*xf8E8M0FNU>, tensor<*xf8E4M3FN>, tensor<*xf8E8M0FNU>) -> tensor<?x?x?xf32> + return %0 : tensor<?x?x?xf32> +} + +// ----- + +// CHECK-LABEL: test_matmul_t_block_scaled_broadcast_b_data +func.func @test_matmul_t_block_scaled_broadcast_b_data(%arg0: tensor<*xf8E4M3FN>, %arg1: tensor<*xf8E8M0FNU>, %arg2: tensor<1x4x32xf8E4M3FN>, %arg3: tensor<1x4x1xf8E8M0FNU>) -> tensor<?x?x?xf32> { + // CHECK: -> tensor<?x?x4xf32> + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<*xf8E4M3FN>, tensor<*xf8E8M0FNU>, tensor<1x4x32xf8E4M3FN>, tensor<1x4x1xf8E8M0FNU>) -> tensor<?x?x?xf32> + return %0 : tensor<?x?x?xf32> +} + +// ----- + +// CHECK-LABEL: test_matmul_t_block_scaled_broadcast_b_scale +func.func @test_matmul_t_block_scaled_broadcast_b_scale(%arg0: tensor<*xf8E4M3FN>, %arg1: tensor<*xf8E8M0FNU>, %arg2: tensor<*xf8E4M3FN>, %arg3: tensor<1x4x1xf8E8M0FNU>) -> tensor<?x?x?xf32> { + // CHECK: -> tensor<?x?x4xf32> + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32>} : (tensor<*xf8E4M3FN>, tensor<*xf8E8M0FNU>, tensor<*xf8E4M3FN>, tensor<1x4x1xf8E8M0FNU>) -> tensor<?x?x?xf32> + return %0 : tensor<?x?x?xf32> +} diff --git a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir index 8164509..2040a4b 100644 --- a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir +++ b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-attach-target="specification_version=1.1.draft profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround" -tosa-validate="strict-op-spec-alignment" | FileCheck %s +// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-attach-target="specification_version=1.1.draft profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround,mxfp" -tosa-validate="strict-op-spec-alignment" | FileCheck %s // ----- @@ -18,3 +18,11 @@ func.func @test_matmul_fp8_input_fp32_acc_type(%arg0: tensor<1x14x19xf8E4M3FN>, %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E4M3FN>, tensor<1xf8E4M3FN>, tensor<1xf8E4M3FN>) -> tensor<1x14x28xf32> return %0 : tensor<1x14x28xf32> } + +// ----- + +// CHECK-LABEL: test_matmul_t_block_scaled_fp6e2m3 +func.func @test_matmul_t_block_scaled_fp6e2m3(%arg0: tensor<4x8x32xf6E2M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32xf6E2M3FN>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = BLOCK_SIZE_32} : (tensor<4x8x32xf6E2M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf6E2M3FN>, tensor<4x16x1xf8E8M0FNU>) -> 
tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} diff --git a/mlir/test/Dialect/Tosa/verifier.mlir b/mlir/test/Dialect/Tosa/verifier.mlir index 430b06a..4be5d72 100644 --- a/mlir/test/Dialect/Tosa/verifier.mlir +++ b/mlir/test/Dialect/Tosa/verifier.mlir @@ -1102,3 +1102,51 @@ func.func @scatter_invalid_K_W(%arg0 : tensor<2x4x5xi32>, %arg1 : tensor<2x6xi32 %2 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<2x4x5xi32>, tensor<2x6xi32>, tensor<2x6x5xi32>) -> tensor<2x4x5xi32> return } + +// ----- + +func.func @test_matmul_t_block_scaled_data_mismatch(%arg0: tensor<4x8x32xf8E4M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32xf8E5M2>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + // expected-error@+1 {{'tosa.matmul_t_block_scaled' op expect A_data and B_data to have same element type, got 'f8E4M3FN' and 'f8E5M2'}} + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x8x32xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf8E5M2>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} + +// ----- + +func.func @test_matmul_t_block_scaled_output_batch_mismatch(%arg0: tensor<*xf8E4M3FN>, %arg1: tensor<?x8x1xf8E8M0FNU>, %arg2: tensor<*xf8E4M3FN>, %arg3: tensor<4x?x?xf8E8M0FNU>) -> tensor<5x?x?xf32> { + // expected-error@+1 {{'tosa.matmul_t_block_scaled' op expected output shape 5, ?, ? to be compatible with expected output shape 4, 8, ?}} + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<*xf8E4M3FN>, tensor<?x8x1xf8E8M0FNU>, tensor<*xf8E4M3FN>, tensor<4x?x?xf8E8M0FNU>) -> tensor<5x?x?xf32> + return %0 : tensor<5x?x?xf32> +} + +// ----- + +func.func @test_matmul_t_block_scaled_output_height_mismatch(%arg0: tensor<*xf8E4M3FN>, %arg1: tensor<?x9x1xf8E8M0FNU>, %arg2: tensor<*xf8E4M3FN>, %arg3: tensor<4x?x?xf8E8M0FNU>) -> tensor<4x8x?xf32> { + // expected-error@+1 {{'tosa.matmul_t_block_scaled' op expected output shape 4, 8, ? 
to be compatible with expected output shape 4, 9, ?}} + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<*xf8E4M3FN>, tensor<?x9x1xf8E8M0FNU>, tensor<*xf8E4M3FN>, tensor<4x?x?xf8E8M0FNU>) -> tensor<4x8x?xf32> + return %0 : tensor<4x8x?xf32> +} + +// ----- + +func.func @test_matmul_t_block_scaled_output_width_mismatch(%arg0: tensor<*xf8E4M3FN>, %arg1: tensor<?x?x1xf8E8M0FNU>, %arg2: tensor<?x1x?xf8E4M3FN>, %arg3: tensor<*xf8E8M0FNU>) -> tensor<?x?x10xf32> { + // expected-error@+1 {{'tosa.matmul_t_block_scaled' op expected output shape ?, ?, 10 to be compatible with expected output shape ?, ?, 1}} + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<*xf8E4M3FN>, tensor<?x?x1xf8E8M0FNU>, tensor<?x1x?xf8E4M3FN>, tensor<*xf8E8M0FNU>) -> tensor<?x?x10xf32> + return %0 : tensor<?x?x10xf32> +} + +// ----- + +func.func @test_matmul_t_block_scaled_channel_not_multiple_of_block_size(%arg0: tensor<4x8x55xf8E4M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<4x16x32xf8E4M3FN>, %arg3: tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + // expected-error@+1 {{'tosa.matmul_t_block_scaled' op expected channels of b_data to match size 55, got 32}} + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x8x55xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<4x16x32xf8E4M3FN>, tensor<4x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} + +// ----- + +func.func @test_matmul_t_block_scaled_batch_mismatch(%arg0: tensor<4x8x32xf8E4M3FN>, %arg1: tensor<4x8x1xf8E8M0FNU>, %arg2: tensor<2x16x32xf8E4M3FN>, %arg3: tensor<2x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> { + // expected-error@+1 {{'tosa.matmul_t_block_scaled' op expect B matrix batch size to be broadcast compatible with A, got D=2 vs N=4}} + %0 = tosa.matmul_t_block_scaled %arg0, %arg1, %arg2, %arg3 {block_size = #tosa.block_size<BLOCK_SIZE_32> : i32} : (tensor<4x8x32xf8E4M3FN>, tensor<4x8x1xf8E8M0FNU>, tensor<2x16x32xf8E4M3FN>, tensor<2x16x1xf8E8M0FNU>) -> tensor<4x8x16xf32> + return %0 : tensor<4x8x16xf32> +} diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 0e1365a..27a3dc3 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -214,3 +214,54 @@ gpu.module @xevm_module{ } } + +// ----- +// CHECK-LABEL: gpu.func @warp_scf_for_unused_uniform_for_result( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%{{.*}} : index, +// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, +// CHECK-SAME: memref<16x16xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) { +// CHECK: gpu.yield %{{.*}}, {{.*}} : vector<16x16xf32>, vector<16x1xf32> +// CHECK: } +// CHECK: %{{.*}}:2 = scf.for {{.*}} to %{{.*}} step %{{.*}} iter_args +// CHECK-SAME: (%{{.*}} = %[[W]]#0, %{{.*}} = %[[W]]#1) -> (vector<16x1xf32>, vector<16x1xf32>) { +// CHECK: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: args(%{{.*}} : vector<16x1xf32>, vector<16x1xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}} : vector<16x16xf32>, vector<16x1xf32> +// CHECK: } +// CHECK: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<16x1xf32>, vector<16x1xf32> +// CHECK: } +gpu.module @xevm_module{ + gpu.func 
@warp_scf_for_unused_uniform_for_result(%arg0: index, + %arg1: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, + %arg2: memref<16x16xf32>) { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %ini = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : () -> (vector<16x1xf32>) + %ini2 = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + : () -> (vector<16x16xf32>) + %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini2, %arg5 = %ini) -> (vector<16x16xf32>, vector<16x1xf32>) { + %1 = "some_def"(%arg5) + { + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + : (vector<16x1xf32>) -> (vector<16x1xf32>) + %acc = "some_def"(%arg4, %1) + { + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + : (vector<16x16xf32>, vector<16x1xf32>) -> (vector<16x16xf32>) + scf.yield %acc, %1 : vector<16x16xf32>, vector<16x1xf32> + } + { + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + xegpu.store_nd %3#0, %arg1[%c0, %c0] + : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + gpu.return + } +} diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir index 9f4393e..127ab70 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir @@ -103,6 +103,17 @@ func.func @main() { // CHECK: unexpected negative result on dimension #0 of input/output operand #0 func.call @reverse_from_3(%d5x) : (tensor<?xf32>) -> (tensor<?xf32>) + %c0x = arith.constant dense<1.0> : tensor<0xf32> + %d0x = tensor.cast %c0x : tensor<0xf32> to tensor<?xf32> + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @fill_empty_1d(%d0x) : (tensor<?xf32>) -> (tensor<?xf32>) + + %c0x5 = arith.constant dense<0.0> : tensor<0x5xf32> + %d0x5 = tensor.cast %c0x5 : tensor<0x5xf32> to tensor<?x?xf32> + + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @fill_empty_2d(%d0x5) : (tensor<?x?xf32>) -> (tensor<?x?xf32>) + return } @@ -297,3 +308,15 @@ func.func @reverse_from_3(%arg0: tensor<?xf32>) -> (tensor<?xf32>) { } -> tensor<?xf32> return %result : tensor<?xf32> } + +func.func @fill_empty_1d(%arg0: tensor<?xf32>) -> (tensor<?xf32>) { + %c0 = arith.constant 0.0 : f32 + %0 = linalg.fill ins(%c0 : f32) outs(%arg0 : tensor<?xf32>) -> tensor<?xf32> + return %0 : tensor<?xf32> +} + +func.func @fill_empty_2d(%arg0: tensor<?x?xf32>) -> (tensor<?x?xf32>) { + %c0 = arith.constant 0.0 : f32 + %0 = linalg.fill ins(%c0 : f32) outs(%arg0 : tensor<?x?xf32>) -> tensor<?x?xf32> + return %0 : tensor<?x?xf32> +} diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir new file mode 100644 index 0000000..edf8775 --- /dev/null +++ b/mlir/test/Integration/Dialect/XeVM/GPU/gpu_printf.mlir @@ -0,0 +1,30 @@ +// RUN: mlir-opt %s \ +// RUN: | mlir-opt 
-pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \ +// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ +// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @test attributes {gpu.container_module} { + gpu.module @test_module { + gpu.func @test_printf(%arg0: i32, %arg1: f32) kernel { + gpu.printf "Hello: %d\n", %arg0 : i32 + gpu.printf "Hello: %f\n", %arg1 : f32 + gpu.return + } + } + + func.func @main() attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c11 = arith.constant 11 : i32 + %c4 = arith.constant 4.0 : f32 + // CHECK: Hello: 11 + // CHECK: Hello: 4.000000 + gpu.launch_func @test_module::@test_printf blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%c11 : i32, %c4 : f32) + return + } +} diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir index 8116044..21d7816 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir @@ -658,33 +658,20 @@ module attributes {transform.with_named_sequence} { } } -// CHECK: func.func private @tile_one_consumer_using_tile_and_fuse(%[[VAL_0:.*]]: tensor<16x128x48x96xf32>, %[[VAL_1:.*]]: tensor<16x96x48x128xf32>) -> tensor<16x96x48x128xf32> { -// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]] = arith.constant 16 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 128 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 48 : index -// CHECK: %[[VAL_6:.*]] = arith.constant 96 : index -// CHECK: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = scf.for %[[VAL_9:.*]] = %[[VAL_2]] to %[[VAL_3]] step %[[VAL_7]] iter_args(%[[VAL_10:.*]] = %[[VAL_1]]) -> (tensor<16x96x48x128xf32>) { -// CHECK: %[[VAL_11:.*]] = scf.for %[[VAL_12:.*]] = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_3]] iter_args(%[[VAL_13:.*]] = %[[VAL_10]]) -> (tensor<16x96x48x128xf32>) { -// CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (tensor<16x96x48x128xf32>) { -// CHECK: %[[VAL_17:.*]] = scf.for %[[VAL_18:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_3]] iter_args(%[[VAL_19:.*]] = %[[VAL_16]]) -> (tensor<16x96x48x128xf32>) { -// CHECK: %[[VAL_20:.*]] = tensor.extract_slice %[[VAL_0]]{{\[}}%[[VAL_9]], %[[VAL_12]], %[[VAL_15]], %[[VAL_18]]] [1, 16, 16, 16] [1, 1, 1, 1] : tensor<16x128x48x96xf32> to tensor<1x16x16x16xf32> -// CHECK: %[[VAL_21:.*]] = tensor.extract_slice %[[VAL_19]]{{\[}}%[[VAL_9]], %[[VAL_18]], %[[VAL_15]], %[[VAL_12]]] [1, 16, 16, 16] [1, 1, 1, 1] : tensor<16x96x48x128xf32> to tensor<1x16x16x16xf32> -// CHECK: %[[VAL_22:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[VAL_20]] : tensor<1x16x16x16xf32>) outs(%[[VAL_21]] : tensor<1x16x16x16xf32>) { -// CHECK: ^bb0(%[[VAL_23:.*]]: f32, %[[VAL_24:.*]]: f32): -// CHECK: linalg.yield %[[VAL_23]] : f32 -// CHECK: } -> tensor<1x16x16x16xf32> -// CHECK: %[[VAL_25:.*]] = 
tensor.insert_slice %[[VAL_26:.*]] into %[[VAL_19]]{{\[}}%[[VAL_9]], %[[VAL_18]], %[[VAL_15]], %[[VAL_12]]] [1, 16, 16, 16] [1, 1, 1, 1] : tensor<1x16x16x16xf32> into tensor<16x96x48x128xf32>
-// CHECK: scf.yield %[[VAL_25]] : tensor<16x96x48x128xf32>
-// CHECK: }
-// CHECK: scf.yield %[[VAL_27:.*]] : tensor<16x96x48x128xf32>
-// CHECK: }
-// CHECK: scf.yield %[[VAL_28:.*]] : tensor<16x96x48x128xf32>
-// CHECK: }
-// CHECK: scf.yield %[[VAL_29:.*]] : tensor<16x96x48x128xf32>
-// CHECK: }
-// CHECK: return %[[VAL_30:.*]] : tensor<16x96x48x128xf32>
-// CHECK: }
-// CHECK: }
-
+// CHECK-LABEL: func private @tile_one_consumer_using_tile_and_fuse
+// CHECK-SAME: %[[ARG0:.*]]: tensor<16x128x48x96xf32>
+// CHECK-SAME: %[[ARG1:.*]]: tensor<16x96x48x128xf32>
+// CHECK: scf.for %[[IV0:[a-zA-Z0-9]+]] =
+// CHECK-SAME: iter_args(%[[ITERARG0:.+]] = %[[ARG1]])
+// CHECK: scf.for %[[IV1:[a-zA-Z0-9]+]] =
+// CHECK-SAME: iter_args(%[[ITERARG1:.+]] = %[[ITERARG0]])
+// CHECK: scf.for %[[IV2:[a-zA-Z0-9]+]] =
+// CHECK-SAME: iter_args(%[[ITERARG2:.+]] = %[[ITERARG1]])
+// CHECK: scf.for %[[IV3:[a-zA-Z0-9]+]] =
+// CHECK-SAME: iter_args(%[[ITERARG3:.+]] = %[[ITERARG2]])
+// CHECK: %[[TILEDARG0:.*]] = tensor.extract_slice %[[ARG0]]{{\[}}%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
+// CHECK: %[[TILEDARG1:.*]] = tensor.extract_slice %[[ITERARG3]]{{\[}}%[[IV0]], %[[IV3]], %[[IV2]], %[[IV1]]]
+// CHECK: %[[RES:.*]] = linalg.generic
+// CHECK-SAME: ins(%[[TILEDARG0]]
+// CHECK-SAME: outs(%[[TILEDARG1]]
+// CHECK: tensor.insert_slice %[[RES]]
diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_fp4x2.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_fp4x2.mlir
index 04e2ddf..451475c 100644
--- a/mlir/test/Target/LLVMIR/nvvm/convert_fp4x2.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/convert_fp4x2.mlir
@@ -10,3 +10,14 @@ llvm.func @convert_f32x2_to_f4x2_e2m1(%srcA : f32, %srcB : f32) {
 %res2 = nvvm.convert.f32x2.to.f4x2 %srcA, %srcB {relu = true} : i8 (f4E2M1FN)
 llvm.return
 }
+
+// CHECK-LABEL: @convert_f4x2_to_f16x2
+llvm.func @convert_f4x2_to_f16x2(%src : i8) {
+ // CHECK: %[[res1:.*]] = zext i8 %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 %[[res1]])
+ %res1 = nvvm.convert.f4x2.to.f16x2 %src : i8 (f4E2M1FN)-> vector<2xf16>
+ // CHECK: %[[res2:.*]] = zext i8 %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 %[[res2]])
+ %res2 = nvvm.convert.f4x2.to.f16x2 %src {relu = true} : i8 (f4E2M1FN)-> vector<2xf16>
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir
index 9928992..61a7a48 100644
--- a/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir
@@ -19,3 +19,27 @@ llvm.func @convert_f32x2_to_fp6x2_vector(%srcA : f32, %srcB : f32) {
 %res2 = nvvm.convert.f32x2.to.f6x2 %srcA, %srcB : vector<2xi8> (f6E3M2FN)
 llvm.return
 }
+
+// -----
+
+// CHECK-LABEL: @convert_f6x2_to_f16x2_e2m3
+llvm.func @convert_f6x2_to_f16x2_e2m3(%src : vector<2xi8>) {
+ // CHECK: %[[res1:.*]] = bitcast <2 x i8> %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 %[[res1]])
+ %res1 = nvvm.convert.f6x2.to.f16x2 %src : vector<2xi8> (f6E2M3FN)-> vector<2xf16>
+ // CHECK: %[[res2:.*]] = bitcast <2 x i8> %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 %[[res2]])
+ %res2 = nvvm.convert.f6x2.to.f16x2 %src {relu = true} : vector<2xi8> (f6E2M3FN)-> vector<2xf16>
+ llvm.return
+}
+
+// CHECK-LABEL: @convert_f6x2_to_f16x2_e3m2
+llvm.func @convert_f6x2_to_f16x2_e3m2(%src : vector<2xi8>) {
+ // CHECK: %[[res1:.*]] = bitcast <2 x i8> %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 %[[res1]])
+ %res1 = nvvm.convert.f6x2.to.f16x2 %src : vector<2xi8> (f6E3M2FN)-> vector<2xf16>
+ // CHECK: %[[res2:.*]] = bitcast <2 x i8> %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 %[[res2]])
+ %res2 = nvvm.convert.f6x2.to.f16x2 %src {relu = true} : vector<2xi8> (f6E3M2FN)-> vector<2xf16>
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir
index de21826..4afe901 100644
--- a/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir
@@ -100,3 +100,37 @@ llvm.func @convert_bf16x2_to_f8x2_vector_return(%src : vector<2xbf16>) {
 %res2 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> vector<2xi8> (f8E8M0FNU)
 llvm.return
 }
+
+// -----
+
+// CHECK-LABEL: @convert_f8x2_to_f16x2_e4m3
+llvm.func @convert_f8x2_to_f16x2_e4m3(%src : vector<2xi8>) {
+ // CHECK: %[[res1:.*]] = bitcast <2 x i8> %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 %[[res1]])
+ %res1 = nvvm.convert.f8x2.to.f16x2 %src : vector<2xi8> (f8E4M3FN)-> vector<2xf16>
+ // CHECK: %[[res2:.*]] = bitcast <2 x i8> %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 %[[res2]])
+ %res2 = nvvm.convert.f8x2.to.f16x2 %src {relu = true} : vector<2xi8> (f8E4M3FN)-> vector<2xf16>
+ llvm.return
+}
+
+// CHECK-LABEL: @convert_f8x2_to_f16x2_e5m2
+llvm.func @convert_f8x2_to_f16x2_e5m2(%src : vector<2xi8>) {
+ // CHECK: %[[res1:.*]] = bitcast <2 x i8> %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 %[[res1]])
+ %res1 = nvvm.convert.f8x2.to.f16x2 %src : vector<2xi8> (f8E5M2)-> vector<2xf16>
+ // CHECK: %[[res2:.*]] = bitcast <2 x i8> %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 %[[res2]])
+ %res2 = nvvm.convert.f8x2.to.f16x2 %src {relu = true} : vector<2xi8> (f8E5M2)-> vector<2xf16>
+ llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @convert_f8x2_to_bf16x2_ue8m0
+llvm.func @convert_f8x2_to_bf16x2_ue8m0(%src : vector<2xi8>) {
+ // CHECK: %[[res1:.*]] = bitcast <2 x i8> %{{.*}} to i16
+ // CHECK-NEXT: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 %[[res1]])
+ %res1 = nvvm.convert.f8x2.to.bf16x2 %src : vector<2xi8> (f8E8M0FNU)-> vector<2xbf16>
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index 6cccfe4..09b8f59 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -262,6 +262,38 @@ llvm.func @nvvm_cvt_f32x2_to_f4x2_invalid_type(%a : f32, %b : f32) {
 // -----
+
+llvm.func @nvvm_cvt_f8x2_to_f16x2_invalid_type(%src : vector<2xi8>) {
+ // expected-error @below {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f8x2 to f16x2.}}
+ %res = nvvm.convert.f8x2.to.f16x2 %src : vector<2xi8> (f8E4M3) -> vector<2xf16>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_cvt_f8x2_to_bf16x2_invalid_type(%src : vector<2xi8>) {
+ // expected-error @below {{Only 'f8E8M0FNU' type is supported for conversions from f8x2 to bf16x2.}}
+ %res =
nvvm.convert.f8x2.to.bf16x2 %src : vector<2xi8> (f8E4M3FN) -> vector<2xbf16> + llvm.return +} + +// ----- + +llvm.func @nvvm_cvt_f6x2_to_f16x2_invalid_type(%src : vector<2xi8>) { + // expected-error @below {{Only 'f6E2M3FN' and 'f6E3M2FN' types are supported for conversions from f6x2 to f16x2.}} + %res = nvvm.convert.f6x2.to.f16x2 %src : vector<2xi8> (f8E4M3FN) -> vector<2xf16> + llvm.return +} + +// ----- + +llvm.func @nvvm_cvt_f4x2_to_f16x2_invalid_type(%src : i8) { + // expected-error @below {{Only 'f4E2M1FN' type is supported for conversions from f4x2 to f16x2.}} + %res = nvvm.convert.f4x2.to.f16x2 %src : i8 (f6E2M3FN) -> vector<2xf16> + llvm.return +} + +// ----- + llvm.func @nvvm_prefetch_L1_with_evict_priority(%global_ptr: !llvm.ptr<1>) { // expected-error @below {{cache eviction priority supported only for cache level L2}} nvvm.prefetch level = L1, evict_priority = evict_last, %global_ptr : !llvm.ptr<1> diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index 2fa4470..af6d254 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -249,24 +249,6 @@ llvm.func @target_is_device_ptr(%x : !llvm.ptr) { // ----- -omp.private {type = firstprivate} @x.privatizer : i32 copy { -^bb0(%mold: !llvm.ptr, %private: !llvm.ptr): - %0 = llvm.load %mold : !llvm.ptr -> i32 - llvm.store %0, %private : i32, !llvm.ptr - omp.yield(%private: !llvm.ptr) -} -llvm.func @target_firstprivate(%x : !llvm.ptr) { - %0 = omp.map.info var_ptr(%x : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr - // expected-error@below {{not yet implemented: Unhandled clause privatization for deferred target tasks in omp.target operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.target}} - omp.target nowait map_entries(%0 -> %blockarg0 : !llvm.ptr) private(@x.privatizer %x -> %arg0 [map_idx=0] : !llvm.ptr) { - omp.terminator - } - llvm.return -} - -// ----- - llvm.func @target_enter_data_depend(%x: !llvm.ptr) { // expected-error@below {{not yet implemented: Unhandled clause depend in omp.target_enter_data operation}} // expected-error@below {{LLVM Translation failed for operation: omp.target_enter_data}} diff --git a/mlir/test/Transforms/scf-loop-unroll.mlir b/mlir/test/Transforms/scf-loop-unroll.mlir index 0ef6ad1..db96c65 100644 --- a/mlir/test/Transforms/scf-loop-unroll.mlir +++ b/mlir/test/Transforms/scf-loop-unroll.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s --test-loop-unrolling="unroll-factor=3" -split-input-file -canonicalize | FileCheck %s // RUN: mlir-opt %s --test-loop-unrolling="unroll-factor=1" -split-input-file -canonicalize | FileCheck %s --check-prefix UNROLL-BY-1 -// RUN: mlir-opt %s --test-loop-unrolling="unroll-full=true" -split-input-file -canonicalize | FileCheck %s --check-prefix UNROLL-FULL +// RUN: mlir-opt %s --test-loop-unrolling="unroll-factor=-1" -split-input-file -canonicalize | FileCheck %s --check-prefix UNROLL-FULL // CHECK-LABEL: scf_loop_unroll_single func.func @scf_loop_unroll_single(%arg0 : f32, %arg1 : f32) -> f32 { diff --git a/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt b/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt index 1e59338..a54b642 100644 --- a/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt +++ b/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt @@ -2,6 +2,7 @@ add_mlir_library(MLIROpenACCTestPasses TestOpenACC.cpp TestPointerLikeTypeInterface.cpp TestRecipePopulate.cpp + TestOpenACCSupport.cpp EXCLUDE_FROM_LIBMLIR ) @@ -11,6 +12,7 @@ 
mlir_target_link_libraries(MLIROpenACCTestPasses PUBLIC MLIRFuncDialect MLIRMemRefDialect MLIROpenACCDialect + MLIROpenACCAnalysis MLIRPass MLIRSupport ) diff --git a/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp b/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp index bea21b9..e59d777 100644 --- a/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp +++ b/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp @@ -16,11 +16,13 @@ namespace test { // Forward declarations of individual test pass registration functions void registerTestPointerLikeTypeInterfacePass(); void registerTestRecipePopulatePass(); +void registerTestOpenACCSupportPass(); // Unified registration function for all OpenACC tests void registerTestOpenACC() { registerTestPointerLikeTypeInterfacePass(); registerTestRecipePopulatePass(); + registerTestOpenACCSupportPass(); } } // namespace test diff --git a/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp b/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp new file mode 100644 index 0000000..8bf984b --- /dev/null +++ b/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp @@ -0,0 +1,73 @@ +//===- TestOpenACCSupport.cpp - Test OpenACCSupport Analysis -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains test passes for testing the OpenACCSupport analysis. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace mlir::acc; + +namespace { + +struct TestOpenACCSupportPass + : public PassWrapper<TestOpenACCSupportPass, OperationPass<func::FuncOp>> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOpenACCSupportPass) + + StringRef getArgument() const override { return "test-acc-support"; } + + StringRef getDescription() const override { + return "Test OpenACCSupport analysis"; + } + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert<acc::OpenACCDialect>(); + registry.insert<memref::MemRefDialect>(); + } +}; + +void TestOpenACCSupportPass::runOnOperation() { + auto func = getOperation(); + + // Get the OpenACCSupport analysis + OpenACCSupport &support = getAnalysis<OpenACCSupport>(); + + // Walk through operations looking for test attributes + func.walk([&](Operation *op) { + // Check for test.var_name attribute. This is the marker used to identify + // the operations that need to be tested for getVariableName. 
+ if (op->hasAttr("test.var_name")) { + // For each result of this operation, try to get the variable name + for (auto result : op->getResults()) { + std::string foundName = support.getVariableName(result); + llvm::outs() << "op=" << *op << "\n\tgetVariableName=\"" << foundName + << "\"\n"; + } + } + }); +} + +} // namespace + +namespace mlir { +namespace test { + +void registerTestOpenACCSupportPass() { + PassRegistration<TestOpenACCSupportPass>(); +} + +} // namespace test +} // namespace mlir diff --git a/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp b/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp index ced0033..2470380 100644 --- a/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp +++ b/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp @@ -42,11 +42,10 @@ struct TestLoopUnrollingPass TestLoopUnrollingPass(const TestLoopUnrollingPass &) {} explicit TestLoopUnrollingPass(uint64_t unrollFactorParam, unsigned loopDepthParam, - bool annotateLoopParam, bool unrollFullParam) { + bool annotateLoopParam) { unrollFactor = unrollFactorParam; loopDepth = loopDepthParam; annotateLoop = annotateLoopParam; - unrollFull = unrollFactorParam; } void getDependentDialects(DialectRegistry ®istry) const override { @@ -54,6 +53,12 @@ struct TestLoopUnrollingPass } void runOnOperation() override { + if (!(unrollFactor.getValue() > 0 || unrollFactor.getValue() == -1)) { + emitError(UnknownLoc::get(&getContext()), + "Invalid option: 'unroll-factor' should be greater than 0 or " + "equal to -1"); + return signalPassFailure(); + } SmallVector<scf::ForOp, 4> loops; getOperation()->walk([&](scf::ForOp forOp) { if (getNestingDepth(forOp) == loopDepth) @@ -65,15 +70,15 @@ struct TestLoopUnrollingPass } }; for (auto loop : loops) { - if (unrollFull) + if (unrollFactor.getValue() == -1) (void)loopUnrollFull(loop); else (void)loopUnrollByFactor(loop, unrollFactor, annotateFn); } } - Option<uint64_t> unrollFactor{*this, "unroll-factor", - llvm::cl::desc("Loop unroll factor."), - llvm::cl::init(1)}; + Option<int64_t> unrollFactor{*this, "unroll-factor", + llvm::cl::desc("Loop unroll factor."), + llvm::cl::init(1)}; Option<bool> annotateLoop{*this, "annotate", llvm::cl::desc("Annotate unrolled iterations."), llvm::cl::init(false)}; @@ -82,9 +87,6 @@ struct TestLoopUnrollingPass llvm::cl::init(false)}; Option<unsigned> loopDepth{*this, "loop-depth", llvm::cl::desc("Loop depth."), llvm::cl::init(0)}; - Option<bool> unrollFull{*this, "unroll-full", - llvm::cl::desc("Full unroll loops."), - llvm::cl::init(false)}; }; } // namespace diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index ee4fa39..efbdbfb 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -2136,7 +2136,7 @@ struct TestTypeConversionDriver Location loc) -> Value { if (inputs.size() != 1 || !inputs[0].getType().isInteger(37)) return Value(); - return builder.create<UnrealizedConversionCastOp>(loc, type, inputs) + return UnrealizedConversionCastOp::create(builder, loc, type, inputs) .getResult(0); }); diff --git a/mlir/test/python/dialects/shard.py b/mlir/test/python/dialects/shard.py new file mode 100644 index 0000000..c3ba605 --- /dev/null +++ b/mlir/test/python/dialects/shard.py @@ -0,0 +1,67 @@ +# RUN: %PYTHON %s | FileCheck %s + +from mlir.ir import * +from mlir.dialects import shard +from mlir.dialects import func + + +def constructAndPrintInModule(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + 
with InsertionPoint(module.body):
+            f()
+        print(module)
+        module.operation.verify()
+    return f
+
+
+# CHECK-LABEL: TEST: testShardGrid
+@constructAndPrintInModule
+def testShardGrid():
+    # Test creating shard grids with different shapes
+    grid2d = shard.GridOp("grid_2d", [2, 2])
+    grid1d = shard.GridOp("grid_1d", [4])
+
+    # CHECK: shard.grid @grid_2d(shape = 2x2)
+    # CHECK: shard.grid @grid_1d(shape = 4)
+
+
+# CHECK-LABEL: TEST: testCollectiveOperations
+@constructAndPrintInModule
+def testCollectiveOperations():
+    # Create grid and types
+    grid_op = shard.GridOp("grid_2x2", [2, 2])
+    i32 = IntegerType.get_signless(32)
+    index_type = IndexType.get()
+    input_type = RankedTensorType.get([4, 2], i32)
+    gather_result_type = RankedTensorType.get([4, 4], i32)
+
+    # Create a function to hold the operations
+    func_type = FunctionType.get([input_type], [input_type])
+    test_func = func.FuncOp("test_collectives", func_type)
+
+    with InsertionPoint(test_func.add_entry_block()):
+        arg = test_func.entry_block.arguments[0]
+
+        gather_op = shard.AllGatherOp(
+            input=arg,
+            grid=FlatSymbolRefAttr.get("grid_2x2"),
+            grid_axes=DenseI16ArrayAttr.get([1]),
+            gather_axis=IntegerAttr.get(index_type, 1),
+            result=gather_result_type,
+        )
+
+        reduce_op = shard.AllReduceOp(
+            input=arg,
+            grid=FlatSymbolRefAttr.get("grid_2x2"),
+            reduction=shard.ReductionKind.Sum,
+            result=input_type,
+        )
+
+        func.ReturnOp([reduce_op])
+
+    # CHECK: shard.grid @grid_2x2(shape = 2x2)
+    # CHECK: func.func @test_collectives(%arg0: tensor<4x2xi32>) -> tensor<4x2xi32>
+    # CHECK: %all_gather = shard.all_gather %arg0 on @grid_2x2 grid_axes = [1] gather_axis = 1 : tensor<4x2xi32> -> tensor<4x4xi32>
+    # CHECK: %all_reduce = shard.all_reduce %arg0 on @grid_2x2 : tensor<4x2xi32> -> tensor<4x2xi32>
