Diffstat (limited to 'mlir/test')
18 files changed, 708 insertions, 413 deletions
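The deleted loadstore_matrix.mlir tests further down describe each mem_desc's layout as a (blocked shape, strides) tuple, e.g. mem_desc<32x64xf16, @block=[16, 16], @strides=[1, 32]> becomes ([2,4,16,16], [256,512,1,16]). A minimal Python sketch of that index arithmetic, matching the divsi/remsi/muli chains in those CHECK lines (helper names here are hypothetical, not part of XeGPU):

# Sketch only; blocked_layout/linear_offset are made-up illustration names.
def blocked_layout(shape, block, strides):
    rows, cols = shape
    brows, bcols = block
    blocked_shape = [rows // brows, cols // bcols, brows, bcols]
    per_block = brows * bcols  # elements per block, e.g. 16 * 16 = 256
    if strides == [1, rows]:   # column-major, as in @strides=[1, 32]
        blocked_strides = [per_block, (rows // brows) * per_block, 1, brows]
    else:                      # row-major default
        blocked_strides = [(cols // bcols) * per_block, per_block, bcols, 1]
    return blocked_shape, blocked_strides

def linear_offset(r, c, block, blocked_strides):
    brows, bcols = block
    index = [r // brows, c // bcols, r % brows, c % bcols]  # the divsi/remsi pairs
    return sum(i * s for i, s in zip(index, blocked_strides))

assert blocked_layout([32, 64], [16, 16], [1, 32]) == ([2, 4, 16, 16], [256, 512, 1, 16])
assert blocked_layout([32, 64], [16, 16], [64, 1]) == ([2, 4, 16, 16], [1024, 256, 16, 1])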
diff --git a/mlir/test/Conversion/XeGPUToXeVM/dpas.mlir b/mlir/test/Conversion/XeGPUToXeVM/dpas.mlir index a9ab0be..e6f22f0 100644 --- a/mlir/test/Conversion/XeGPUToXeVM/dpas.mlir +++ b/mlir/test/Conversion/XeGPUToXeVM/dpas.mlir @@ -1,13 +1,17 @@ // RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s -gpu.module @test_kernel { +#sg_map_a_f16 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#sg_map_b_f16 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +#sg_map_c_f32 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + +gpu.module @load_store_check { // CHECK-LABEL: func.func @dpas( // CHECK-SAME: %[[ARG0:.*]]: vector<8xf16>, %[[ARG1:.*]]: vector<16xf16>, %[[ARG2:.*]]: vector<8xf32> func.func @dpas(%a_loaded: vector<8xf16>, %b_loaded: vector<16xf16>, %c_loaded: vector<8xf32>) -> vector<8xf32> { // Loads are checked in a separate test. // CHECK: %[[D:.*]] = xevm.mma %[[ARG0]], %[[ARG1]], %[[ARG2]] {shape = <m = 8, n = 16, k = 16>, types = <d = f32, a = f16, b = f16, c = f32>} // CHECK-SAME: : (vector<8xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32> - %d = xegpu.dpas %a_loaded, %b_loaded, %c_loaded + %d = xegpu.dpas %a_loaded, %b_loaded, %c_loaded {a_layout = #sg_map_a_f16, b_layout = #sg_map_b_f16, c_layout = #sg_map_c_f32} : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32> return %d : vector<8xf32> } diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir deleted file mode 100644 index d4cb493..0000000 --- a/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir +++ /dev/null @@ -1,201 +0,0 @@ -// RUN: mlir-opt -split-input-file -convert-xegpu-to-xevm -cse %s | FileCheck %s - -gpu.module @test_kernel [#xevm.target<chip = "pvc">] { - - // e.g. for mem_desc<32x32xf16, @strides=[1, 16]> - // its memory layout tuple is (blocked shape = [1,1,32,32],strides=[1024,1024,32,1]) - //CHECK-LABEL: load_store_matrix_1 - gpu.func @load_store_matrix_1(%arg0: memref<4096xi8, 3>) -> f32 { - %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32> - - //CHECK: %[[TID:.*]] = gpu.thread_id x - //CHECK: %[[C1:.*]] = arith.constant 1 : index - //CHECK: %[[MUL1:.*]] = arith.muli %[[TID]], %[[C1]] : index - //CHECK: %[[C4:.*]] = arith.constant 4 : i32 - //CHECK: %[[MUL2:.*]] = arith.muli {{.*}}, %[[C4]] : i32 - //CHECK: llvm.load {{.*}} : !llvm.ptr<3> -> f32 - - %tid_x = gpu.thread_id x - %c0 = arith.constant 0 : index - %1 = xegpu.load_matrix %0[%c0, %tid_x]: !xegpu.mem_desc<32x32xf32>, index, index -> f32 - - //CHECK: llvm.store {{.*}}, {{.*}} : f32, !llvm.ptr<3> - - xegpu.store_matrix %1, %0[%c0, %tid_x]: f32, !xegpu.mem_desc<32x32xf32>, index, index - - gpu.return %1: f32 - } - -// e.g. 
for mem_desc<32x64xf16, @block=[16, 16], @strides=[1, 32]> - // its memory layout tuple is ([2,4,16,16],[256,512,1,16]) - //CHECK-LABEL: load_store_matrix_2 - gpu.func @load_store_matrix_2(%arg0: memref<4096xi8, 3>) -> f16 { - %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>> - //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[tid_x:.*]] = gpu.thread_id x - //CHECK: %[[c13:.*]] = arith.constant 13 : index - //CHECK: %[[c16:.*]] = arith.constant 16 : index - //CHECK: %[[offsetx_0:.*]] = arith.divsi %[[c13]], %[[c16]] : index - //CHECK: %[[offsetx_1:.*]] = arith.remsi %[[c13]], %[[c16]] : index - //CHECK: %[[offsety_0:.*]] = arith.divsi %[[tid_x]], %[[c16]] : index - //CHECK: %[[offsety_1:.*]] = arith.remsi %[[tid_x]], %[[c16]] : index - - //CHECK: %[[c256:.*]] = arith.constant 256 : index - //CHECK: %[[mul0:.*]] = arith.muli %[[offsetx_0]], %[[c256]] : index - //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index - //CHECK: %[[c512:.*]] = arith.constant 512 : index - //CHECK: %[[mul1:.*]] = arith.muli %[[offsety_0]], %[[c512]] : index - //CHECK: %[[add1:.*]] = arith.addi %[[mul1]], %[[add0]] : index - //CHECK: %[[c1:.*]] = arith.constant 1 : index - //CHECK: %[[mul2:.*]] = arith.muli %[[offsetx_1]], %[[c1]] : index - //CHECK: %[[add2:.*]] = arith.addi %[[mul2]], %[[add1]] : index - //CHECK: %[[mul3:.*]] = arith.muli %[[offsety_1]], %[[c16]] : index - //CHECK: %[[add3:.*]] = arith.addi %[[mul3]], %[[add2]] : index - - //CHECK: %[[loaded:.*]] = llvm.load {{.*}}: !llvm.ptr<3> -> f16 - - - %tid_x = gpu.thread_id x - %c13 = arith.constant 13 : index - %1 = xegpu.load_matrix %0[%c13, %tid_x]: !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>, index, index -> f16 - - //CHECK: llvm.store %[[loaded]], {{.*}} : f16, !llvm.ptr<3> - - xegpu.store_matrix %1, %0[%c13, %tid_x]: f16, !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>, index, index - gpu.return %1: f16 - } - - - // e.g. 
for mem_desc<32x64xf16, @block=[16, 16]> - // its memory layout tuple is ([2,4,16,16],[1024,256,16,1]) - //CHECK-LABEL: load_store_matrix_3 - gpu.func @load_store_matrix_3(%arg0: memref<4096xi8, 3>) -> f16 { - //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[view:.*]] = memref.view %arg0[%[[c0]]][] : memref<4096xi8, 3> to memref<2048xf16, 3> - %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>> - - //CHECK: %[[tid_x:.*]] = gpu.thread_id x - //CHECK: %[[c19:.*]] = arith.constant 19 : index - %tid_x = gpu.thread_id x - %c19 = arith.constant 19: index - - //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[view]] : memref<2048xf16, 3> -> index - //CHECK: %[[basePtrI64:.*]] = arith.index_castui %[[intptr]] : index to i32 - //CHECK: %[[c16:.*]] = arith.constant 16 : index - //CHECK: %[[offsetx_0:.*]] = arith.divsi %[[c19]], %[[c16]] : index - //CHECK: %[[offsetx_1:.*]] = arith.remsi %[[c19]], %[[c16]] : index - //CHECK: %[[offsety_0:.*]] = arith.divsi %[[tid_x]], %[[c16]] : index - //CHECK: %[[offsety_1:.*]] = arith.remsi %[[tid_x]], %[[c16]] : index - //CHECK: %[[c1024:.*]] = arith.constant 1024 : index - //CHECK: %[[mul0:.*]] = arith.muli %[[offsetx_0]], %[[c1024]] : index - //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index - //CHECK: %[[c256:.*]] = arith.constant 256 : index - //CHECK: %[[mul1:.*]] = arith.muli %[[offsety_0]], %[[c256]] : index - //CHECK: %[[add1:.*]] = arith.addi %[[mul1]], %[[add0]] : index - //CHECK: %[[mul2:.*]] = arith.muli %[[offsetx_1]], %[[c16]] : index - //CHECK: %[[add2:.*]] = arith.addi %[[mul2]], %[[add1]] : index - //CHECK: %[[c1:.*]] = arith.constant 1 : index - //CHECK: %[[mul3:.*]] = arith.muli %[[offsety_1]], %[[c1]] : index - //CHECK: %[[add3:.*]] = arith.addi %[[mul3]], %[[add2]] : index - - //CHECK: %[[loaded:.*]] = llvm.load {{.*}} : !llvm.ptr<3> -> f16 - %1 = xegpu.load_matrix %0[%c19, %tid_x]: !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>>, index, index -> f16 - - //CHECK: llvm.store %[[loaded]], {{.*}} : f16, !llvm.ptr<3> - xegpu.store_matrix %1, %0[%c19, %tid_x]: f16, !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>>, index, index - - //CHECK: gpu.return %[[loaded]] : f16 - gpu.return %1: f16 - } - - // e.g. 
for mem_desc<32x64xf16, @block=[16, 16], @strides=[1, 16]> - // its memory layout tuple is ([2,4,16,16],[256,512,1,16]) - //CHECK-LABEL: load_store_matrix_4 - gpu.func @load_store_matrix_4(%arg0: memref<4096xi8, 3>) -> vector<8xf16> { - %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>> - - //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[tid_x:.*]] = gpu.thread_id x - - //CHECK: %[[c16:.*]] = arith.constant 16 : index - //CHECK: %[[offsetx_0:.*]] = arith.divsi %[[c16]], %[[c16]] : index - //CHECK: %[[offsetx_1:.*]] = arith.remsi %[[c16]], %[[c16]] : index - //CHECK: %[[offsety_0:.*]] = arith.divsi %[[tid_x]], %[[c16]] : index - //CHECK: %[[offsety_1:.*]] = arith.remsi %[[tid_x]], %[[c16]] : index - - //CHECK: %[[c256:.*]] = arith.constant 256 : index - //CHECK: %[[mul0:.*]] = arith.muli %[[offsetx_0]], %[[c256]] : index - //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index - //CHECK: %[[c512:.*]] = arith.constant 512 : index - //CHECK: %[[mul1:.*]] = arith.muli %[[offsety_0]], %[[c512]] : index - //CHECK: %[[add1:.*]] = arith.addi %[[mul1]], %[[add0]] : index - //CHECK: %[[c1:.*]] = arith.constant 1 : index - //CHECK: %[[mul2:.*]] = arith.muli %[[offsetx_1]], %[[c1]] : index - //CHECK: %[[add2:.*]] = arith.addi %[[mul2]], %[[add1]] : index - //CHECK: %[[mul3:.*]] = arith.muli %[[offsety_1]], %[[c16]] : index - //CHECK: %[[add3:.*]] = arith.addi %[[mul3]], %[[add2]] : index - - //CHECK: %[[loaded:.*]] = llvm.load {{.*}}: !llvm.ptr<3> -> vector<8xf16> - - %tid_x = gpu.thread_id x - %c16 = arith.constant 16 : index - %1 = xegpu.load_matrix %0[%c16, %tid_x] : !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>, index, index -> vector<8xf16> - - //CHECK: llvm.store %[[loaded]], {{.*}} : vector<8xf16>, !llvm.ptr<3> - xegpu.store_matrix %1, %0[%c16, %tid_x] : vector<8xf16>, !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>, index, index - - gpu.return %1: vector<8xf16> - } - - - // e.g. 
for mem_desc<32x64xf16, @block=[16, 16]> - // its memory layout tuple is ([2,4,16,16],[1024,256,16,1]) - //CHECK-LABEL: load_store_matrix_5 - gpu.func @load_store_matrix_5(%arg0: memref<4096xi8, 3>) -> vector<8xf16> { - //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[view:.*]] = memref.view %arg0[%[[c0]]][] : memref<4096xi8, 3> to memref<2048xf16, 3> - - %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>> - - //CHECK: %[[c16:.*]] = arith.constant 16 : index - //CHECK: %[[c48:.*]] = arith.constant 48 : index - - %c16 = arith.constant 16 : index - %c48 = arith.constant 48 : index - - //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[view]] : memref<2048xf16, 3> -> index - //CHECK: %[[basePtrI64:.*]] = arith.index_castui %[[intptr]] : index to i32 - //CHECK: %[[offset0:.*]] = arith.divsi %[[c16]], %[[c16]] : index - //CHECK: %[[offset1:.*]] = arith.remsi %[[c16]], %[[c16]] : index - //CHECK: %[[offset2:.*]] = arith.divsi %[[c48]], %[[c16]] : index - //CHECK: %[[offset3:.*]] = arith.remsi %[[c48]], %[[c16]] : index - //CHECK: %[[c1024:.*]] = arith.constant 1024 : index - //CHECK: %[[mul0:.*]] = arith.muli %[[offset0]], %[[c1024]] : index - //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index - //CHECK: %[[c256:.*]] = arith.constant 256 : index - //CHECK: %[[mul1:.*]] = arith.muli %[[offset2]], %[[c256]] : index - //CHECK: %[[add1:.*]] = arith.addi %[[mul1]], %[[add0]] : index - //CHECK: %[[mul2:.*]] = arith.muli %[[offset1]], %[[c16]] : index - //CHECK: %[[add2:.*]] = arith.addi %[[mul2]], %[[add1]] : index - //CHECK: %[[c1:.*]] = arith.constant 1 : index - //CHECK: %[[mul3:.*]] = arith.muli %[[offset3]], %[[c1]] : index - //CHECK: %[[linearOffset:.*]] = arith.addi %[[mul3]], %[[add2]] : index - //CHECK: %[[linearOffsetI64:.*]] = arith.index_castui %[[linearOffset]] : index to i32 - //CHECK: %[[c2:.*]] = arith.constant 2 : i32 - //CHECK: %[[byteOffset:.*]] = arith.muli %[[linearOffsetI64]], %[[c2]] : i32 - //CHECK: %[[finalPtr:.*]] = arith.addi %[[basePtrI64]], %[[byteOffset]] : i32 - //CHECK: %[[ptr:.*]] = llvm.inttoptr %[[finalPtr]] : i32 to !llvm.ptr<3> - //CHECK: %[[loadedI16:.*]] = xevm.blockload %[[ptr]] : (!llvm.ptr<3>) -> vector<8xi16> - //CHECK: %[[loaded:.*]] = vector.bitcast %[[loadedI16]] : vector<8xi16> to vector<8xf16> - - %1 = xegpu.load_matrix %0[%c16, %c48] {subgroup_block_io}: !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>>, index, index -> vector<8xf16> - - //CHECK: %[[storeDataI16:.*]] = vector.bitcast %[[loaded]] : vector<8xf16> to vector<8xi16> - //CHECK: xevm.blockstore %[[ptr]], %[[storeDataI16]] : (!llvm.ptr<3>, vector<8xi16>) - - xegpu.store_matrix %1, %0[%c16, %c48] {subgroup_block_io}: vector<8xf16>, !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>>, index, index - - gpu.return %1: vector<8xf16> - } - -} diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir index e56079c..1169cd1 100644 --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -2235,6 +2235,136 @@ func.func @affine_leading_zero_no_outer_bound(%arg0: index, %arg1: index) -> ind // ----- +// CHECK-LABEL: func @delin_apply_cancel_exact +// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>) +// CHECK-COUNT-6: memref.store %[[ARG0]], %[[ARG1]][%[[ARG0]]] +// CHECK-NOT: memref.store +// CHECK: return +func.func @delin_apply_cancel_exact(%arg0: index, %arg1: 
memref<?xindex>) { + %a:3 = affine.delinearize_index %arg0 into (4, 5) : index, index, index + %b:3 = affine.delinearize_index %arg0 into (3, 4, 5) : index, index, index + %c:2 = affine.delinearize_index %arg0 into (20) : index, index + + %t1 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 5 + s2 * 20)>()[%a#2, %a#1, %a#0] + memref.store %t1, %arg1[%t1] : memref<?xindex> + + %t2 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s2 * 20 + s1 * 5)>()[%a#2, %a#1, %a#0] + memref.store %t2, %arg1[%t2] : memref<?xindex> + + %t3 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 20 + s2 * 5 + s0)>()[%a#2, %a#0, %a#1] + memref.store %t3, %arg1[%t3] : memref<?xindex> + + %t4 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 5 + s2 * 20)>()[%b#2, %b#1, %b#0] + memref.store %t4, %arg1[%t4] : memref<?xindex> + + %t5 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 20)>()[%c#1, %c#0] + memref.store %t5, %arg1[%t5] : memref<?xindex> + + %t6 = affine.apply affine_map<()[s0, s1] -> (s1 * 20 + s0)>()[%c#1, %c#0] + memref.store %t6, %arg1[%t5] : memref<?xindex> + + return +} + +// ----- + +// CHECK-LABEL: func @delin_apply_cancel_exact_dim +// CHECK: affine.for %[[arg1:.+]] = 0 to 256 +// CHECK: memref.store %[[arg1]] +// CHECK: return +func.func @delin_apply_cancel_exact_dim(%arg0: memref<?xindex>) { + affine.for %arg1 = 0 to 256 { + %a:3 = affine.delinearize_index %arg1 into (2, 2, 64) : index, index, index + %i = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 128 + d2 * 64)>(%a#2, %a#0, %a#1) + memref.store %i, %arg0[%i] : memref<?xindex> + } + return +} + +// ----- + +// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 + 512)> +// CHECK-LABEL: func @delin_apply_cancel_const_term +// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>) +// CHECK: affine.apply #[[$MAP]]()[%[[ARG0]]] +// CHECK: return +func.func @delin_apply_cancel_const_term(%arg0: index, %arg1: memref<?xindex>) { + %a:3 = affine.delinearize_index %arg0 into (2, 2, 64) : index, index, index + + %t1 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 64 + 512)>()[%a#2, %a#0, %a#1] + memref.store %t1, %arg1[%t1] : memref<?xindex> + + return +} + +// ----- + +// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 + 512)> +// CHECK-LABEL: func @delin_apply_cancel_var_term +// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>, %[[ARG2:.+]]: index) +// CHECK: affine.apply #[[$MAP]]()[%[[ARG2]], %[[ARG0]]] +// CHECK: return +func.func @delin_apply_cancel_var_term(%arg0: index, %arg1: memref<?xindex>, %arg2: index) { + %a:3 = affine.delinearize_index %arg0 into (2, 2, 64) : index, index, index + + %t1 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s1 * 128 + s2 * 64 + s3 + 512)>()[%a#2, %a#0, %a#1, %arg2] + memref.store %t1, %arg1[%t1] : memref<?xindex> + + return +} + +// ----- + +// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2 + s0 ceildiv 4)> +// CHECK-LABEL: func @delin_apply_cancel_nested_exprs +// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>) +// CHECK: affine.apply #[[$MAP]]()[%[[ARG0]]] +// CHECK: return +func.func @delin_apply_cancel_nested_exprs(%arg0: index, %arg1: memref<?xindex>) { + %a:2 = affine.delinearize_index %arg0 into (20) : index, index + + %t1 = affine.apply affine_map<()[s0, s1] -> ((s0 + s1 * 20) ceildiv 4 + (s1 * 20 + s0) * 2)>()[%a#1, %a#0] + memref.store %t1, %arg1[%t1] : memref<?xindex> + + return +} + +// ----- + +// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)> +// CHECK-LABEL: func 
@delin_apply_cancel_preserve_rotation +// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>) +// CHECK: %[[A:.+]]:2 = affine.delinearize_index %[[ARG0]] into (20) +// CHECK: affine.apply #[[$MAP]]()[%[[A]]#1, %[[ARG0]]] +// CHECK: return +func.func @delin_apply_cancel_preserve_rotation(%arg0: index, %arg1: memref<?xindex>) { + %a:2 = affine.delinearize_index %arg0 into (20) : index, index + + %t1 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 20 + s0)>()[%a#1, %a#0] + memref.store %t1, %arg1[%t1] : memref<?xindex> + + return +} + +// ----- + +// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 5)> +// CHECK-LABEL: func @delin_apply_dont_cancel_partial +// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>) +// CHECK: %[[A:.+]]:3 = affine.delinearize_index %[[ARG0]] into (3, 4, 5) +// CHECK: affine.apply #[[$MAP]]()[%[[A]]#2, %[[A]]#1] +// CHECK: return +func.func @delin_apply_dont_cancel_partial(%arg0: index, %arg1: memref<?xindex>) { + %a:3 = affine.delinearize_index %arg0 into (3, 4, 5) : index, index, index + + %t1 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 5)>()[%a#2, %a#1] + memref.store %t1, %arg1[%t1] : memref<?xindex> + + return +} + +// ----- + // CHECK-LABEL: @cst_value_to_cst_attr_basis_delinearize_index // CHECK-SAME: (%[[ARG0:.*]]: index) // CHECK: %[[RET:.*]]:3 = affine.delinearize_index %[[ARG0]] into (3, 4, 2) : index, index diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir index 35f520a..93a0336 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir @@ -1,5 +1,9 @@ // RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s +///---------------------------------------------------------------------------------------- +/// Tests for linalg.dot +///---------------------------------------------------------------------------------------- + // CHECK-LABEL: contraction_dot func.func @contraction_dot(%A: memref<1584xf32>, %B: memref<1584xf32>, %C: memref<f32>) { @@ -20,6 +24,10 @@ module attributes {transform.with_named_sequence} { // ----- +///---------------------------------------------------------------------------------------- +/// Tests for linalg.matvec +///---------------------------------------------------------------------------------------- + // CHECK-LABEL: contraction_matvec func.func @contraction_matvec(%A: memref<1584x1584xf32>, %B: memref<1584xf32>, %C: memref<1584xf32>) { @@ -41,6 +49,10 @@ module attributes {transform.with_named_sequence} { // ----- +///---------------------------------------------------------------------------------------- +/// Tests for linalg.matmul +///---------------------------------------------------------------------------------------- + // CHECK-LABEL: contraction_matmul func.func @contraction_matmul(%A: memref<1584x1584xf32>, %B: memref<1584x1584xf32>, %C: memref<1584x1584xf32>) { // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<1584x1584x1584xf32> @@ -138,6 +150,10 @@ module attributes {transform.with_named_sequence} { // ----- +///---------------------------------------------------------------------------------------- +/// Tests for linalg.batch_matmul +///---------------------------------------------------------------------------------------- + // CHECK-LABEL: contraction_batch_matmul func.func @contraction_batch_matmul(%A: memref<1584x1584x1584xf32>, %B: 
memref<1584x1584x1584xf32>, %C: memref<1584x1584x1584xf32>) { // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<1584x1584x1584x1584xf32> @@ -159,6 +175,10 @@ module attributes {transform.with_named_sequence} { // ----- +///---------------------------------------------------------------------------------------- +/// Tests for linalg.contract +///---------------------------------------------------------------------------------------- + // CHECK-LABEL: @matmul_as_contract // CHECK-SAME: %[[A:.*]]: tensor<24x12xf32> // CHECK-SAME: %[[B:.*]]: tensor<12x25xf32> @@ -220,6 +240,10 @@ module attributes {transform.with_named_sequence} { // ----- +///---------------------------------------------------------------------------------------- +/// Tests for linalg.fill +///---------------------------------------------------------------------------------------- + // CHECK-LABEL: func @test_vectorize_fill func.func @test_vectorize_fill(%A : memref<8x16xf32>, %arg0 : f32) { // CHECK: %[[V:.*]] = vector.broadcast {{.*}} : f32 to vector<8x16xf32> @@ -259,70 +283,14 @@ module attributes {transform.with_named_sequence} { // ----- -// CHECK-LABEL: func @test_vectorize_copy -func.func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) { - // CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32> - // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32> - memref.copy %A, %B : memref<8x16xf32> to memref<8x16xf32> - return -} +///---------------------------------------------------------------------------------------- +/// Tests for linalg.pack +///---------------------------------------------------------------------------------------- -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} +// Note, see a similar test in: +// * vectorization.mlir.
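The delin_apply_cancel_* cases in the affine canonicalize.mlir hunk above all hinge on one identity: re-linearizing the results of affine.delinearize_index with the matching strides reproduces the original index exactly, so the affine.apply can be folded away. A small Python sketch of that round trip (helper names are hypothetical):

def delinearize(x, bounds):
    # Mirrors affine.delinearize_index ... into (4, 5): inner results wrap,
    # the outermost result keeps the remaining quotient.
    inner = []
    for b in reversed(bounds):
        inner.append(x % b)
        x //= b
    return [x] + inner[::-1]

def linearize(index, bounds):
    # Mirrors maps like (s0 + s1 * 5 + s2 * 20): strides are suffix products.
    strides = [1]
    for b in reversed(bounds):
        strides.insert(0, strides[0] * b)
    return sum(i * s for i, s in zip(index, strides))

for x in range(400):
    assert linearize(delinearize(x, (4, 5)), (4, 5)) == x
    assert linearize(delinearize(x, (20,)), (20,)) == x

delin_apply_dont_cancel_partial and delin_apply_cancel_preserve_rotation mark the limits of the fold: only a subexpression that reconstructs the full linear index collapses to the original value, while leftover terms (a reused s0, a partial sum) survive.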
-// ----- - -// CHECK-LABEL: func @test_vectorize_copy_0d -func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) { - // CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>) - // CHECK: %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32> - // CHECK: %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32> - // CHECK: %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32> - // CHECK: vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32> - memref.copy %A, %B : memref<f32> to memref<f32> - return -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - -// CHECK-LABEL: func @test_vectorize_copy_complex -// CHECK-NOT: vector< -func.func @test_vectorize_copy_complex(%A : memref<8x16xcomplex<f32>>, %B : memref<8x16xcomplex<f32>>) { - memref.copy %A, %B : memref<8x16xcomplex<f32>> to memref<8x16xcomplex<f32>> - return -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - -// Input identical as the test in vectorization.mlir. Output is different - -// vector sizes are inferred (rather than user-specified) and hence _no_ -// masking was used. - -func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { +func.func @pack_no_padding(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> return %pack : tensor<4x1x32x16x2xf32> } @@ -336,7 +304,7 @@ module attributes {transform.with_named_sequence} { } } -// CHECK-LABEL: func.func @test_vectorize_pack( +// CHECK-LABEL: func.func @pack_no_padding( // CHECK-SAME: %[[VAL_0:.*]]: tensor<32x8x16xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { // CHECK-DAG: %[[VAL_2:.*]] = ub.poison : f32 @@ -349,13 +317,16 @@ module attributes {transform.with_named_sequence} { // ----- -func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { +// Note, see a similar test in: +// * vectorization.mlir. 
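pack_no_padding above and pack_with_padding below follow directly from how linalg.pack shapes its destination: each tiled source dim shrinks to ceil(size / tile), the outer dims are then optionally permuted, and the tiles are appended as trailing dims. A sketch with a hypothetical helper name:

import math

def packed_dest_shape(src, inner_dims_pos, inner_tiles, outer_dims_perm=None):
    outer = list(src)
    for d, t in zip(inner_dims_pos, inner_tiles):
        outer[d] = math.ceil(src[d] / t)  # number of tiles along dim d
    if outer_dims_perm is not None:
        outer = [outer[d] for d in outer_dims_perm]
    return outer + list(inner_tiles)

# pack_no_padding: 8 and 16 divide evenly by the tiles [2, 16].
assert packed_dest_shape([32, 8, 16], [2, 1], [16, 2], [1, 2, 0]) == [4, 1, 32, 16, 2]
# pack_with_padding: 7 and 15 round up, which is what forces the padding_value.
assert packed_dest_shape([32, 7, 15], [2, 1], [16, 2]) == [32, 4, 1, 16, 2]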
+ +func.func @pack_with_padding(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { %pad = arith.constant 0.000000e+00 : f32 %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> return %pack : tensor<32x4x1x16x2xf32> } -// CHECK-LABEL: func.func @test_vectorize_padded_pack( +// CHECK-LABEL: func.func @pack_with_padding( // CHECK-SAME: %[[VAL_0:.*]]: tensor<32x7x15xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { // CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32 @@ -377,6 +348,10 @@ module attributes {transform.with_named_sequence} { // ----- +///---------------------------------------------------------------------------------------- +/// Tests for linalg.map +///---------------------------------------------------------------------------------------- + func.func @vectorize_map(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) { linalg.map ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%arg2 : memref<64xf32>) @@ -403,6 +378,10 @@ module attributes {transform.with_named_sequence} { // ----- +///---------------------------------------------------------------------------------------- +/// Tests for linalg.transpose +///---------------------------------------------------------------------------------------- + func.func @vectorize_transpose(%arg0: memref<16x32x64xf32>, %arg1: memref<32x64x16xf32>) { linalg.transpose ins(%arg0 : memref<16x32x64xf32>) outs(%arg1 : memref<32x64x16xf32>) @@ -424,6 +403,10 @@ module attributes {transform.with_named_sequence} { // ----- +///---------------------------------------------------------------------------------------- +/// Tests for linalg.reduce +///---------------------------------------------------------------------------------------- + func.func @vectorize_reduce(%arg0: memref<16x32x64xf32>, %arg1: memref<16x64xf32>) { linalg.reduce ins(%arg0 : memref<16x32x64xf32>) outs(%arg1 : memref<16x64xf32>) @@ -449,6 +432,10 @@ module attributes {transform.with_named_sequence} { // ----- +///---------------------------------------------------------------------------------------- +/// Tests for linalg.generic +///---------------------------------------------------------------------------------------- + #matmul_trait = { indexing_maps = [ affine_map<(m, n, k) -> (m, k)>, @@ -1446,6 +1433,8 @@ module attributes {transform.with_named_sequence} { // ----- +// TODO: Two Linalg Ops in one test - either split or document "why".
+ // CHECK-DAG: #[[$M6:.*]] = affine_map<(d0, d1) -> (d0, 0)> // CHECK-LABEL: func @fused_broadcast_red_2d @@ -1896,3 +1885,65 @@ module attributes {transform.with_named_sequence} { } } +// ----- + +///---------------------------------------------------------------------------------------- +/// Tests for memref.copy +///---------------------------------------------------------------------------------------- + +// CHECK-LABEL: func @test_vectorize_copy +func.func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) { + // CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32> + // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32> + memref.copy %A, %B : memref<8x16xf32> to memref<8x16xf32> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @test_vectorize_copy_0d +func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) { + // CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>) + // CHECK: %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32> + // CHECK: %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32> + // CHECK: %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32> + // CHECK: vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32> + memref.copy %A, %B : memref<f32> to memref<f32> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @test_vectorize_copy_complex +// CHECK-NOT: vector< +func.func @test_vectorize_copy_complex(%A : memref<8x16xcomplex<f32>>, %B : memref<8x16xcomplex<f32>>) { + memref.copy %A, %B : memref<8x16xcomplex<f32>> to memref<8x16xcomplex<f32>> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir index 11bea8d..1304a90 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir @@ -1307,14 +1307,17 @@ func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf /// Tests for linalg.pack 
///---------------------------------------------------------------------------------------- -// Input identical as the test in vectorization-with-patterns.mlir. Output is -// different - vector sizes are inferred (rather than user-specified) and hence -// masking was used. +// This packing requires no padding, so no out-of-bounds read/write vector Ops. -// CHECK-LABEL: func @test_vectorize_pack +// Note, see a similar test in: +// * vectorization-with-patterns.mlir +// The output is identical (the input vector sizes == the inferred vector +// sizes, i.e. the tensor sizes). + +// CHECK-LABEL: func @pack_no_padding // CHECK-SAME: %[[SRC:.*]]: tensor<32x8x16xf32>, // CHECK-SAME: %[[DEST:.*]]: tensor<4x1x32x16x2xf32> -func.func @test_vectorize_pack(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { +func.func @pack_no_padding(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { %pack = linalg.pack %src outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> return %pack : tensor<4x1x32x16x2xf32> } @@ -1325,9 +1328,9 @@ func.func @test_vectorize_pack(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x1 // CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> // CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32> // CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK: %[[write:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]] +// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]] // CHECK-SAME: {in_bounds = [true, true, true, true, true]} : vector<4x1x32x16x2xf32>, tensor<4x1x32x16x2xf32> -// CHECK: return %[[write]] : tensor<4x1x32x16x2xf32> +// CHECK: return %[[WRITE]] : tensor<4x1x32x16x2xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%src: !transform.any_op {transform.readonly}) { @@ -1339,14 +1342,18 @@ module attributes {transform.with_named_sequence} { // ----- -// Input identical as the test in vectorization-with-patterns.mlir. Output is -// different - vector sizes are inferred (rather than user-specified) and hence -// masking was used. +// This packing does require padding, so there are out-of-bounds read/write +// vector Ops. + +// Note, see a similar test in: +// * vectorization-with-patterns.mlir. +// The output is different (the input vector sizes != inferred vector sizes, +// i.e. the tensor sizes). 
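Concretely, the in_bounds = [true, false, false] read of vector<32x8x16xf32> from tensor<32x7x15xf32> in the CHECKs below comes from rounding every tiled dim up to a whole number of tiles; dims that round up are read out of bounds and filled with the padding value. A sketch, again with a hypothetical helper:

import math

def padded_read_shape(src, inner_dims_pos, inner_tiles):
    shape = list(src)
    for d, t in zip(inner_dims_pos, inner_tiles):
        shape[d] = math.ceil(src[d] / t) * t  # round up to whole tiles
    in_bounds = [shape[i] == src[i] for i in range(len(src))]
    return shape, in_bounds

assert padded_read_shape([32, 7, 15], [2, 1], [16, 2]) == ([32, 8, 16], [True, False, False])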
-// CHECK-LABEL: func @test_vectorize_padded_pack +// CHECK-LABEL: func @pack_with_padding // CHECK-SAME: %[[SRC:.*]]: tensor<32x7x15xf32>, // CHECK-SAME: %[[DEST:.*]]: tensor<32x4x1x16x2xf32> -func.func @test_vectorize_padded_pack(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { +func.func @pack_with_padding(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { %pad = arith.constant 0.000000e+00 : f32 %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> return %pack : tensor<32x4x1x16x2xf32> @@ -1364,9 +1371,9 @@ func.func @test_vectorize_padded_pack(%src: tensor<32x7x15xf32>, %dest: tensor<3 // CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> // CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> // CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK: %[[write:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]] +// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]] // CHECK-SAME: {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> -// CHECK: return %[[write]] : tensor<32x4x1x16x2xf32> +// CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { @@ -1378,10 +1385,46 @@ module attributes {transform.with_named_sequence} { // ----- -// CHECK-LABEL: func @test_vectorize_dynamic_pack +// This packing does require padding, so there are out-of-bounds read/write +// vector Ops. + +// Note, see a similar test in: +// * vectorization-with-patterns.mlir. +// The output is identical (in both cases the vector sizes are inferred). 
+ +// CHECK-LABEL: func @pack_with_padding_no_vector_sizes +// CHECK-SAME: %[[SRC:.*]]: tensor<32x7x15xf32>, +// CHECK-SAME: %[[DEST:.*]]: tensor<32x4x1x16x2xf32> +func.func @pack_with_padding_no_vector_sizes(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { + %pad = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> + return %pack : tensor<32x4x1x16x2xf32> +} +// CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]] +// CHECK-SAME: {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32> +// CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> +// CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]] +// CHECK-SAME: {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> +// CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 : !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @pack_with_dynamic_dims // CHECK-SAME: %[[SRC:.*]]: tensor<?x?xf32>, // CHECK-SAME: %[[DEST:.*]]: tensor<?x?x16x2xf32> -func.func @test_vectorize_dynamic_pack(%src: tensor<?x?xf32>, %dest: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> { +func.func @pack_with_dynamic_dims(%src: tensor<?x?xf32>, %dest: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> { %pack = linalg.pack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?xf32> -> tensor<?x?x16x2xf32> return %pack : tensor<?x?x16x2xf32> } @@ -1418,64 +1461,6 @@ module attributes {transform.with_named_sequence} { } } -// ----- - -// CHECK-LABEL: func @test_vectorize_pack_no_vector_sizes -// CHECK-SAME: %[[SRC:.*]]: tensor<64x4xf32>, -// CHECK-SAME: %[[DEST:.*]]: tensor<2x4x16x2xf32> -func.func @test_vectorize_pack_no_vector_sizes(%src: tensor<64x4xf32>, %dest: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> { - %pack = linalg.pack %src outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %dest : tensor<64x4xf32> -> tensor<2x4x16x2xf32> - return %pack : tensor<2x4x16x2xf32> -} -// CHECK-DAG: %[[CST:.*]] = ub.poison : f32 -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST]] -// CHECK-SAME: {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32> -// CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<64x4xf32> to vector<4x16x2x2xf32> -// CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32> -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]] -// CHECK-SAME: {in_bounds = [true, true, true, true]} : 
vector<2x4x16x2xf32>, tensor<2x4x16x2xf32> -// CHECK: return %[[WRITE]] : tensor<2x4x16x2xf32> - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 : !transform.any_op - transform.yield - } -} - -// ----- - -// CHECK-LABEL: test_vectorize_padded_pack_no_vector_sizes -// CHECK-SAME: %[[SRC:.*]]: tensor<32x7x15xf32>, -// CHECK-SAME: %[[DEST:.*]]: tensor<32x4x1x16x2xf32> -func.func @test_vectorize_padded_pack_no_vector_sizes(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { - %pad = arith.constant 0.000000e+00 : f32 - %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> - return %pack : tensor<32x4x1x16x2xf32> -} -// CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]] -// CHECK-SAME: {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32> -// CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> -// CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]] -// CHECK-SAME: {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> -// CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32> - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 : !transform.any_op - transform.yield - } -} - - ///---------------------------------------------------------------------------------------- /// Tests for other Ops ///---------------------------------------------------------------------------------------- diff --git a/mlir/test/Dialect/Tosa/tosa-attach-target.mlir b/mlir/test/Dialect/Tosa/tosa-attach-target.mlir index d6c886c..a0c59c0 100644 --- a/mlir/test/Dialect/Tosa/tosa-attach-target.mlir +++ b/mlir/test/Dialect/Tosa/tosa-attach-target.mlir @@ -1,12 +1,14 @@ // RUN: mlir-opt %s -split-input-file -tosa-attach-target="profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround,dynamic level=none" | FileCheck %s --check-prefix=CHECK-ALL // RUN: mlir-opt %s -split-input-file -tosa-attach-target="level=8k" | FileCheck %s --check-prefix=CHECK-LVL-8K // RUN: mlir-opt %s -split-input-file -tosa-attach-target | FileCheck %s --check-prefix=CHECK-DEFAULT +// RUN: mlir-opt %s -split-input-file -tosa-attach-target="specification_version=1.1.draft" | FileCheck %s --check-prefix=CHECK-VERSION-1P1 // ----- -// CHECK-ALL: module attributes {tosa.target_env = #tosa.target_env<level = none, profiles = [pro_int, pro_fp], extensions = [int16, int4, bf16, fp8e4m3, fp8e5m2, fft, variable, controlflow, doubleround, inexactround, dynamic]>} -// CHECK-LVL-8K: module attributes 
{tosa.target_env = #tosa.target_env<level = "8k", profiles = [], extensions = []>} -// CHECK-DEFAULT: module attributes {tosa.target_env = #tosa.target_env<level = "8k", profiles = [], extensions = []>} +// CHECK-ALL: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.0", level = none, profiles = [pro_int, pro_fp], extensions = [int16, int4, bf16, fp8e4m3, fp8e5m2, fft, variable, controlflow, doubleround, inexactround, dynamic]>} +// CHECK-LVL-8K: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.0", level = "8k", profiles = [], extensions = []>} +// CHECK-DEFAULT: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.0", level = "8k", profiles = [], extensions = []>} +// CHECK-VERSION-1P1: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.1.draft", level = "8k", profiles = [], extensions = []>} // CHECK-LABEL: test_simple func.func @test_simple(%arg0 : tensor<1x1x1x1xf32>, %arg1 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> { %1 = tosa.add %arg0, %arg1 : (tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> diff --git a/mlir/test/Dialect/Tosa/tosa-validation-version-1p0-invalid.mlir b/mlir/test/Dialect/Tosa/tosa-validation-version-1p0-invalid.mlir new file mode 100644 index 0000000..51089df --- /dev/null +++ b/mlir/test/Dialect/Tosa/tosa-validation-version-1p0-invalid.mlir @@ -0,0 +1,21 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-attach-target="specification_version=1.0 profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,dynamic,doubleround,inexactround" -tosa-validate="strict-op-spec-alignment" + +// ----- + +func.func @test_matmul_fp8_mixed_precision_operands(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E5M2>) -> tensor<1x14x28xf16> { + %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN> + %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E5M2>}> : () -> tensor<1xf8E5M2> + // expected-error@+1 {{'tosa.matmul' op illegal: the target specification version (1.0) is not backwards compatible with the op compliance specification version (1.1)}} + %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E5M2>, tensor<1xf8E4M3FN>, tensor<1xf8E5M2>) -> tensor<1x14x28xf16> + return %0 : tensor<1x14x28xf16> +} + +// ----- + +func.func @test_matmul_fp8_input_fp32_acc_type(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E4M3FN>) -> tensor<1x14x28xf32> { + %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN> + %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN> + // expected-error@+1 {{'tosa.matmul' op illegal: the target specification version (1.0) is not backwards compatible with the op compliance specification version (1.1)}} + %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E4M3FN>, tensor<1xf8E4M3FN>, tensor<1xf8E4M3FN>) -> tensor<1x14x28xf32> + return %0 : tensor<1x14x28xf32> +} diff --git a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir new file mode 100644 index 0000000..8164509 --- /dev/null +++ b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir @@ -0,0 +1,20 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-attach-target="specification_version=1.1.draft 
profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround" -tosa-validate="strict-op-spec-alignment" | FileCheck %s + +// ----- + +func.func @test_matmul_fp8_mixed_precision_operands(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E5M2>) -> tensor<1x14x28xf16> { + %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN> + %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E5M2>}> : () -> tensor<1xf8E5M2> + %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E5M2>, tensor<1xf8E4M3FN>, tensor<1xf8E5M2>) -> tensor<1x14x28xf16> + return %0 : tensor<1x14x28xf16> +} + +// ----- + +// CHECK-LABEL: test_matmul_fp8_input_fp32_acc_type +func.func @test_matmul_fp8_input_fp32_acc_type(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E4M3FN>) -> tensor<1x14x28xf32> { + %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN> + %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN> + %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E4M3FN>, tensor<1xf8E4M3FN>, tensor<1xf8E4M3FN>) -> tensor<1x14x28xf32> + return %0 : tensor<1x14x28xf32> +} diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index ebbe3ce..228ef69d 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -858,7 +858,7 @@ func.func @load_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16> // ----- func.func @load_mem_desc_invalid_result_size(%arg0: !xegpu.mem_desc<16x64xf16>) { - // expected-error@+1 {{data shape must not exceed mem_desc shape}} + // expected-error@+1 {{result shape must not exceed mem_desc shape}} %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> vector<32x16xf16> return } @@ -871,14 +871,6 @@ func.func @load_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>) { } // ----- -func.func @load_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - %data2 = xegpu.load_matrix %arg0[8, 8] <{subgroup_block_io}>: !xegpu.mem_desc<16x64xf16> -> vector<16x16xf16> - return -} - - -// ----- func.func @store_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf32>) { // expected-error@+1 {{failed to verify that all of {mem_desc, data} have same element type}} xegpu.store_matrix %arg1, %arg0[8, 8] : vector<16x16xf32>, !xegpu.mem_desc<16x64xf16> @@ -900,16 +892,30 @@ func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve } // ----- -func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> +func.func @mem_desc_subview_size_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) { + // expected-error@+1 {{result shape must not exceed source shape}} + %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<32x16xf16> + return +} + +// ----- +func.func @mem_desc_subview_layout_mismatch(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride =[1, 16]>>) { + // expected-error@+1 {{result must inherit the source strides}} + %data = 
xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride =[1, 16]>> -> !xegpu.mem_desc<8x16xf16> + return +} + +// ----- +func.func @mem_desc_subview_element_type_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) { + // expected-error@+1 {{failed to verify that all of {src, res} have same element type}} + %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf32, #xegpu.mem_layout<stride =[64, 1]>> return } // ----- -func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> +func.func @mem_desc_subview_rank_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) { + // expected-error@+1 {{result rank must not exceed source rank}} + %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<4x8x16xf16> return } diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 0a10f68..bb37902 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -825,73 +825,53 @@ gpu.func @create_mem_desc_with_stride() { gpu.return } -// CHECK: gpu.func @load_matrix([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>) -gpu.func @load_matrix(%arg0: !xegpu.mem_desc<16x64xf16>) { +// CHECK: gpu.func @load_mem_desc([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>) +gpu.func @load_mem_desc(%arg0: !xegpu.mem_desc<16x64xf16>) { // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> vector<8x16xf16> %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> vector<8x16xf16> gpu.return } -// CHECK: gpu.func @load_matrix_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) -gpu.func @load_matrix_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) { +// CHECK: gpu.func @load_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) +gpu.func @load_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) { // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8x16xf16> %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8x16xf16> gpu.return } -// CHECK: gpu.func @simt_load_matrix(%arg0: !xegpu.mem_desc<16x64xf16>) -gpu.func @simt_load_matrix(%arg0: !xegpu.mem_desc<16x64xf16>) { - // CHECK: xegpu.load_matrix [[ARG0]][8, 16] : !xegpu.mem_desc<16x64xf16> -> vector<1xf16> - %data = xegpu.load_matrix %arg0[8, 16]: !xegpu.mem_desc<16x64xf16> -> vector<1xf16> - gpu.return -} - -// CHECK: gpu.func @simt_load_matrix_subgroup_block_io(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>) -gpu.func @simt_load_matrix_subgroup_block_io(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>) { - // CHECK: xegpu.load_matrix [[ARG0]][8, 16] <{subgroup_block_io}>: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>> -> vector<8xf16> - %data = xegpu.load_matrix %arg0[8, 16] <{subgroup_block_io}>: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>> -> vector<8xf16> - gpu.return -} - -// CHECK: gpu.func @simt_load_matrix_vector(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) -gpu.func 
@simt_load_matrix_vector(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
-  // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8xf16>
-  %data = xegpu.load_matrix %arg0[8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8xf16>
-  gpu.return
-}
 
-// CHECK: gpu.func @store_matrix([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
-gpu.func @store_matrix(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
+// CHECK: gpu.func @store_mem_desc([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_mem_desc(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
   // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.mem_desc<16x64xf16>
   xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16>
   gpu.return
 }
 
-// CHECK: gpu.func @store_matrix_with_stride([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
-gpu.func @store_matrix_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<16x16xf16>) {
+// CHECK: gpu.func @store_mem_desc_with_stride([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<16x16xf16>) {
   // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][0, 8] : vector<16x16xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
   xegpu.store_matrix %arg1, %arg0[0, 8]: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
   gpu.return
 }
 
-// CHECK: gpu.func @simt_store_matrix(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<1xf16>) {
-gpu.func @simt_store_matrix(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<1xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 16] : vector<1xf16>, !xegpu.mem_desc<16x64xf16>
-  xegpu.store_matrix %arg1, %arg0[8, 16]: vector<1xf16>, !xegpu.mem_desc<16x64xf16>
+// CHECK: gpu.func @mem_desc_subview([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>)
+gpu.func @mem_desc_subview(%arg0: !xegpu.mem_desc<16x64xf16>) {
+  //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>>
+  %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>>
   gpu.return
 }
 
-// CHECK: gpu.func @simt_store_matrix_subgroup_block_io(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>, %arg1: vector<8xf16>)
-gpu.func @simt_store_matrix_subgroup_block_io(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>, %arg1: vector<8xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 16] <{subgroup_block_io}>: vector<8xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>
-  xegpu.store_matrix %arg1, %arg0[8, 16] <{subgroup_block_io}>: vector<8xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>
+// CHECK: gpu.func @mem_desc_subview_lower_rank([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>)
+gpu.func @mem_desc_subview_lower_rank(%arg0: !xegpu.mem_desc<16x64xf16>) {
+  //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<16xf16, #xegpu.mem_layout<stride = [64, 1]>>
+  %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<16xf16, #xegpu.mem_layout<stride = [64, 1]>>
   gpu.return
 }
 
-// CHECK: gpu.func @simt_store_matrix_vector(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<8xf16>) {
-gpu.func @simt_store_matrix_vector(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<8xf16>) {
-  // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<8xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
-  xegpu.store_matrix %arg1, %arg0[8, 8] : vector<8xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
+// CHECK: gpu.func @mem_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
+gpu.func @mem_desc_subview_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
+  //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [1, 16]>>
+  %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [1, 16]>>
   gpu.return
 }
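All of the strided cases above rely on the same bookkeeping: the explicit #xegpu.mem_layout<stride = ...> attribute on the mem_desc fixes how a 2-D [row, col] index linearizes into the flat backing buffer. For readers checking the offset arithmetic in the CHECK lines by hand, a small pure-Python sketch of that linearization may help; the helper below is illustrative only and is not part of MLIR or of these tests.

    # Illustrative only (not MLIR code): linearize a 2-D index under an
    # explicit stride vector, as in #xegpu.mem_layout<stride = [1, 16]>.
    def linearize(index, strides):
        assert len(index) == len(strides)
        return sum(i * s for i, s in zip(index, strides))

    # For a 16x64xf16 mem_desc with stride = [1, 16], element [8, 8] sits
    # 8 * 1 + 8 * 16 = 136 elements from the base pointer.
    print(linearize([8, 8], [1, 16]))  # -> 136
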
diff --git a/mlir/test/Target/LLVMIR/Import/function-attributes.ll b/mlir/test/Target/LLVMIR/Import/function-attributes.ll
index cc3d799..00d09ba 100644
--- a/mlir/test/Target/LLVMIR/Import/function-attributes.ll
+++ b/mlir/test/Target/LLVMIR/Import/function-attributes.ll
@@ -393,6 +393,12 @@ declare void @alwaysinline_attribute() alwaysinline
 
 // -----
 
+; CHECK-LABEL: @inlinehint_attribute
+; CHECK-SAME: attributes {inline_hint}
+declare void @inlinehint_attribute() inlinehint
+
+// -----
+
 ; CHECK-LABEL: @optnone_attribute
 ; CHECK-SAME: attributes {no_inline, optimize_none}
 declare void @optnone_attribute() noinline optnone
diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir
index 69814f2..cc243c8 100644
--- a/mlir/test/Target/LLVMIR/llvmir.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir
@@ -2555,6 +2555,17 @@ llvm.func @always_inline() attributes { always_inline } {
 
 // -----
 
+// CHECK-LABEL: @inline_hint
+// CHECK-SAME: #[[ATTRS:[0-9]+]]
+llvm.func @inline_hint() attributes { inline_hint } {
+  llvm.return
+}
+
+// CHECK: #[[ATTRS]]
+// CHECK-SAME: inlinehint
+
+// -----
+
 // CHECK-LABEL: @optimize_none
 // CHECK-SAME: #[[ATTRS:[0-9]+]]
 llvm.func @optimize_none() attributes { no_inline, optimize_none } {
diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
index 727c84c..8c5c8e8 100644
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
@@ -276,10 +276,8 @@ void TestLinalgTransforms::runOnOperation() {
       Operation *consumer = opOperand->getOwner();
       // If we have a pack/unpack consumer and a producer that has multiple
       // uses, do not apply the folding patterns.
-      if (isa<linalg::PackOp, linalg::UnPackOp>(consumer) &&
-          isa<TilingInterface>(producer) && !producer->hasOneUse())
-        return false;
-      return true;
+      return !(isa<linalg::PackOp, linalg::UnPackOp>(consumer) &&
+               isa<TilingInterface>(producer) && !producer->hasOneUse());
     };
     applyFoldIntoPackAndUnpackPatterns(rootOp, controlFn);
   }
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
index 97fc699..496f18b 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
+++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
@@ -938,10 +938,10 @@ public:
 
 // These are automatically generated by ODS but are not used as the Transform
 // dialect uses a different dispatch mechanism to support dialect extensions.
-LLVM_ATTRIBUTE_UNUSED static OptionalParseResult
+[[maybe_unused]] static OptionalParseResult
 generatedTypeParser(AsmParser &parser, StringRef *mnemonic, Type &value);
-LLVM_ATTRIBUTE_UNUSED static LogicalResult
-generatedTypePrinter(Type def, AsmPrinter &printer);
+[[maybe_unused]] static LogicalResult generatedTypePrinter(Type def,
+                                                           AsmPrinter &printer);
 
 #define GET_TYPEDEF_CLASSES
 #include "TestTransformDialectExtensionTypes.cpp.inc"
diff --git a/mlir/test/mlir-pdll/CodeGen/CPP/general.pdll b/mlir/test/mlir-pdll/CodeGen/CPP/general.pdll
index 4e869e5..4be30d8 100644
--- a/mlir/test/mlir-pdll/CodeGen/CPP/general.pdll
+++ b/mlir/test/mlir-pdll/CodeGen/CPP/general.pdll
@@ -28,7 +28,7 @@
 // CHECK: operation "test.op3"
 // CHECK: )mlir", context), std::forward<ConfigsT>(configs)...)
 
-// CHECK: static void LLVM_ATTRIBUTE_UNUSED populateGeneratedPDLLPatterns(::mlir::RewritePatternSet &patterns, ConfigsT &&...configs) {
+// CHECK{LITERAL}: [[maybe_unused]] static void populateGeneratedPDLLPatterns(::mlir::RewritePatternSet &patterns, ConfigsT &&...configs) {
 // CHECK-NEXT:   patterns.add<GeneratedPDLLPattern0>(patterns.getContext(), configs...);
 // CHECK-NEXT:   patterns.add<NamedPattern>(patterns.getContext(), configs...);
 // CHECK-NEXT:   patterns.add<GeneratedPDLLPattern1>(patterns.getContext(), configs...);
diff --git a/mlir/test/python/dialects/gpu/dialect.py b/mlir/test/python/dialects/gpu/dialect.py
index 26ee9f3..66c4018 100644
--- a/mlir/test/python/dialects/gpu/dialect.py
+++ b/mlir/test/python/dialects/gpu/dialect.py
@@ -1,6 +1,7 @@
 # RUN: %PYTHON %s | FileCheck %s
 
 from mlir.ir import *
+import mlir.ir as ir
 import mlir.dialects.gpu as gpu
 import mlir.dialects.gpu.passes
 from mlir.passmanager import *
@@ -64,3 +65,95 @@ def testObjectAttr():
     # CHECK: #gpu.object<#nvvm.target, kernels = <[#gpu.kernel_metadata<"kernel", () -> ()>]>, "BC\C0\DE5\14\00\00\05\00\00\00b\0C0$MY\BEf">
     print(o)
     assert o.kernels == kernelTable
+
+
+# CHECK-LABEL: testGPUFuncOp
+@run
+def testGPUFuncOp():
+    assert gpu.GPUFuncOp.__doc__ is not None
+    module = Module.create()
+    with InsertionPoint(module.body):
+        gpu_module_name = StringAttr.get("gpu_module")
+        gpumodule = gpu.GPUModuleOp(gpu_module_name)
+        block = gpumodule.bodyRegion.blocks.append()
+
+        def builder(func: gpu.GPUFuncOp) -> None:
+            gpu.GlobalIdOp(gpu.Dimension.x)
+            gpu.ReturnOp([])
+
+        with InsertionPoint(block):
+            name = StringAttr.get("kernel0")
+            func_type = ir.FunctionType.get(inputs=[], results=[])
+            type_attr = TypeAttr.get(func_type)
+            func = gpu.GPUFuncOp(type_attr, name)
+            func.attributes["sym_name"] = name
+            func.attributes["gpu.kernel"] = UnitAttr.get()
+
+            try:
+                func.entry_block
+                assert False, "Expected RuntimeError"
+            except RuntimeError as e:
+                assert (
+                    str(e)
+                    == "Entry block does not exist for kernel0. Do you need to call the add_entry_block() method on this GPUFuncOp?"
+                )
+
+            block = func.add_entry_block()
+            with InsertionPoint(block):
+                builder(func)
+
+            try:
+                func.add_entry_block()
+                assert False, "Expected RuntimeError"
+            except RuntimeError as e:
+                assert str(e) == "Entry block already exists for kernel0"
+
+            func = gpu.GPUFuncOp(
+                func_type,
+                sym_name="kernel1",
+                kernel=True,
+                body_builder=builder,
+                known_block_size=[1, 2, 3],
+                known_grid_size=DenseI32ArrayAttr.get([4, 5, 6]),
+            )
+
+            assert func.name.value == "kernel1"
+            assert func.function_type.value == func_type
+            assert func.arg_attrs == None
+            assert func.res_attrs == None
+            assert func.arguments == []
+            assert func.entry_block == func.body.blocks[0]
+            assert func.is_kernel
+            assert func.known_block_size == DenseI32ArrayAttr.get(
+                [1, 2, 3]
+            ), func.known_block_size
+            assert func.known_grid_size == DenseI32ArrayAttr.get(
+                [4, 5, 6]
+            ), func.known_grid_size
+
+            func = gpu.GPUFuncOp(
+                func_type,
+                sym_name="non_kernel_func",
+                body_builder=builder,
+            )
+            assert not func.is_kernel
+            assert func.known_block_size is None
+            assert func.known_grid_size is None
+
+    print(module)
+
+    # CHECK: gpu.module @gpu_module
+    # CHECK: gpu.func @kernel0() kernel {
+    # CHECK: %[[VAL_0:.*]] = gpu.global_id x
+    # CHECK: gpu.return
+    # CHECK: }
+    # CHECK: gpu.func @kernel1() kernel attributes
+    # CHECK-SAME: known_block_size = array<i32: 1, 2, 3>
+    # CHECK-SAME: known_grid_size = array<i32: 4, 5, 6>
+    # CHECK: %[[VAL_0:.*]] = gpu.global_id x
+    # CHECK: gpu.return
+    # CHECK: }
+    # CHECK: gpu.func @non_kernel_func() {
+    # CHECK: %[[VAL_0:.*]] = gpu.global_id x
+    # CHECK: gpu.return
+    # CHECK: }
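The new testGPUFuncOp exercises two construction paths for gpu.func: the low-level form taking a TypeAttr plus explicitly set attributes, and the keyword-based form with a body_builder callback. Distilled to its essentials, the keyword path looks like the sketch below; it reuses only calls that appear in the test itself, and the module and kernel names are hypothetical.

    # Minimal sketch of the keyword-based GPUFuncOp path (names are hypothetical).
    from mlir.ir import Context, Location, Module, InsertionPoint, StringAttr, FunctionType
    import mlir.dialects.gpu as gpu

    with Context(), Location.unknown():
        module = Module.create()
        with InsertionPoint(module.body):
            gpu_module = gpu.GPUModuleOp(StringAttr.get("demo_module"))
            body = gpu_module.bodyRegion.blocks.append()
        with InsertionPoint(body):
            gpu.GPUFuncOp(
                FunctionType.get(inputs=[], results=[]),
                sym_name="demo_kernel",
                kernel=True,
                # body_builder receives the new GPUFuncOp and fills its entry block.
                body_builder=lambda f: gpu.ReturnOp([]),
            )
        print(module)
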
diff --git a/mlir/test/python/dialects/openacc.py b/mlir/test/python/dialects/openacc.py
new file mode 100644
index 0000000..8f2142a
--- /dev/null
+++ b/mlir/test/python/dialects/openacc.py
@@ -0,0 +1,170 @@
+# RUN: %PYTHON %s | FileCheck %s
+from mlir.ir import (
+    Context,
+    FunctionType,
+    Location,
+    Module,
+    InsertionPoint,
+    IntegerType,
+    IndexType,
+    MemRefType,
+    F32Type,
+    Block,
+    ArrayAttr,
+    Attribute,
+    UnitAttr,
+    StringAttr,
+    DenseI32ArrayAttr,
+    ShapedType,
+)
+from mlir.dialects import openacc, func, arith, memref
+from mlir.extras import types
+
+
+def run(f):
+    print("\n// TEST:", f.__name__)
+    with Context(), Location.unknown():
+        f()
+    return f
+
+
+@run
+def testParallelMemcpy():
+    module = Module.create()
+
+    dynamic = ShapedType.get_dynamic_size()
+    memref_f32_1d_any = MemRefType.get([dynamic], types.f32())
+
+    with InsertionPoint(module.body):
+        function_type = FunctionType.get(
+            [memref_f32_1d_any, memref_f32_1d_any, types.i64()], []
+        )
+        f = func.FuncOp(
+            type=function_type,
+            name="memcpy_idiom",
+        )
+        f.attributes["sym_visibility"] = StringAttr.get("public")
+
+        with InsertionPoint(f.add_entry_block()):
+            c1024 = arith.ConstantOp(types.i32(), 1024)
+            c128 = arith.ConstantOp(types.i32(), 128)
+
+            arg0, arg1, arg2 = f.arguments
+
+            copied = openacc.copyin(
+                acc_var=arg0.type,
+                var=arg0,
+                var_type=types.f32(),
+                bounds=[],
+                async_operands=[],
+                implicit=False,
+                structured=True,
+            )
+            created = openacc.create_(
+                acc_var=arg1.type,
+                var=arg1,
+                var_type=types.f32(),
+                bounds=[],
+                async_operands=[],
+                implicit=False,
+                structured=True,
+            )
+
+            parallel_op = openacc.ParallelOp(
+                asyncOperands=[],
+                waitOperands=[],
+                numGangs=[c1024],
+                numWorkers=[],
+                vectorLength=[c128],
+                reductionOperands=[],
+                privateOperands=[],
+                firstprivateOperands=[],
+                dataClauseOperands=[],
+            )
+
+            # Set required device_type and segment attributes to satisfy verifier
+            acc_device_none = ArrayAttr.get([Attribute.parse("#acc.device_type<none>")])
+            parallel_op.numGangsDeviceType = acc_device_none
+            parallel_op.numGangsSegments = DenseI32ArrayAttr.get([1])
+            parallel_op.vectorLengthDeviceType = acc_device_none
+
+            parallel_block = Block.create_at_start(parent=parallel_op.region, arg_types=[])
+
+            with InsertionPoint(parallel_block):
+                c0 = arith.ConstantOp(types.i64(), 0)
+                c1 = arith.ConstantOp(types.i64(), 1)
+
+                loop_op = openacc.LoopOp(
+                    results_=[],
+                    lowerbound=[c0],
+                    upperbound=[f.arguments[2]],
+                    step=[c1],
+                    gangOperands=[],
+                    workerNumOperands=[],
+                    vectorOperands=[],
+                    tileOperands=[],
+                    cacheOperands=[],
+                    privateOperands=[],
+                    reductionOperands=[],
+                    firstprivateOperands=[],
+                )
+
+                # Set loop attributes: gang and independent on device_type<none>
+                acc_device_none = ArrayAttr.get([Attribute.parse("#acc.device_type<none>")])
+                loop_op.gang = acc_device_none
+                loop_op.independent = acc_device_none
+
+                loop_block = Block.create_at_start(
+                    parent=loop_op.region, arg_types=[types.i64()]
+                )
+
+                with InsertionPoint(loop_block):
+                    idx = arith.index_cast(out=IndexType.get(), in_=loop_block.arguments[0])
+                    val = memref.load(memref=copied, indices=[idx])
+                    memref.store(value=val, memref=created, indices=[idx])
+                    openacc.YieldOp([])
+
+                openacc.YieldOp([])
+
+            deleted = openacc.delete(
+                acc_var=copied,
+                bounds=[],
+                async_operands=[],
+                implicit=False,
+                structured=True,
+            )
+            copied = openacc.copyout(
+                acc_var=created,
+                var=arg1,
+                var_type=types.f32(),
+                bounds=[],
+                async_operands=[],
+                implicit=False,
+                structured=True,
+            )
+            func.ReturnOp([])
+
+    print(module)
+
+    # CHECK: TEST: testParallelMemcpy
+    # CHECK-LABEL: func.func public @memcpy_idiom(
+    # CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: memref<?xf32>, %[[ARG2:.*]]: i64) {
+    # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1024 : i32
+    # CHECK: %[[CONSTANT_1:.*]] = arith.constant 128 : i32
+    # CHECK: %[[COPYIN_0:.*]] = acc.copyin varPtr(%[[ARG0]] : memref<?xf32>) -> memref<?xf32>
+    # CHECK: %[[CREATE_0:.*]] = acc.create varPtr(%[[ARG1]] : memref<?xf32>) -> memref<?xf32>
+    # CHECK: acc.parallel num_gangs({%[[CONSTANT_0]] : i32}) vector_length(%[[CONSTANT_1]] : i32) {
+    # CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : i64
+    # CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : i64
+    # CHECK: acc.loop gang control(%[[VAL_0:.*]] : i64) = (%[[CONSTANT_2]] : i64) to (%[[ARG2]] : i64) step (%[[CONSTANT_3]] : i64) {
+    # CHECK: %[[INDEX_CAST_0:.*]] = arith.index_cast %[[VAL_0]] : i64 to index
+    # CHECK: %[[LOAD_0:.*]] = memref.load %[[COPYIN_0]]{{\[}}%[[INDEX_CAST_0]]] : memref<?xf32>
+    # CHECK: memref.store %[[LOAD_0]], %[[CREATE_0]]{{\[}}%[[INDEX_CAST_0]]] : memref<?xf32>
+    # CHECK: acc.yield
+    # CHECK: } attributes {independent = [#acc.device_type<none>]}
+    # CHECK: acc.yield
+    # CHECK: }
+    # CHECK: acc.delete accPtr(%[[COPYIN_0]] : memref<?xf32>)
+    # CHECK: acc.copyout accPtr(%[[CREATE_0]] : memref<?xf32>) to varPtr(%[[ARG1]] : memref<?xf32>)
+    # CHECK: return
+    # CHECK: }
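Aside from the parallel/loop nest, the new test pins down the structured data-clause idiom: acc.copyin and acc.create materialize the device buffers up front, and acc.delete and acc.copyout release them or write results back at the end. A stripped-down sketch of just that pairing follows, assuming the same bindings; the helper name is hypothetical and the caller must supply a memref-typed SSA value valid at the insertion point.

    # Hypothetical helper showing only the structured copyin/copyout pairing;
    # `buf` must be a memref-typed SSA value valid at the insertion point.
    from mlir.dialects import openacc
    from mlir.extras import types

    def emit_copy_pair(buf):
        dev = openacc.copyin(
            acc_var=buf.type, var=buf, var_type=types.f32(),
            bounds=[], async_operands=[], implicit=False, structured=True,
        )
        # ... the compute construct reading/writing `dev` is built here ...
        openacc.copyout(
            acc_var=dev, var=buf, var_type=types.f32(),
            bounds=[], async_operands=[], implicit=False, structured=True,
        )
        return dev
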
diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py
index cb4cfc8c..1d4ede1 100644
--- a/mlir/test/python/ir/operation.py
+++ b/mlir/test/python/ir/operation.py
@@ -569,12 +569,30 @@ def testOperationAttributes():
     # CHECK: Attribute value b'text'
     print(f"Attribute value {sattr.value_bytes}")
 
+    # Python dict-style iteration
     # We don't know in which order the attributes are stored.
-    # CHECK-DAG: NamedAttribute(dependent="text")
-    # CHECK-DAG: NamedAttribute(other.attribute=3.000000e+00 : f64)
-    # CHECK-DAG: NamedAttribute(some.attribute=1 : i8)
-    for attr in op.attributes:
-        print(str(attr))
+    # CHECK-DAG: dependent
+    # CHECK-DAG: other.attribute
+    # CHECK-DAG: some.attribute
+    for name in op.attributes:
+        print(name)
+
+    # Basic dict-like introspection
+    # CHECK: True
+    print("some.attribute" in op.attributes)
+    # CHECK: False
+    print("missing" in op.attributes)
+    # CHECK: Keys: ['dependent', 'other.attribute', 'some.attribute']
+    print("Keys:", sorted(op.attributes.keys()))
+    # CHECK: Values count 3
+    print("Values count", len(op.attributes.values()))
+    # CHECK: Items count 3
+    print("Items count", len(op.attributes.items()))
+
+    # Dict() conversion test
+    d = {k: v.value for k, v in dict(op.attributes).items()}
+    # CHECK: Dict mapping {'dependent': 'text', 'other.attribute': 3.0, 'some.attribute': 1}
+    print("Dict mapping", d)
 
     # Check that exceptions are raised as expected.
     try:
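After this change, op.attributes behaves like a Python mapping keyed by attribute name (iteration yields names, and membership tests, keys()/values()/items(), and dict() conversion all work), rather than iterating as NamedAttribute objects. A minimal sketch of the resulting API, assuming `op` is an existing operation with attributes already set:

    # Dict-style use of an operation's attribute map, mirroring the test above;
    # `op` is assumed to be an existing mlir.ir operation with attributes set.
    for name in op.attributes:
        print(name, op.attributes[name])      # look up each attribute by name
    if "some.attribute" in op.attributes:     # membership test by name
        print(sorted(op.attributes.keys()))
    # Snapshot the map as a plain dict of Python values.
    snapshot = {k: v.value for k, v in dict(op.attributes).items()}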