Diffstat (limited to 'mlir/test')
86 files changed, 2170 insertions, 957 deletions
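
Several hunks below exercise a new unit `alignment` attribute on memref/vector load and store ops. As a minimal, self-contained sketch of the pattern these tests check (the function name is hypothetical and the shapes/alignment value are illustrative, taken from the updated VectorToLLVM tests), the following IR is expected to verify and, per those tests, lower to llvm.load/llvm.store carrying a matching alignment:

// Hypothetical example distilled from the updated tests below; not part of the diff itself.
func.func @aligned_load_store(%mem : memref<200x100xf32>, %i : index, %j : index) {
  %v = vector.load %mem[%i, %j] { alignment = 8 } : memref<200x100xf32>, vector<8xf32>
  vector.store %v, %mem[%i, %j] { alignment = 8 } : memref<200x100xf32>, vector<8xf32>
  return
}
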
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 89568e7..a4a942d 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -167,6 +167,10 @@ if(MLIR_ENABLE_SYCL_RUNNER) list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime) endif() +if(MLIR_ENABLE_LEVELZERO_RUNNER) + list(APPEND MLIR_TEST_DEPENDS mlir_levelzero_runtime) +endif() + if (MLIR_RUN_ARM_SME_TESTS AND NOT ARM_SME_ABI_ROUTINES_SHLIB) list(APPEND MLIR_TEST_DEPENDS mlir_arm_sme_abi_stubs) endif() diff --git a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir index 2a7be0b..e6321e9 100644 --- a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir +++ b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir @@ -85,6 +85,28 @@ func.func @load_i1(%src: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i : return %0: i1 } +// CHECK-LABEL: func @load_aligned +// CHECK-SAME: (%[[SRC:.+]]: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %[[IDX:.+]]: index) +func.func @load_aligned(%src: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i : index) -> i1 { + // CHECK: spirv.Load "StorageBuffer" {{.*}} ["Aligned", 32] : i8 + %0 = memref.load %src[%i] { alignment = 32 } : memref<4xi1, #spirv.storage_class<StorageBuffer>> + return %0: i1 +} + +// CHECK-LABEL: func @load_aligned_nontemporal +func.func @load_aligned_nontemporal(%src: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i : index) -> i1 { + // CHECK: spirv.Load "StorageBuffer" {{.*}} ["Aligned|Nontemporal", 32] : i8 + %0 = memref.load %src[%i] { alignment = 32, nontemporal = true } : memref<4xi1, #spirv.storage_class<StorageBuffer>> + return %0: i1 +} + +// CHECK-LABEL: func @load_aligned_psb +func.func @load_aligned_psb(%src: memref<4xi1, #spirv.storage_class<PhysicalStorageBuffer>>, %i : index) -> i1 { + // CHECK: %[[VAL:.+]] = spirv.Load "PhysicalStorageBuffer" {{.*}} ["Aligned", 32] : i8 + %0 = memref.load %src[%i] { alignment = 32 } : memref<4xi1, #spirv.storage_class<PhysicalStorageBuffer>> + return %0: i1 +} + // CHECK-LABEL: func @store_i1 // CHECK-SAME: %[[DST:.+]]: memref<4xi1, #spirv.storage_class<StorageBuffer>>, // CHECK-SAME: %[[IDX:.+]]: index diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 580b09d..e505767 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -681,3 +681,17 @@ llvm.func @ex2(%input : f32, %pred : i1) { %1 = nvvm.inline_ptx "ex2.approx.ftz.f32 $0, $1;" (%input), predicate = %pred : f32, i1 -> f32 llvm.return } + +// ----- + +// CHECK-LABEL: @nvvm_pmevent +llvm.func @nvvm_pmevent() { + // CHECK: %[[S0:.+]] = llvm.mlir.constant(10 : i32) : i32 + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "pmevent $0;", "n" %[[S0]] : (i32) -> () + + nvvm.pmevent id = 10 + // CHECK: %[[S1:.+]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "pmevent $0;", "n" %[[S1]] : (i32) -> () + nvvm.pmevent id = 4 + llvm.return +} diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir index 31e17fb..5a424a8 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir @@ -1679,6 +1679,16 @@ func.func @load_0d(%memref : memref<200x100xf32>, %i : index, %j : index) -> vec // ----- +func.func 
@load_with_alignment(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<8xf32> { + %0 = vector.load %memref[%i, %j] { alignment = 8 } : memref<200x100xf32>, vector<8xf32> + return %0 : vector<8xf32> +} + +// CHECK-LABEL: func @load_with_alignment +// CHECK: llvm.load {{.*}} {alignment = 8 : i64} : !llvm.ptr -> vector<8xf32> + +// ----- + //===----------------------------------------------------------------------===// // vector.store //===----------------------------------------------------------------------===// @@ -1785,6 +1795,16 @@ func.func @store_0d(%memref : memref<200x100xf32>, %i : index, %j : index) { // ----- +func.func @store_with_alignment(%memref : memref<200x100xf32>, %i : index, %j : index, %val : vector<4xf32>) { + vector.store %val, %memref[%i, %j] {alignment = 8} : memref<200x100xf32>, vector<4xf32> + return +} + +// CHECK-LABEL: func @store_with_alignment +// CHECK: llvm.store %{{.*}} {alignment = 8 : i64} : vector<4xf32>, !llvm.ptr + +// ----- + //===----------------------------------------------------------------------===// // vector.maskedload //===----------------------------------------------------------------------===// @@ -1839,6 +1859,16 @@ func.func @masked_load_index_scalable(%arg0: memref<?xindex>, %arg1: vector<[16] // ----- +func.func @masked_load_with_alignment(%arg0: memref<?xf32>, %arg1: vector<16xi1>, %arg2: vector<16xf32>, %arg3: index) -> vector<16xf32> { + %0 = vector.maskedload %arg0[%arg3], %arg1, %arg2 { alignment = 2 } : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32> + return %0 : vector<16xf32> +} + +// CHECK-LABEL: func @masked_load_with_alignment +// CHECK: llvm.intr.masked.load %{{.*}} {alignment = 2 : i32} : (!llvm.ptr, vector<16xi1>, vector<16xf32>) -> vector<16xf32> + +// ----- + //===----------------------------------------------------------------------===// // vector.maskedstore //===----------------------------------------------------------------------===// @@ -1891,6 +1921,16 @@ func.func @masked_store_index_scalable(%arg0: memref<?xindex>, %arg1: vector<[16 // ----- +func.func @masked_store_with_alignment(%arg0: memref<?xf32>, %arg1: vector<16xi1>, %arg2: vector<16xf32>, %arg3: index) { + vector.maskedstore %arg0[%arg3], %arg1, %arg2 { alignment = 2 } : memref<?xf32>, vector<16xi1>, vector<16xf32> + return +} + +// CHECK-LABEL: func @masked_store_with_alignment +// CHECK: llvm.intr.masked.store %{{.*}} {alignment = 2 : i32} : vector<16xf32>, vector<16xi1> into !llvm.ptr + +// ----- + //===----------------------------------------------------------------------===// // vector.gather //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir index 57afa12..8ca3dd6 100644 --- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir +++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir @@ -54,18 +54,20 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) { // CHECK: func @test_expand_shape // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index func.func @test_expand_shape(%offset_i: index, %offset_j: index) { - // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3> + // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3> // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16> // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index - // CHECK: 
amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]] - // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, 3> + // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index + // CHECK: %[[IDXL:.*]] = affine.linearize_index [%[[C0]], %[[C0]]] by (64, 64) : index + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDXM]]], %[[LOCAL]][%[[IDXL]]] + // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3> - %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> + %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace> %mem = memref.alloc() : memref<8192xf16> - %expand = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16> into memref<64x128xf16> + %expand_mem = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16> into memref<64x128xf16> + %expand_alloc = memref.expand_shape %alloc [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace> %c0 = arith.constant 0 : index - amdgpu.gather_to_lds %expand[%offset_i, %offset_j], %alloc[%c0, %c0] + amdgpu.gather_to_lds %expand_mem[%offset_i, %offset_j], %expand_alloc[%c0, %c0] : vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu_lds_addrspace> func.return } @@ -80,15 +82,82 @@ func.func @test_collapse_shape(%offset_i: index, %offset_j: index) { // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3> // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16> // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index - // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]] + // CHECK: %[[INDICES_MEM:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index + // CHECK: %[[INDICES_LDS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (64, 64) : index, index + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES_MEM]]#0, %[[INDICES_MEM]]#1], %[[LOCAL]][%[[INDICES_LDS]]#0, %[[INDICES_LDS]]#1] // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3> %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace> + %collapse_alloc = memref.collapse_shape %alloc [[0, 1]] : memref<64x64xf16, #gpu_lds_addrspace> into memref<4096xf16, #gpu_lds_addrspace> %mem = memref.alloc() : memref<64x128xf16> - %collapse = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16> + %collapse_mem = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16> %c0 = arith.constant 0 : index - amdgpu.gather_to_lds %collapse[%offset_i], %alloc[%c0, %c0] + amdgpu.gather_to_lds %collapse_mem[%offset_i], %collapse_alloc[%offset_j] + : vector<8xf16>, memref<8192xf16>, memref<4096xf16, #gpu_lds_addrspace> + func.return +} + + +// ----- + +#gpu_lds_addrspace = 3 + + +// CHECK: func @test_expand_shape_src_raw_buffer +// CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index +func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) { + // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3> + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG1]], %[[ARG2]]] by (64, 128) : index + // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[IDXM]]], %[[LOCAL]][%[[C0]]] + // CHECK-SAME: vector<8xf16>, 
memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3> + + %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace> + %expand_mem = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>> into memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>> + + %c0 = arith.constant 0 : index + amdgpu.gather_to_lds %expand_mem[%offset_i, %offset_j], %alloc[%c0] + : vector<8xf16>, memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu_lds_addrspace> + func.return +} + +// ----- + +#gpu_lds_addrspace = 3 + +// CHECK: func @test_expand_shape_dst_only +// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index +func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) { + // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3> + // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16> + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[IDX_LDS:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (64, 64) : index + // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]]], %[[LOCAL]][%[[IDX_LDS]]] + // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3> + + %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace> + %mem = memref.alloc() : memref<8192xf16> + %expand_alloc = memref.expand_shape %alloc [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace> + + %c0 = arith.constant 0 : index + amdgpu.gather_to_lds %mem[%offset_i], %expand_alloc[%offset_j, %c0] : vector<8xf16>, memref<8192xf16>, memref<64x64xf16, #gpu_lds_addrspace> func.return } + +// ----- + +#gpu_lds_addrspace = 3 + +// CHECK: func @test_nop +// CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index +func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) { + // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3> + // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[ARG1]]], %[[LOCAL]][%[[ARG2]]] + // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3> + + %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace> + amdgpu.gather_to_lds %mem[%offset_i], %alloc[%offset_j] + : vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu_lds_addrspace> + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 0d2fd24..66e7dd4 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -230,3 +230,11 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16> func.return } + +// ----- + +func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16, strided<[?]>, #gpu.address_space<workgroup>>) { + // expected-error@+1 {{'amdgpu.gather_to_lds' op destination type inner most dim must be contiguous}} + amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16, strided<[?]>, #gpu.address_space<workgroup>> + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index fe78b53..87e11c0 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -539,13 +539,15 @@ 
func.func @transpose_load(%idx1 : index, %idx2 : index, %mem : memref<128x32xf16 } // CHECK-LABEL: func @gather_to_lds -func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>) { +func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>, %smem3 : memref<?x?xf16, strided<[?, 1]>, #gpu.address_space<workgroup>>) { // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}] // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}] // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}] + // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}] amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32x32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>> amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem1[%idx1] : vector<2xf16>, memref<32x32xf16>, memref<32xf16, #gpu.address_space<workgroup>> amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>> + amdgpu.gather_to_lds %mem1[%idx1], %smem3[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<?x?xf16, strided<[?, 1]>, #gpu.address_space<workgroup>> func.return } diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 3d5a46d..78f6782 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -654,7 +654,7 @@ func.func @signExtendConstant() -> i16 { // CHECK: return %[[cres]] func.func @signExtendConstantSplat() -> vector<4xi16> { %c-2 = arith.constant -2 : i8 - %splat = vector.splat %c-2 : vector<4xi8> + %splat = vector.broadcast %c-2 : i8 to vector<4xi8> %ext = arith.extsi %splat : vector<4xi8> to vector<4xi16> return %ext : vector<4xi16> } @@ -682,7 +682,7 @@ func.func @unsignedExtendConstant() -> i16 { // CHECK: return %[[cres]] func.func @unsignedExtendConstantSplat() -> vector<4xi16> { %c2 = arith.constant 2 : i8 - %splat = vector.splat %c2 : vector<4xi8> + %splat = vector.broadcast %c2 : i8 to vector<4xi8> %ext = arith.extui %splat : vector<4xi8> to vector<4xi16> return %ext : vector<4xi16> } @@ -866,7 +866,7 @@ func.func @truncExtsiVector(%arg0: vector<2xi32>) -> vector<2xi16> { // CHECK: return %[[cres]] func.func @truncConstantSplat() -> vector<4xi8> { %c-2 = arith.constant -2 : i16 - %splat = vector.splat %c-2 : vector<4xi16> + %splat = vector.broadcast %c-2 : i16 to vector<4xi16> %trunc = arith.trunci %splat : vector<4xi16> to vector<4xi8> return %trunc : vector<4xi8> } @@ -2334,7 +2334,7 @@ func.func @constant_FPtoUI_splat() -> vector<4xi32> { // CHECK: %[[C0:.+]] = arith.constant dense<2> : vector<4xi32> // CHECK: return %[[C0]] %c0 = arith.constant 2.0 : f32 - %splat = vector.splat %c0 : vector<4xf32> + %splat = vector.broadcast %c0 : f32 to vector<4xf32> %res = arith.fptoui %splat : vector<4xf32> to vector<4xi32> return %res : vector<4xi32> } @@ -2374,7 +2374,7 @@ func.func @constant_FPtoSI_splat() -> vector<4xi32> { // CHECK: %[[C0:.+]] = arith.constant dense<-2> : vector<4xi32> // CHECK: return %[[C0]] %c0 = arith.constant -2.0 : f32 - %splat = vector.splat %c0 : vector<4xf32> + %splat = vector.broadcast %c0 : f32 to 
vector<4xf32> %res = arith.fptosi %splat : vector<4xf32> to vector<4xi32> return %res : vector<4xi32> } @@ -2413,7 +2413,7 @@ func.func @constant_SItoFP_splat() -> vector<4xf32> { // CHECK: %[[C0:.+]] = arith.constant dense<2.000000e+00> : vector<4xf32> // CHECK: return %[[C0]] %c0 = arith.constant 2 : i32 - %splat = vector.splat %c0 : vector<4xi32> + %splat = vector.broadcast %c0 : i32 to vector<4xi32> %res = arith.sitofp %splat : vector<4xi32> to vector<4xf32> return %res : vector<4xf32> } @@ -2442,7 +2442,7 @@ func.func @constant_UItoFP_splat() -> vector<4xf32> { // CHECK: %[[C0:.+]] = arith.constant dense<2.000000e+00> : vector<4xf32> // CHECK: return %[[C0]] %c0 = arith.constant 2 : i32 - %splat = vector.splat %c0 : vector<4xi32> + %splat = vector.broadcast %c0 : i32 to vector<4xi32> %res = arith.uitofp %splat : vector<4xi32> to vector<4xf32> return %res : vector<4xf32> } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index ee1fdfa..9cc0bf8 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -17,6 +17,18 @@ module attributes {gpu.container_module} { return } + // CHECK-LABEL:func @launch_with_module_func_attr(%{{.*}}: index) + func.func @launch_with_module_func_attr(%sz : index) { + // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) module(@test_module) function(@test_kernel_func) + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz) + threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) + module(@test_module) function(@test_kernel_func) { + // CHECK: gpu.terminator + gpu.terminator + } + return + } + // CHECK-LABEL:func @args(%{{.*}}: index, %{{.*}}: index, %{{.*}}: f32, %{{.*}}: memref<?xf32, 1>) { func.func @args(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32,1>) { // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir index d48fa05..0490118 100644 --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -509,7 +509,7 @@ func.func @launch_cluster() { // CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1> // ----- -// This test tests the two optional attributes kernelModule and kernelFunc for gpu.launch +// This test tests the two optional attributes `module` and `function` for gpu.launch // CHECK-LABEL: func.func @testKernelAttributes() // CHECK: gpu.launch_func @test_module::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]]) // CHECK: gpu.module @test_module @@ -523,15 +523,16 @@ func.func @testKernelAttributes() { %bDimZ = arith.constant 8 : index gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ) - threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) { + threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) + module(@test_module) function(@test_kernel_func) { "some_op"(%bx, %tx) : (index, index) -> () gpu.terminator - } {kernelModule = @test_module, kernelFunc = @test_kernel_func} + } return } // ----- -// This 
test tests the two optional attributes kernelModule and kernelFunc for gpu.launch, when kernelModule already exists. +// This test tests the two optional attributes `module` and `function` for gpu.launch, when kernelModule already exists. // CHECK-LABEL: gpu.module @existing_module // CHECK: gpu.func @test_kernel_func() @@ -556,15 +557,16 @@ func.func @testExistingModule() { %bDimZ = arith.constant 8 : index gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ) - threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) { + threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) + module(@existing_module) function(@test_kernel_func) { "some_op"(%bx, %tx) : (index, index) -> () gpu.terminator - } {kernelModule = @existing_module, kernelFunc = @test_kernel_func} + } return } // ----- -// This test tests the optional attribute kernelModule for gpu.launch. +// This test tests the optional attribute `module` for gpu.launch. // CHECK-LABEL: func.func @testKernelModuleOnly() // CHECK: gpu.launch_func @test_module::@testKernelModuleOnly_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]]) // CHECK: gpu.module @test_module @@ -578,15 +580,16 @@ func.func @testKernelModuleOnly() { %bDimZ = arith.constant 8 : index gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ) - threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) { + threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) + module(@test_module) { "some_op"(%bx, %tx) : (index, index) -> () gpu.terminator - } {kernelModule = @test_module} + } return } // ----- -// This test tests the optional attribute kernelFunc for gpu.launch. +// This test tests the optional attribute `function` for gpu.launch. // CHECK-LABEL: func.func @testKernelFuncOnly() // CHECK: gpu.launch_func @test_kernel_func::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]]) @@ -601,15 +604,16 @@ func.func @testKernelFuncOnly() { %bDimZ = arith.constant 8 : index gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ) - threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) { + threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) + function(@test_kernel_func) { "some_op"(%bx, %tx) : (index, index) -> () gpu.terminator - } {kernelFunc = @test_kernel_func} + } return } // ----- -// This test tests gpu.launch when optional attributes kernelModule and kernelFunc are not specified. +// This test tests gpu.launch when optional attributes `module` and `function` are not specified. 
// CHECK-LABEL: func.func @testNoAttributes() // CHECK: gpu.launch_func @testNoAttributes_kernel::@testNoAttributes_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]]) diff --git a/mlir/test/Dialect/LLVMIR/call-intrin.mlir b/mlir/test/Dialect/LLVMIR/call-intrin.mlir index b8d845d..bf11e07 100644 --- a/mlir/test/Dialect/LLVMIR/call-intrin.mlir +++ b/mlir/test/Dialect/LLVMIR/call-intrin.mlir @@ -27,14 +27,13 @@ llvm.func @round_overloaded() -> f32 { // CHECK: define void @lifetime_start() { // CHECK: %1 = alloca float, i8 1, align 4 -// CHECK: call void @llvm.lifetime.start.p0(i64 4, ptr %1) +// CHECK: call void @llvm.lifetime.start.p0(ptr %1) // CHECK: ret void // CHECK: } llvm.func @lifetime_start() { - %0 = llvm.mlir.constant(4 : i64) : i64 - %1 = llvm.mlir.constant(1 : i8) : i8 - %2 = llvm.alloca %1 x f32 : (i8) -> !llvm.ptr - llvm.call_intrinsic "llvm.lifetime.start"(%0, %2) {} : (i64, !llvm.ptr) -> () + %0 = llvm.mlir.constant(1 : i8) : i8 + %1 = llvm.alloca %0 x f32 : (i8) -> !llvm.ptr + llvm.call_intrinsic "llvm.lifetime.start"(%1) {} : (!llvm.ptr) -> () llvm.return } diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir index 551e0c9..8e292f4 100644 --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -299,7 +299,7 @@ llvm.func @test_inline(%cond0 : i1, %cond1 : i1, %funcArg : f32) -> f32 { ^bb1: // Make sure the lifetime begin intrinsic has been inserted where the call // used to be, even though the alloca has been moved to the entry block. - // CHECK-NEXT: llvm.intr.lifetime.start 4, %[[PTR]] + // CHECK-NEXT: llvm.intr.lifetime.start %[[PTR]] %0 = llvm.call @static_alloca(%cond1) : (i1) -> f32 // CHECK: llvm.cond_br %{{.+}}, ^[[BB2:.+]], ^[[BB3:.+]] llvm.br ^bb3(%0: f32) @@ -307,9 +307,9 @@ llvm.func @test_inline(%cond0 : i1, %cond1 : i1, %funcArg : f32) -> f32 { // return sites of the callee. // CHECK: ^[[BB2]]: // CHECK-NEXT: llvm.load - // CHECK-NEXT: llvm.intr.lifetime.end 4, %[[PTR]] + // CHECK-NEXT: llvm.intr.lifetime.end %[[PTR]] // CHECK: ^[[BB3]]: - // CHECK-NEXT: llvm.intr.lifetime.end 4, %[[PTR]] + // CHECK-NEXT: llvm.intr.lifetime.end %[[PTR]] ^bb2: llvm.br ^bb3(%funcArg: f32) ^bb3(%blockArg: f32): @@ -334,9 +334,9 @@ llvm.func @test_inline(%cond0 : i1) { // CHECK: "test.one_region_op"() ({ "test.one_region_op"() ({ %0 = llvm.call @static_alloca() : () -> f32 - // CHECK-NEXT: llvm.intr.lifetime.start 4, %[[ALLOCA]] + // CHECK-NEXT: llvm.intr.lifetime.start %[[ALLOCA]] // CHECK-NEXT: %[[RES:.+]] = llvm.load %[[ALLOCA]] - // CHECK-NEXT: llvm.intr.lifetime.end 4, %[[ALLOCA]] + // CHECK-NEXT: llvm.intr.lifetime.end %[[ALLOCA]] // CHECK-NEXT: test.region_yield %[[RES]] test.region_yield %0 : f32 }) : () -> () @@ -368,9 +368,9 @@ llvm.func @test_inline(%cond0 : i1) { llvm.func @alloca_with_lifetime(%cond: i1) -> f32 { %0 = llvm.mlir.constant(4 : i32) : i32 %1 = llvm.alloca %0 x f32 : (i32) -> !llvm.ptr - llvm.intr.lifetime.start 4, %1 : !llvm.ptr + llvm.intr.lifetime.start %1 : !llvm.ptr %2 = llvm.load %1 : !llvm.ptr -> f32 - llvm.intr.lifetime.end 4, %1 : !llvm.ptr + llvm.intr.lifetime.end %1 : !llvm.ptr %3 = llvm.fadd %2, %2 : f32 llvm.return %3 : f32 } @@ -385,9 +385,9 @@ llvm.func @test_inline(%cond0 : i1, %cond1 : i1, %funcArg : f32) -> f32 { ^bb1: // Make sure the original lifetime intrinsic has been preserved, rather than // inserting a new one with a larger scope. 
- // CHECK: llvm.intr.lifetime.start 4, %[[PTR]] + // CHECK: llvm.intr.lifetime.start %[[PTR]] // CHECK-NEXT: llvm.load %[[PTR]] - // CHECK-NEXT: llvm.intr.lifetime.end 4, %[[PTR]] + // CHECK-NEXT: llvm.intr.lifetime.end %[[PTR]] // CHECK: llvm.fadd // CHECK-NOT: llvm.intr.lifetime.end %0 = llvm.call @alloca_with_lifetime(%cond1) : (i1) -> f32 diff --git a/mlir/test/Dialect/LLVMIR/mem2reg.mlir b/mlir/test/Dialect/LLVMIR/mem2reg.mlir index 56634cf..716a586 100644 --- a/mlir/test/Dialect/LLVMIR/mem2reg.mlir +++ b/mlir/test/Dialect/LLVMIR/mem2reg.mlir @@ -304,10 +304,9 @@ llvm.func @g() // CHECK-NOT: = llvm.alloca llvm.func amdgpu_kernelcc @addrspace_discard() { %0 = llvm.mlir.constant(1 : i32) : i32 - %1 = llvm.mlir.constant(2 : i64) : i64 - %2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr<5> - %3 = llvm.addrspacecast %2 : !llvm.ptr<5> to !llvm.ptr - llvm.intr.lifetime.start 2, %3 : !llvm.ptr + %1 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + llvm.intr.lifetime.start %2 : !llvm.ptr llvm.return } @@ -406,9 +405,9 @@ llvm.func @unreachable_jumps_to_merge_point(%arg0: i1) -> i32 { llvm.func @ignore_lifetime() { %0 = llvm.mlir.constant(1 : i32) : i32 %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr - llvm.intr.lifetime.start 2, %1 : !llvm.ptr + llvm.intr.lifetime.start %1 : !llvm.ptr llvm.store %0, %1 {alignment = 4 : i64} : i32, !llvm.ptr - llvm.intr.lifetime.end 2, %1 : !llvm.ptr + llvm.intr.lifetime.end %1 : !llvm.ptr llvm.return } @@ -437,9 +436,9 @@ llvm.func @ignore_discardable_tree() { %5 = llvm.insertvalue %1, %4[1] : !llvm.struct<(i8, i16)> %6 = llvm.alloca %0 x !llvm.struct<(i8, i16)> {alignment = 8 : i64} : (i32) -> !llvm.ptr %7 = llvm.getelementptr %6[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i8, i16)> - llvm.intr.lifetime.start 2, %7 : !llvm.ptr + llvm.intr.lifetime.start %7 : !llvm.ptr llvm.store %5, %6 {alignment = 2 : i64} : !llvm.struct<(i8, i16)>, !llvm.ptr - llvm.intr.lifetime.end 2, %7 : !llvm.ptr + llvm.intr.lifetime.end %7 : !llvm.ptr llvm.return } @@ -517,8 +516,8 @@ llvm.func @discardable_use_tree() { %2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr %3 = llvm.bitcast %2 : !llvm.ptr to !llvm.ptr %4 = llvm.bitcast %3 : !llvm.ptr to !llvm.ptr - llvm.intr.lifetime.start 2, %3 : !llvm.ptr - llvm.intr.lifetime.start 2, %4 : !llvm.ptr + llvm.intr.lifetime.start %3 : !llvm.ptr + llvm.intr.lifetime.start %4 : !llvm.ptr %5 = llvm.intr.invariant.start 2, %3 : !llvm.ptr llvm.intr.invariant.end %5, 2, %3 : !llvm.ptr llvm.return @@ -534,8 +533,8 @@ llvm.func @non_discardable_use_tree() { %2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr %3 = llvm.bitcast %2 : !llvm.ptr to !llvm.ptr %4 = llvm.bitcast %3 : !llvm.ptr to !llvm.ptr - llvm.intr.lifetime.start 2, %3 : !llvm.ptr - llvm.intr.lifetime.start 2, %4 : !llvm.ptr + llvm.intr.lifetime.start %3 : !llvm.ptr + llvm.intr.lifetime.start %4 : !llvm.ptr llvm.call @use(%4) : (!llvm.ptr) -> i1 llvm.return } @@ -551,8 +550,8 @@ llvm.func @trivial_get_element_ptr() { %2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr %3 = llvm.bitcast %2 : !llvm.ptr to !llvm.ptr %4 = llvm.getelementptr %3[0] : (!llvm.ptr) -> !llvm.ptr, i8 - llvm.intr.lifetime.start 2, %3 : !llvm.ptr - llvm.intr.lifetime.start 2, %4 : !llvm.ptr + llvm.intr.lifetime.start %3 : !llvm.ptr + llvm.intr.lifetime.start %4 : !llvm.ptr llvm.return } @@ -565,8 +564,8 @@ llvm.func @nontrivial_get_element_ptr() { 
// CHECK: = llvm.alloca %2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr %4 = llvm.getelementptr %2[1] : (!llvm.ptr) -> !llvm.ptr, i8 - llvm.intr.lifetime.start 2, %2 : !llvm.ptr - llvm.intr.lifetime.start 2, %4 : !llvm.ptr + llvm.intr.lifetime.start %2 : !llvm.ptr + llvm.intr.lifetime.start %4 : !llvm.ptr llvm.return } @@ -580,8 +579,8 @@ llvm.func @dynamic_get_element_ptr() { %2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr %3 = llvm.bitcast %2 : !llvm.ptr to !llvm.ptr %4 = llvm.getelementptr %3[%0] : (!llvm.ptr, i32) -> !llvm.ptr, i8 - llvm.intr.lifetime.start 2, %3 : !llvm.ptr - llvm.intr.lifetime.start 2, %4 : !llvm.ptr + llvm.intr.lifetime.start %3 : !llvm.ptr + llvm.intr.lifetime.start %4 : !llvm.ptr llvm.return } diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index a2b2f84..db5271c 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -981,6 +981,13 @@ llvm.func @rocdl.s.wait.expcnt() { // ----- +llvm.func @rocdl.readfirstlane(%src : f32) -> f32 { + // CHECK-LABEL: rocdl.readfirstlane + // CHECK: rocdl.readfirstlane %{{.*}} : f32 + %ret = rocdl.readfirstlane %src : f32 + llvm.return %ret : f32 +} + llvm.func @rocdl.readlane(%src : f32) -> f32 { %cst0 = llvm.mlir.constant(0 : i32) : i32 diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index a0273fb..7344797 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -685,10 +685,10 @@ func.func @fastmathFlags(%arg0: f32, %arg1: f32, %arg2: i32, %arg3: vector<2 x f // CHECK-LABEL: @lifetime // CHECK-SAME: %[[P:.*]]: !llvm.ptr llvm.func @lifetime(%p: !llvm.ptr) { - // CHECK: llvm.intr.lifetime.start 16, %[[P]] - llvm.intr.lifetime.start 16, %p : !llvm.ptr - // CHECK: llvm.intr.lifetime.end 16, %[[P]] - llvm.intr.lifetime.end 16, %p : !llvm.ptr + // CHECK: llvm.intr.lifetime.start %[[P]] + llvm.intr.lifetime.start %p : !llvm.ptr + // CHECK: llvm.intr.lifetime.end %[[P]] + llvm.intr.lifetime.end %p : !llvm.ptr llvm.return } diff --git a/mlir/test/Dialect/LLVMIR/sroa.mlir b/mlir/test/Dialect/LLVMIR/sroa.mlir index fe1531d..1674bbd 100644 --- a/mlir/test/Dialect/LLVMIR/sroa.mlir +++ b/mlir/test/Dialect/LLVMIR/sroa.mlir @@ -177,7 +177,7 @@ llvm.func @direct_promotable_use_is_fine() -> i32 { // CHECK: %[[RES:.*]] = llvm.load %[[ALLOCA]] %3 = llvm.load %2 : !llvm.ptr -> i32 // This is a direct use of the slot but it can be removed because it implements PromotableOpInterface. 
- llvm.intr.lifetime.start 2, %1 : !llvm.ptr + llvm.intr.lifetime.start %1 : !llvm.ptr // CHECK: llvm.return %[[RES]] : i32 llvm.return %3 : i32 } diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir index 4ba4b09..2f30e8b 100644 --- a/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir +++ b/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir @@ -20,20 +20,6 @@ func.func @block_matmul( return %0 : tensor<64x64xf32> } -func.func @block_matmul_transpose_a( - %A: tensor<128x64xf32>, %B: tensor<128x64xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> { - %0 = linalg.matmul_transpose_a ins(%A, %B : tensor<128x64xf32>, tensor<128x64xf32>) - outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32> - return %0 : tensor<64x64xf32> -} - -func.func @block_matmul_transpose_b( - %A: tensor<64x128xf32>, %B: tensor<64x128xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> { - %0 = linalg.matmul_transpose_b ins(%A, %B : tensor<64x128xf32>, tensor<64x128xf32>) - outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32> - return %0 : tensor<64x64xf32> -} - // MMT4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> // MMT4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)> // MMT4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> @@ -43,18 +29,6 @@ func.func @block_matmul_transpose_b( // MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // MMT4D-COUNT-1: linalg.unpack -// MMT4D-LABEL: func @block_matmul_transpose_a -// MMT4D-COUNT-3: linalg.pack -// MMT4D: linalg.generic -// MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MMT4D-COUNT-1: linalg.unpack -// MMT4D-LABEL: func @block_matmul_transpose_b -// MMT4D-COUNT-3: linalg.pack -// MMT4D: linalg.generic -// MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MMT4D-COUNT-1: linalg.unpack // MM4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> // MM4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)> @@ -65,18 +39,6 @@ func.func @block_matmul_transpose_b( // MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // MM4D-COUNT-1: linalg.unpack -// MM4D-LABEL: func @block_matmul_transpose_a -// MM4D-COUNT-3: linalg.pack -// MM4D: linalg.generic -// MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MM4D-COUNT-1: linalg.unpack -// MM4D-LABEL: func @block_matmul_transpose_b -// MM4D-COUNT-3: linalg.pack -// MM4D: linalg.generic -// MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MM4D-COUNT-1: linalg.unpack // MTM4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d5, d3)> // MTM4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)> @@ -87,15 +49,3 @@ func.func @block_matmul_transpose_b( // 
MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // MTM4D-COUNT-1: linalg.unpack -// MTM4D-LABEL: func @block_matmul_transpose_a -// MTM4D-COUNT-3: linalg.pack -// MTM4D: linalg.generic -// MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MTM4D-COUNT-1: linalg.unpack -// MTM4D-LABEL: func @block_matmul_transpose_b -// MTM4D-COUNT-3: linalg.pack -// MTM4D: linalg.generic -// MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MTM4D-COUNT-1: linalg.unpack diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir index aa860db..e16af1f 100644 --- a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir +++ b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir @@ -197,150 +197,6 @@ func.func @block_batch_matmul( // ----- -func.func @block_matmul_transpose_a( - %A: tensor<128x64xf32>, %B: tensor<128x64xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> { - %0 = linalg.matmul_transpose_a ins(%A, %B : tensor<128x64xf32>, tensor<128x64xf32>) - outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32> - return %0 : tensor<64x64xf32> -} - -// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> - -// CHECK-LABEL: func @block_matmul_transpose_a( -// CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<128x64xf32>, %[[B:[0-9a-z]+]]: tensor<128x64xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32> -// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] -// CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 64] -// CHECK-SAME: into %[[PACK_DST_0]] : tensor<128x64xf32> -> tensor<2x2x32x64xf32> -// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] -// CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] -// CHECK-SAME: into %[[PACK_DST_1]] : tensor<128x64xf32> -> tensor<4x2x16x64xf32> -// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] -// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] -// CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32> -// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic -// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] -// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] -// CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32> -// CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32> - -// ----- - -func.func @block_batch_matmul_transpose_a( - %A: tensor<512x128x64xf32>, %B: tensor<512x128x64xf32>, %C: tensor<512x64x64xf32>) -> tensor<512x64x64xf32> { - %0 = 
linalg.batch_matmul_transpose_a ins(%A, %B : tensor<512x128x64xf32>, tensor<512x128x64xf32>) - outs(%C : tensor<512x64x64xf32>) -> tensor<512x64x64xf32> - return %0 : tensor<512x64x64xf32> -} - -// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d4, d6)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d5, d6)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5)> - -// CHECK-LABEL: func @block_batch_matmul_transpose_a( -// CHECK-SAME: %[[A:.+]]: tensor<512x128x64xf32>, %[[B:.+]]: tensor<512x128x64xf32>, %[[C:.+]]: tensor<512x64x64xf32> -// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] -// CHECK-SAME: outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [32, 64] -// CHECK-SAME: into %[[PACK_DST_0]] : tensor<512x128x64xf32> -> tensor<512x2x2x32x64xf32> -// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] -// CHECK-SAME: outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 64] -// CHECK-SAME: into %[[PACK_DST_1]] : tensor<512x128x64xf32> -> tensor<512x4x2x16x64xf32> -// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] -// CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] -// CHECK-SAME: into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32> -// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic -// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] -// CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] -// CHECK-SAME: into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32> -// CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32> - -// ----- - -func.func @block_matmul_transpose_b( - %A: tensor<64x128xf32>, %B: tensor<64x128xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> { - %0 = linalg.matmul_transpose_b ins(%A, %B : tensor<64x128xf32>, tensor<64x128xf32>) - outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32> - return %0 : tensor<64x64xf32> -} - -// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> - -// CHECK-LABEL: func @block_matmul_transpose_b( -// CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<64x128xf32>, %[[B:[0-9a-z]+]]: tensor<64x128xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32> -// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] -// CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] -// CHECK-SAME: into %[[PACK_DST_0]] : tensor<64x128xf32> -> tensor<2x2x32x64xf32> -// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] -// CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 64] -// CHECK-SAME: into %[[PACK_DST_1]] : tensor<64x128xf32> -> 
tensor<4x2x16x64xf32> -// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] -// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] -// CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32> -// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic -// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] -// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] -// CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32> -// CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32> - -// ----- - -func.func @block_batch_matmul_transpose_b( - %A: tensor<512x64x128xf32>, %B: tensor<512x64x128xf32>, %C: tensor<512x64x64xf32>) -> tensor<512x64x64xf32> { - %0 = linalg.batch_matmul_transpose_b ins(%A, %B : tensor<512x64x128xf32>, tensor<512x64x128xf32>) - outs(%C : tensor<512x64x64xf32>) -> tensor<512x64x64xf32> - return %0 : tensor<512x64x64xf32> -} - -// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d4, d6)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d5, d6)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5)> - -// CHECK-LABEL: func @block_batch_matmul_transpose_b( -// CHECK-SAME: %[[A:.+]]: tensor<512x64x128xf32>, %[[B:.+]]: tensor<512x64x128xf32>, %[[C:.+]]: tensor<512x64x64xf32> -// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] -// CHECK-SAME: outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [32, 64] -// CHECK-SAME: into %[[PACK_DST_0]] : tensor<512x64x128xf32> -> tensor<512x2x2x32x64xf32> -// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] -// CHECK-SAME: outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 64] -// CHECK-SAME: into %[[PACK_DST_1]] : tensor<512x64x128xf32> -> tensor<512x4x2x16x64xf32> -// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] -// CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] -// CHECK-SAME: into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32> -// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic -// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] -// CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] -// CHECK-SAME: into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32> -// CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32> - -// ----- - #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> diff --git a/mlir/test/Dialect/Linalg/elementwise/named-to-elementwise.mlir 
b/mlir/test/Dialect/Linalg/elementwise/named-to-elementwise.mlir new file mode 100644 index 0000000..2332b28 --- /dev/null +++ b/mlir/test/Dialect/Linalg/elementwise/named-to-elementwise.mlir @@ -0,0 +1,56 @@ +// RUN: mlir-opt %s -linalg-morph-ops=named-to-category -split-input-file | FileCheck %s + +// CHECK: @exp(%[[A:.+]]: tensor<16x8xf32>, %[[B:.+]]: tensor<16x8xf32>) -> tensor<16x8xf32> { +// CHECK: {{.*}} = linalg.elementwise +// CHECK-SAME: kind=#linalg.elementwise_kind<exp> +// CHECK-SAME: ins(%[[A]] : tensor<16x8xf32>) +// CHECK-SAME: outs(%[[B]] : tensor<16x8xf32>) -> tensor<16x8xf32> +// +func.func @exp(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> { + %exp = linalg.exp ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32> + return %exp : tensor<16x8xf32> +} + +// ---- + +// CHECK: @add(%[[A:.+]]: tensor<16x8xf32>, %[[B:.+]]: tensor<16x8xf32>, %[[C:.+]]: tensor<16x8xf32>) -> tensor<16x8xf32> { +// CHECK: {{.*}} = linalg.elementwise +// CHECK-SAME: kind=#linalg.elementwise_kind<add> +// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xf32>, tensor<16x8xf32>) +// CHECK-SAME: outs(%[[C]] : tensor<16x8xf32>) -> tensor<16x8xf32> +// +func.func @add(%A : tensor<16x8xf32>, %B: tensor<16x8xf32>, %C : tensor<16x8xf32>) -> tensor<16x8xf32> { + %add = linalg.add ins(%A, %B : tensor<16x8xf32>, tensor<16x8xf32>) outs(%C : tensor<16x8xf32>) -> tensor<16x8xf32> + return %add : tensor<16x8xf32> +} + +// ---- + +// CHECK: @sub(%[[A:.+]]: tensor<16x8xf32>, %[[B:.+]]: tensor<16x8xf32>, %[[C:.+]]: tensor<16x8xf32>) -> tensor<16x8xf32> { +// CHECK: {{.*}} = linalg.elementwise +// CHECK-SAME: kind=#linalg.elementwise_kind<sub> +// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xf32>, tensor<16x8xf32>) +// CHECK-SAME: outs(%[[C]] : tensor<16x8xf32>) +// +func.func @sub(%A : tensor<16x8xf32>, %B: tensor<16x8xf32>, %C : tensor<16x8xf32>) -> tensor<16x8xf32> { + %sub = linalg.sub ins(%A, %B : tensor<16x8xf32>, tensor<16x8xf32>) outs(%C : tensor<16x8xf32>) -> tensor<16x8xf32> + return %sub : tensor<16x8xf32> +} + +// ---- + +// CHECK: @ternary_select(%[[A:.+]]: tensor<4x8x16xi1>, %[[B:.+]]: tensor<4x8x16xf32>, %[[C:.+]]: tensor<4x8x16xf32>) +// CHECK: %[[E:.+]] = tensor.empty() : tensor<4x8x16xf32> +// CHECK: {{.*}} = linalg.elementwise +// CHECK-SAME: kind=#linalg.elementwise_kind<select> +// CHECK-SAME: ins(%[[A]], %[[B]], %[[C]] : tensor<4x8x16xi1>, tensor<4x8x16xf32>, tensor<4x8x16xf32>) +// CHECK-SAME: outs(%[[E]] : tensor<4x8x16xf32>) -> tensor<4x8x16xf32> +// +func.func @ternary_select(%A: tensor<4x8x16xi1>, %B: tensor<4x8x16xf32>, %C: tensor<4x8x16xf32>) + -> tensor<4x8x16xf32> { + %empty = tensor.empty() : tensor<4x8x16xf32> + %select = linalg.select + ins(%A, %B, %C : tensor<4x8x16xi1>, tensor<4x8x16xf32>, tensor<4x8x16xf32>) + outs(%empty: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> + return %select : tensor<4x8x16xf32> +} diff --git a/mlir/test/Dialect/Linalg/fold-add-into-dest.mlir b/mlir/test/Dialect/Linalg/fold-add-into-dest.mlir index d8e92e4..e90247d 100644 --- a/mlir/test/Dialect/Linalg/fold-add-into-dest.mlir +++ b/mlir/test/Dialect/Linalg/fold-add-into-dest.mlir @@ -158,36 +158,6 @@ module attributes {transform.with_named_sequence} { // ----- !type = tensor<2048x2048xf32> -func.func @fold_add_on_transposed_matmuls(%arg0: !type, %arg1: !type) -> !type { - %0 = arith.constant dense<1.111111e+00> : !type - %cst = arith.constant 0.000000e+00 : f32 - %1 = tensor.empty() : !type - %2 = linalg.fill ins(%cst : f32) outs(%1 : !type) -> !type - %3 = 
linalg.matmul_transpose_a ins(%arg0, %0 : !type, !type) outs(%2 : !type) -> !type - %4 = linalg.matmul_transpose_b ins(%arg1, %0 : !type, !type) outs(%2 : !type) -> !type - %5 = linalg.add ins(%3, %4 : !type, !type) outs(%1 : !type) -> !type - return %5 : !type -} - -// CHECK-LABEL: func.func @fold_add_on_transposed_matmuls -// CHECK: %[[ACC:.+]] = linalg.matmul_transpose_a -// CHECK-NEXT: %[[RES:.+]] = linalg.matmul_transpose_b ins({{.+}}) outs(%[[ACC]] -// CHECK-NOT: linalg.add -// CHECK-NEXT: return %[[RES]] - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.linalg.fold_add_into_dest - } : !transform.any_op - transform.yield - } -} - -// ----- - -!type = tensor<2048x2048xf32> func.func @expect_no_fold_of_add_as_dominated_op_is_not_a_contraction(%arg0: !type, %arg1: !type) -> !type { %0 = arith.constant dense<1.111111e+00> : !type %cst = arith.constant 0.000000e+00 : f32 diff --git a/mlir/test/Dialect/Linalg/linalg-morph-category-ops.mlir b/mlir/test/Dialect/Linalg/linalg-morph-category-ops.mlir new file mode 100644 index 0000000..00602c4 --- /dev/null +++ b/mlir/test/Dialect/Linalg/linalg-morph-category-ops.mlir @@ -0,0 +1,15 @@ +// Forward path `named -> category -> generic` +// RUN: mlir-opt %s -linalg-morph-ops=named-to-category | FileCheck %s --check-prefix=NAMED_TO_CATEGORY + +// RUN: mlir-opt %s -linalg-morph-ops=named-to-category | \ +// RUN: mlir-opt %s -linalg-morph-ops=category-to-generic | FileCheck %s --check-prefix=CATEGORY_TO_GENERIC + +func.func @exp(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> { + %exp = linalg.exp ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32> + return %exp : tensor<16x8xf32> +} +// NAMED_TO_CATEGORY: linalg.elementwise +// NAMED_TO_CATEGORY-NOT: linalg.exp + +// CATEGORY_TO_GENERIC: linalg.generic +// CATEGORY_TO_GENERIC-NOT: linalg.elementwise diff --git a/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir b/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir new file mode 100644 index 0000000..bdd29b9 --- /dev/null +++ b/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir @@ -0,0 +1,14 @@ +// RUN: mlir-opt %s -linalg-morph-ops=named-to-generic | FileCheck %s --check-prefix=NAMED_TO_GENERIC +// RUN: mlir-opt %s -linalg-morph-ops=named-to-generic | mlir-opt -linalg-morph-ops=generic-to-named | \ +// RUN: FileCheck %s --check-prefix=ROUND_TRIP + +func.func @exp(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> { + %exp = linalg.exp ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32> + return %exp : tensor<16x8xf32> +} + +// NAMED_TO_GENERIC: linalg.generic +// NAMED_TO_GENERIC-NOT: linalg.exp + +// ROUND_TRIP: linalg.exp +// ROUND_TRIP-NOT: linalg.generic diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 412f40d..a93e979 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1222,17 +1222,6 @@ func.func @batch_reduce_matmul(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32 // ----- -// CHECK-LABEL: func @matmul_transpose_a -// CHECK: linalg.matmul_transpose_a -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5x3xf32>, memref<5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) -func.func 
@matmul_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul_transpose_a ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - // CHECK-LABEL: func @matmul_transpose_a_explicit // CHECK: linalg.matmul // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5x3xf32>, memref<5x7xf32>) @@ -1478,17 +1467,6 @@ func.func @matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5xf3 // ----- -// CHECK-LABEL: func @matmul_transpose_b -// CHECK: linalg.matmul_transpose_b -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<7x5xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) -func.func @matmul_transpose_b(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul_transpose_b ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - // CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)> // CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -1806,28 +1784,6 @@ func.func @bcast_A_transpose_B(%A: memref<3x5xf32>, %B: memref<2x7x5xf32>, %C: m // ----- -// CHECK-LABEL: func @batchmatmul_transpose_a -// CHECK: linalg.batch_matmul_transpose_a -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<2x5x3xf32>, memref<2x5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<2x3x7xf32>) -func.func @batchmatmul_transpose_a(%arg0: memref<2x5x3xf32>, %arg1: memref<2x5x7xf32>, %arg2: memref<2x3x7xf32>) { - linalg.batch_matmul_transpose_a ins(%arg0, %arg1 : memref<2x5x3xf32>, memref<2x5x7xf32>) outs(%arg2: memref<2x3x7xf32>) - return -} - -// ----- - -// CHECK-LABEL: func @batchmatmul_transpose_b -// CHECK: linalg.batch_matmul_transpose_b -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<2x3x5xf32>, memref<2x7x5xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<2x3x7xf32>) -func.func @batchmatmul_transpose_b(%arg0: memref<2x3x5xf32>, %arg1: memref<2x7x5xf32>, %arg2: memref<2x3x7xf32>) { - linalg.batch_matmul_transpose_b ins(%arg0, %arg1 : memref<2x3x5xf32>, memref<2x7x5xf32>) outs(%arg2: memref<2x3x7xf32>) - return -} - -// ----- - // CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> // CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> diff --git a/mlir/test/Dialect/Linalg/rank-reduce-contraction-ops.mlir b/mlir/test/Dialect/Linalg/rank-reduce-contraction-ops.mlir index 43bddb0..704576d 100644 --- a/mlir/test/Dialect/Linalg/rank-reduce-contraction-ops.mlir +++ b/mlir/test/Dialect/Linalg/rank-reduce-contraction-ops.mlir @@ -92,38 +92,6 @@ func.func @singleton_batch_vecmat(%arg0 : tensor<1x?xf32>, %arg1 : tensor<1x?x?x // ----- -func.func @singleton_batchmatmul_transpose_a(%arg0: memref<1x5x3xf32>, %arg1: memref<1x5x7xf32>, %arg2: memref<1x3x7xf32>) { - // CHECK-LABEL: @singleton_batchmatmul_transpose_a - // CHECK-SAME: %[[LHS:[a-zA-Z0-9]+]]: memref<1x5x3xf32> - // CHECK-SAME: %[[RHS:[a-zA-Z0-9]+]]: memref<1x5x7xf32> - // CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: memref<1x3x7xf32> - // CHECK-NEXT: %[[COLLAPSED_LHS:.*]] = memref.collapse_shape %[[LHS]] {{\[}}[0, 1], [2]] - // CHECK-NEXT: %[[COLLAPSED_RHS:.*]] = memref.collapse_shape %[[RHS]] {{\[}}[0, 1], [2]] - // CHECK-NEXT: %[[COLLAPSED_INIT:.*]] = memref.collapse_shape %[[INIT]] {{\[}}[0, 1], [2]] - // CHECK-NEXT: linalg.matmul_transpose_a ins(%[[COLLAPSED_LHS]], %[[COLLAPSED_RHS]] : 
memref<5x3xf32>, memref<5x7xf32>) outs(%[[COLLAPSED_INIT]] : memref<3x7xf32>) - // CHECK-NEXT: return - linalg.batch_matmul_transpose_a ins(%arg0, %arg1 : memref<1x5x3xf32>, memref<1x5x7xf32>) outs(%arg2: memref<1x3x7xf32>) - return -} - -// ----- - -func.func @singleton_batchmatmul_transpose_b(%arg0: memref<1x3x5xf32>, %arg1: memref<1x7x5xf32>, %arg2: memref<1x3x7xf32>) { - // CHECK-LABEL: @singleton_batchmatmul_transpose_b - // CHECK-SAME: %[[LHS:[a-zA-Z0-9]+]]: memref<1x3x5xf32> - // CHECK-SAME: %[[RHS:[a-zA-Z0-9]+]]: memref<1x7x5xf32> - // CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: memref<1x3x7xf32> - // CHECK-NEXT: %[[COLLAPSED_LHS:.*]] = memref.collapse_shape %[[LHS]] {{\[}}[0, 1], [2]] - // CHECK-NEXT: %[[COLLAPSED_RHS:.*]] = memref.collapse_shape %[[RHS]] {{\[}}[0, 1], [2]] - // CHECK-NEXT: %[[COLLAPSED_INIT:.*]] = memref.collapse_shape %[[INIT]] {{\[}}[0, 1], [2]] - // CHECK-NEXT: linalg.matmul_transpose_b ins(%[[COLLAPSED_LHS]], %[[COLLAPSED_RHS]] : memref<3x5xf32>, memref<7x5xf32>) outs(%[[COLLAPSED_INIT]] : memref<3x7xf32>) - // CHECK-NEXT: return - linalg.batch_matmul_transpose_b ins(%arg0, %arg1 : memref<1x3x5xf32>, memref<1x7x5xf32>) outs(%arg2: memref<1x3x7xf32>) - return -} - -// ----- - func.func @matmul_to_matvec_tensor(%arg0: tensor<?x?xf32>, %arg1: tensor<?x1xf32>, %arg2: tensor<?x1xf32>) -> tensor<?x1xf32> { // CHECK-LABEL: @matmul_to_matvec_tensor // CHECK-SAME: %[[LHS:[a-zA-Z0-9]+]]: tensor<?x?xf32> @@ -226,59 +194,6 @@ func.func @matvec_to_dot_tensor(%arg0: tensor<1x?xf32>, %arg1: tensor<?xf32>, %a // ----- -func.func @matmul_transpose_a_to_vecmat(%arg0: tensor<256x1xf32>, %arg1: tensor<256x512xf32>, %arg2: tensor<1x512xf32>) -> tensor<1x512xf32> { - // CHECK-LABEL: @matmul_transpose_a_to_vecmat - // CHECK: collapse_shape {{.*}} into tensor<256xf32> - // CHECK: collapse_shape {{.*}} into tensor<512xf32> - // CHECK: linalg.vecmat - // CHECK: expand_shape {{.*}} into tensor<1x512xf32> - %0 = linalg.matmul_transpose_a ins(%arg0, %arg1: tensor<256x1xf32>, tensor<256x512xf32>) outs(%arg2: tensor<1x512xf32>) -> tensor<1x512xf32> - return %0 : tensor<1x512xf32> -} - -// ----- - -func.func @batch_matmul_transpose_a_to_batch_vecmat(%arg0: tensor<64x256x1xf32>, %arg1: tensor<64x256x512xf32>, %arg2: tensor<64x1x512xf32>) -> tensor<64x1x512xf32> { - // CHECK-LABEL: @batch_matmul_transpose_a_to_batch_vecmat - // CHECK: collapse_shape {{.*}} into tensor<64x256xf32> - // CHECK: collapse_shape {{.*}} into tensor<64x512xf32> - // CHECK: linalg.batch_vecmat - // CHECK: expand_shape {{.*}} into tensor<64x1x512xf32> - %0 = linalg.batch_matmul_transpose_a ins(%arg0, %arg1: tensor<64x256x1xf32>, tensor<64x256x512xf32>) outs(%arg2: tensor<64x1x512xf32>) -> tensor<64x1x512xf32> - return %0 : tensor<64x1x512xf32> -} - -// ----- - -func.func @matmul_transpose_b_to_matvec(%arg0: memref<?x?xf32>, %arg1: memref<1x?xf32>, %arg2: memref<?x1xf32>) { - // CHECK-LABEL: @matmul_transpose_b_to_matvec - // CHECK: linalg.matvec - linalg.matmul_transpose_b ins(%arg0, %arg1: memref<?x?xf32>, memref<1x?xf32>) outs(%arg2: memref<?x1xf32>) - return -} - -// ----- - -func.func @batchmatmul_transpose_b_to_batchmatvec_tensor(%arg0: tensor<64x128x256xf32>, %arg1: tensor<64x1x256xf32>, %arg2: tensor<64x128x1xf32>) -> tensor<64x128x1xf32> { - // CHECK: collapse_shape {{.*}} into tensor<64x256xf32> - // CHECK: collapse_shape {{.*}} into tensor<64x128xf32> - // CHECK: linalg.batch_matvec - // CHECK: expand_shape {{.*}} into tensor<64x128x1xf32> - %0 = linalg.batch_matmul_transpose_b ins(%arg0, %arg1: 
tensor<64x128x256xf32>, tensor<64x1x256xf32>) outs(%arg2: tensor<64x128x1xf32>) -> tensor<64x128x1xf32> - return %0 : tensor<64x128x1xf32> -} - -// ----- - -func.func @batchmatmul_transpose_b_to_to_dot(%arg0: tensor<1x1x?xf32>, %arg1: tensor<1x1x?xf32>, %arg2: tensor<1x1x1xf32>) -> tensor<1x1x1xf32> { - // CHECK-LABEL: @batchmatmul_transpose_b_to_to_dot - // CHECK: linalg.dot - %0 = linalg.batch_matmul_transpose_b ins(%arg0, %arg1: tensor<1x1x?xf32>, tensor<1x1x?xf32>) outs(%arg2: tensor<1x1x1xf32>) -> tensor<1x1x1xf32> - return %0 : tensor<1x1x1xf32> -} - -// ----- - func.func @nonsingleton_batch_matmul(%arg0 : tensor<2x?x?xf32>, %arg1 : tensor<2x?x?xf32>, %arg2: tensor<2x?x?xf32>) -> tensor<2x?x?xf32> { // CHECK-LABEL: @nonsingleton_batch_matmul // CHECK-NOT: collapse_shape diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir index 778d5bb..1b0bade 100644 --- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir +++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir @@ -504,7 +504,7 @@ func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op %c10 = transform.param.constant 10 : i64 -> !transform.param<i64> %c20 = transform.param.constant 20 : i64 -> !transform.param<i64> %sz = transform.merge_handles %c10, %c20 : !transform.param<i64> diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir index f741876..9a3dcf0 100644 --- a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir @@ -14,11 +14,11 @@ module attributes {transform.with_named_sequence} { : (!transform.any_op) -> !transform.any_op // Tile to 5 then pad to 8 - %fill_l1, %loops_l1 = transform.structured.tile_using_for %fill tile_sizes [5] + %fill_l1, %loops_l1 = transform.structured.tile_using_for %fill tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %fill_padded, %_ = transform.structured.pad_tiling_interface %fill_l1 to padding_sizes [8] { - padding_values=[0.0 : f32, 0.0 : f32] + padding_values= [#ub.poison, 0.0 : f32] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.yield @@ -33,9 +33,9 @@ func.func @pad_lhs( -> tensor<24x25xf32> { // CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>) - // CHECK: tensor.pad %{{.*}} + // CHECK: tensor.pad %{{.*}} // CHECK: : tensor<?x12xf32> to tensor<8x12xf32> - // CHECK: tensor.pad %{{.*}} + // CHECK: tensor.pad %{{.*}} // CHECK: : tensor<?x25xf32> to tensor<8x25xf32> // CHECK: linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<8x12xf32>, tensor<12x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32> // CHECK: tensor.extract_slice %{{.*}}[0, 0] [%{{.*}}, 25] [1, 1] @@ -92,7 +92,7 @@ module { %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [8, 0, 14] { padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.yield + transform.yield } } } @@ -147,7 +147,7 @@ module { %padded, %pad = 
transform.structured.pad_tiling_interface %0 to padding_sizes [8, 0, 14] { padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.yield + transform.yield } } } diff --git a/mlir/test/Dialect/Linalg/transform-op-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-pad.mlir index f91eb9c..51bf4a2 100644 --- a/mlir/test/Dialect/Linalg/transform-op-pad.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-pad.mlir @@ -465,14 +465,14 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[RHS:.*]] = tensor.pad // CHECK: scf.for // CHECK-DAG: tensor.extract_slice %[[LHS]][0, %{{.*}}] [%{{.*}}, 32] -// CHECK-DAG: tensor.extract_slice %[[RHS]][0, %{{.*}}] [%{{.*}}, 32] +// CHECK-DAG: tensor.extract_slice %[[RHS]][%{{.*}}, 0] [32, %{{.*}}] func.func @dyn_pad_tiling(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { - %0 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> + %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> return %0 : tensor<?x?xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op %padded, %pad, %copy = transform.structured.pad %0 pad_to_multiple_of [32] use_prescribed_tensor_shapes {padding_dimensions = [2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %tiled_linalg_op, %loops = transform.structured.tile_using_for %padded tile_sizes [0, 0, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %1 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op diff --git a/mlir/test/Dialect/Linalg/transform-op-specialize-matmul.mlir b/mlir/test/Dialect/Linalg/transform-op-specialize-matmul.mlir index f64953b..bd4c655 100644 --- a/mlir/test/Dialect/Linalg/transform-op-specialize-matmul.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-specialize-matmul.mlir @@ -30,66 +30,6 @@ module attributes {transform.with_named_sequence} { // ----- -#map = affine_map<(d0, d1, d2) -> (d2, d0)> -#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -func.func @matmul_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.generic - {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} - ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) outs(%arg2 : memref<3x7xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.mulf %in, %in_0 : f32 - %1 = arith.addf %out, %0 : f32 - linalg.yield %1 : f32 - } - return -} - -// CHECK-LABEL: @matmul_transpose_a -// CHECK-SAME: %[[ARG0:.+]]: memref<5x3xf32>, %[[ARG1:.+]]: memref<5x7xf32>, %[[ARG2:.+]]: memref<3x7xf32>) { -// CHECK-NOT: linalg.generic -// CHECK: linalg.matmul_transpose_a ins(%[[ARG0]], %[[ARG1]] : memref<5x3xf32>, memref<5x7xf32>) outs(%[[ARG2]] : memref<3x7xf32>) - -module attributes {transform.with_named_sequence} { - transform.named_sequence 
@__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match interface{LinalgOp} in %arg0 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.specialize %0 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - -#map = affine_map<(d0, d1, d2) -> (d0, d2)> -#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -func.func @matmul_transpose_b(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> { - %0 = linalg.generic - {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} - ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %1 = arith.mulf %in, %in_0 : f32 - %2 = arith.addf %out, %1 : f32 - linalg.yield %2 : f32 - } -> tensor<?x?xf32> - return %0 : tensor<?x?xf32> -} - -// CHECK-LABEL: @matmul_transpose_b -// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?xf32>, %[[ARG1:.+]]: tensor<?x?xf32>, %[[ARG2:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32> -// CHECK-NOT: linalg.generic -// CHECK: linalg.matmul_transpose_b ins(%[[ARG0]], %[[ARG1]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[ARG2]] : tensor<?x?xf32>) -> tensor<?x?xf32> - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match interface{LinalgOp} in %arg0 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.specialize %0 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -117,32 +57,3 @@ module attributes {transform.with_named_sequence} { transform.yield } } - -// ----- -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)> -#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> -func.func @batch_matmul_transpose_b(%arg0: tensor<?x?x?xf32>, %arg1: tensor<?x?x?xf32>, %arg2: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> { - %0 = linalg.generic - {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} - ins(%arg0, %arg1 : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%arg2 : tensor<?x?x?xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %1 = arith.mulf %in, %in_0 : f32 - %2 = arith.addf %out, %1 : f32 - linalg.yield %2 : f32 - } -> tensor<?x?x?xf32> - return %0 : tensor<?x?x?xf32> -} - -// CHECK-LABEL: @batch_matmul_transpose_b -// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?x?xf32>, %[[ARG1:.+]]: tensor<?x?x?xf32>, %[[ARG2:.+]]: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> -// CHECK-NOT: linalg.generic -// CHECK: linalg.batch_matmul_transpose_b ins(%[[ARG0]], %[[ARG1]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[ARG2]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32> - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match interface{LinalgOp} in %arg0 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.specialize %0 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} diff --git a/mlir/test/Dialect/Linalg/transpose-matmul.mlir b/mlir/test/Dialect/Linalg/transpose-matmul.mlir index d2b7e9f..4ee87fb 100644 --- 
a/mlir/test/Dialect/Linalg/transpose-matmul.mlir +++ b/mlir/test/Dialect/Linalg/transpose-matmul.mlir @@ -1,6 +1,20 @@ // RUN: mlir-opt -transform-preload-library='transform-library-paths=%p/transpose-matmul-a.mlir' -transform-interpreter -split-input-file %s | FileCheck %s --check-prefixes=CHECK,TRANSPOSE-A // RUN: mlir-opt -transform-preload-library='transform-library-paths=%p/transpose-matmul-b.mlir' -transform-interpreter -split-input-file %s | FileCheck %s --check-prefixes=CHECK,TRANSPOSE-B +// TRANSPOSE-A-DAG: #[[$MA:.*]] = affine_map<(d0, d1, d2) -> (d2, d0)> +// TRANSPOSE-A-DAG: #[[$MB:.*]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// TRANSPOSE-A-DAG: #[[$MC:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// TRANSPOSE-A-DAG: #[[$BMA:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)> +// TRANSPOSE-A-DAG: #[[$BMB:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +// TRANSPOSE-A-DAG: #[[$BMC:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> + +// TRANSPOSE-B-DAG: #[[$MA:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// TRANSPOSE-B-DAG: #[[$MB:.*]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// TRANSPOSE-B-DAG: #[[$MC:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// TRANSPOSE-B-DAG: #[[$BMA:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +// TRANSPOSE-B-DAG: #[[$BMB:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)> +// TRANSPOSE-B-DAG: #[[$BMC:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> + // CHECK-LABEL: func.func @matmul_static( // CHECK-SAME: %[[A:.*]]: tensor<16x8xf32>, // CHECK-SAME: %[[B:.*]]: tensor<8x16xf32>) -> tensor<16x16xf32> { @@ -9,10 +23,10 @@ // CHECK: %[[C_ZERO:.*]] = linalg.fill ins(%[[C0_F32]] : f32) outs(%[[C_INIT]] : tensor<16x16xf32>) -> tensor<16x16xf32> // TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty() : tensor<8x16xf32> // TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<16x8xf32>) outs(%[[A_TRANSP_INIT]] : tensor<8x16xf32>) permutation = [1, 0] -// TRANSPOSE-A: %[[C:.*]] = linalg.matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<8x16xf32>, tensor<8x16xf32>) outs(%[[C_ZERO]] : tensor<16x16xf32>) -> tensor<16x16xf32> +// TRANSPOSE-A: %[[C:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A_TRANSP]], %[[B]] : tensor<8x16xf32>, tensor<8x16xf32>) outs(%[[C_ZERO]] : tensor<16x16xf32>) -> tensor<16x16xf32> // TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty() : tensor<16x8xf32> // TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<8x16xf32>) outs(%[[B_TRANSP_INIT]] : tensor<16x8xf32>) permutation = [1, 0] -// TRANSPOSE-B: %[[C:.*]] = linalg.matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<16x8xf32>, tensor<16x8xf32>) outs(%[[C_ZERO]] : tensor<16x16xf32>) -> tensor<16x16xf32> +// TRANSPOSE-B: %[[C:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<16x8xf32>, tensor<16x8xf32>) outs(%[[C_ZERO]] : tensor<16x16xf32>) -> tensor<16x16xf32> // CHECK: return %[[C]] : tensor<16x16xf32> // CHECK: } func.func @matmul_static(%A: tensor<16x8xf32>, %B: tensor<8x16xf32>) -> (tensor<16x16xf32>) { @@ -38,11 +52,11 @@ func.func @matmul_static(%A: tensor<16x8xf32>, %B: tensor<8x16xf32>) -> (tensor< // TRANSPOSE-A: %[[A_DIM1:.*]] = tensor.dim %[[A]], %[[C1]] : tensor<?x?xf32> // TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty(%[[A_DIM1]], %[[A_DIM0]]) : tensor<?x?xf32> // TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<?x?xf32>) outs(%[[A_TRANSP_INIT]] : tensor<?x?xf32>) permutation = [1, 0] -// TRANSPOSE-A: %[[C:.*]] = 
linalg.matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?xf32>) -> tensor<?x?xf32> +// TRANSPOSE-A: %[[C:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A_TRANSP]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?xf32>) -> tensor<?x?xf32> // TRANSPOSE-B: %[[B_DIM0:.*]] = tensor.dim %[[B]], %[[C0]] : tensor<?x?xf32> // TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty(%[[B_DIM1]], %[[B_DIM0]]) : tensor<?x?xf32> // TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<?x?xf32>) outs(%[[B_TRANSP_INIT]] : tensor<?x?xf32>) permutation = [1, 0] -// TRANSPOSE-B: %[[C:.*]] = linalg.matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?xf32>) -> tensor<?x?xf32> +// TRANSPOSE-B: %[[C:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?xf32>) -> tensor<?x?xf32> // CHECK: return %[[C]] : tensor<?x?xf32> // CHECK: } func.func @matmul_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>) -> (tensor<?x?xf32>) { @@ -69,10 +83,10 @@ func.func @matmul_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>) -> (tensor<? // CHECK: %[[C_ZERO:.*]] = linalg.fill ins(%[[C0_F32]] : f32) outs(%[[C_INIT]] : tensor<?x16xf32>) -> tensor<?x16xf32> // TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty(%[[A_DIM0]]) : tensor<8x?xf32> // TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<?x8xf32>) outs(%[[A_TRANSP_INIT]] : tensor<8x?xf32>) permutation = [1, 0] -// TRANSPOSE-A: %[[B0:.*]] = linalg.matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<8x?xf32>, tensor<8x16xf32>) outs(%[[C_ZERO]] : tensor<?x16xf32>) -> tensor<?x16xf32> +// TRANSPOSE-A: %[[B0:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A_TRANSP]], %[[B]] : tensor<8x?xf32>, tensor<8x16xf32>) outs(%[[C_ZERO]] : tensor<?x16xf32>) -> tensor<?x16xf32> // TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty() : tensor<16x8xf32> // TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<8x16xf32>) outs(%[[B_TRANSP_INIT]] : tensor<16x8xf32>) permutation = [1, 0] -// TRANSPOSE-B: %[[B0:.*]] = linalg.matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<?x8xf32>, tensor<16x8xf32>) outs(%[[C_ZERO]] : tensor<?x16xf32>) -> tensor<?x16xf32> +// TRANSPOSE-B: %[[B0:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<?x8xf32>, tensor<16x8xf32>) outs(%[[C_ZERO]] : tensor<?x16xf32>) -> tensor<?x16xf32> // CHECK: return %[[B0]] : tensor<?x16xf32> // CHECK: } func.func @matmul_mixed(%A: tensor<?x8xf32>, %B: tensor<8x16xf32>) -> (tensor<?x16xf32>) { @@ -96,10 +110,10 @@ func.func @matmul_mixed(%A: tensor<?x8xf32>, %B: tensor<8x16xf32>) -> (tensor<?x // CHECK: %[[C_ZERO:.*]] = linalg.fill ins(%[[C0_F32]] : f32) outs(%[[C_INIT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32> // TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty() : tensor<2x8x16xf32> // TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<2x16x8xf32>) outs(%[[A_TRANSP_INIT]] : tensor<2x8x16xf32>) permutation = [0, 2, 1] -// TRANSPOSE-A: %[[C:.*]] = linalg.batch_matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<2x8x16xf32>, tensor<2x8x16xf32>) outs(%[[C_ZERO]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32> +// TRANSPOSE-A: %[[C:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] 
ins(%[[A_TRANSP]], %[[B]] : tensor<2x8x16xf32>, tensor<2x8x16xf32>) outs(%[[C_ZERO]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32> // TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty() : tensor<2x16x8xf32> // TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<2x8x16xf32>) outs(%[[B_TRANSP_INIT]] : tensor<2x16x8xf32>) permutation = [0, 2, 1] -// TRANSPOSE-B: %[[C:.*]] = linalg.batch_matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<2x16x8xf32>, tensor<2x16x8xf32>) outs(%[[C_ZERO]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32> +// TRANSPOSE-B: %[[C:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<2x16x8xf32>, tensor<2x16x8xf32>) outs(%[[C_ZERO]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32> // CHECK: return %[[C]] : tensor<2x16x16xf32> // CHECK: } func.func @batch_matmul_static(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>) -> (tensor<2x16x16xf32>) { @@ -127,12 +141,12 @@ func.func @batch_matmul_static(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>) - // TRANSPOSE-A: %[[A_DIM2:.*]] = tensor.dim %[[A]], %[[C2]] : tensor<?x?x?xf32> // TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty(%[[A_DIM0]], %[[A_DIM2]], %[[A_DIM1]]) : tensor<?x?x?xf32> // TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<?x?x?xf32>) outs(%[[A_TRANSP_INIT]] : tensor<?x?x?xf32>) permutation = [0, 2, 1] -// TRANSPOSE-A: %[[C:.*]] = linalg.batch_matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32> +// TRANSPOSE-A: %[[C:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] ins(%[[A_TRANSP]], %[[B]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32> // TRANSPOSE-B: %[[B_DIM0:.*]] = tensor.dim %[[B]], %[[C0]] : tensor<?x?x?xf32> // TRANSPOSE-B: %[[B_DIM1:.*]] = tensor.dim %[[B]], %[[C1]] : tensor<?x?x?xf32> // TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty(%[[B_DIM0]], %[[B_DIM2]], %[[B_DIM1]]) : tensor<?x?x?xf32> // TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<?x?x?xf32>) outs(%[[B_TRANSP_INIT]] : tensor<?x?x?xf32>) permutation = [0, 2, 1] -// TRANSPOSE-B: %[[C:.*]] = linalg.batch_matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32> +// TRANSPOSE-B: %[[C:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32> // CHECK: return %[[C]] : tensor<?x?x?xf32> // CHECK: } func.func @batch_matmul_dynamic(%A: tensor<?x?x?xf32>, %B: tensor<?x?x?xf32>) -> (tensor<?x?x?xf32>) { @@ -161,10 +175,10 @@ func.func @batch_matmul_dynamic(%A: tensor<?x?x?xf32>, %B: tensor<?x?x?xf32>) -> // CHECK: %[[C_ZERO:.*]] = linalg.fill ins(%[[C0_F32]] : f32) outs(%[[C_INIT]] : tensor<2x?x16xf32>) -> tensor<2x?x16xf32> // TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty(%[[A_DIM1]]) : tensor<2x8x?xf32> // TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<2x?x8xf32>) outs(%[[A_TRANSP_INIT]] : tensor<2x8x?xf32>) permutation = [0, 2, 1] -// TRANSPOSE-A: %[[B0:.*]] = linalg.batch_matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<2x8x?xf32>, tensor<2x8x16xf32>) outs(%[[C_ZERO]] : tensor<2x?x16xf32>) -> tensor<2x?x16xf32> +// TRANSPOSE-A: %[[B0:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] 
ins(%[[A_TRANSP]], %[[B]] : tensor<2x8x?xf32>, tensor<2x8x16xf32>) outs(%[[C_ZERO]] : tensor<2x?x16xf32>) -> tensor<2x?x16xf32> // TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty() : tensor<2x16x8xf32> // TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<2x8x16xf32>) outs(%[[B_TRANSP_INIT]] : tensor<2x16x8xf32>) permutation = [0, 2, 1] -// TRANSPOSE-B: %[[B0:.*]] = linalg.batch_matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<2x?x8xf32>, tensor<2x16x8xf32>) outs(%[[C_ZERO]] : tensor<2x?x16xf32>) -> tensor<2x?x16xf32> +// TRANSPOSE-B: %[[B0:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<2x?x8xf32>, tensor<2x16x8xf32>) outs(%[[C_ZERO]] : tensor<2x?x16xf32>) -> tensor<2x?x16xf32> // CHECK: return %[[B0]] : tensor<2x?x16xf32> // CHECK: } func.func @batch_matmul_mixed(%A: tensor<2x?x8xf32>, %B: tensor<2x8x16xf32>) -> (tensor<2x?x16xf32>) { diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir index 4eeae4c..25cbceb 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir @@ -61,6 +61,83 @@ module attributes {transform.with_named_sequence} { // ----- +// CHECK-LABEL: @float_mixed_precision_matmul +// CHECK-COUNT-3: vector.transfer_read +// CHECK-NOT: arith.extf +// CHECK: vector.contract {{.*}} : vector<1584x1584xbf16>, vector<1584x1584xbf16> into vector<1584x1584xf32> +func.func @float_mixed_precision_matmul(%A: memref<1584x1584xbf16>, %B: memref<1584x1584xbf16>, %C: memref<1584x1584xf32>) { + linalg.matmul ins(%A, %B: memref<1584x1584xbf16>, memref<1584x1584xbf16>) + outs(%C: memref<1584x1584xf32>) + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 { fold_type_extensions_into_contract } : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @vectorization_test_2 +func.func @vectorization_test_2(%A: memref<8x16xf32>, %B: memref<16x32xf32>, + %C: memref<8x32xf32>) { + // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<8x32x16xf32> + // CHECK: vector.multi_reduction <add>, %{{.*}}, {{.*}} [2] : vector<8x32x16xf32> to vector<8x32xf32> + linalg.matmul + ins(%A, %B: memref<8x16xf32>, memref<16x32xf32>) + outs(%C: memref<8x32xf32>) + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 { disable_multi_reduction_to_contract_patterns } : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @matmul_tensors +// CHECK-SAME: (%[[ARG0:.*]]: tensor<8x4xf32>, %[[ARG1:.*]]: tensor<4x12xf32>, +// CHECK-SAME: %[[ARG2:.*]]: tensor<8x12xf32>) -> tensor<8x12xf32> +func.func 
@matmul_tensors( + %arg0: tensor<8x4xf32>, %arg1: tensor<4x12xf32>, %arg2: tensor<8x12xf32>) + -> tensor<8x12xf32> { + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[V0:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], {{.*}} : tensor<8x4xf32>, vector<8x12x4xf32> + // CHECK-DAG: %[[V1:.*]] = vector.transfer_read %[[ARG1]][%[[C0]], %[[C0]]], {{.*}} : tensor<4x12xf32>, vector<8x12x4xf32> + // CHECK-DAG: %[[V2:.*]] = vector.transfer_read %[[ARG2]][%[[C0]], %[[C0]]], {{.*}} : tensor<8x12xf32>, vector<8x12xf32> + // + // The linalg matmul lowering gets expanded to a 3D reduction; canonicalization later + // converts it to a 2D contract. + // CHECK: %[[MUL:.*]] = arith.mulf %[[V0]], %[[V1]] : vector<8x12x4xf32> + // CHECK: %[[R:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[V2]] [2] : vector<8x12x4xf32> to vector<8x12xf32> + // CHECK: %[[W:.*]] = vector.transfer_write %[[R]], %[[ARG2]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x12xf32>, tensor<8x12xf32> + %0 = linalg.matmul ins(%arg0, %arg1: tensor<8x4xf32>, tensor<4x12xf32>) + outs(%arg2: tensor<8x12xf32>) + -> tensor<8x12xf32> + // CHECK: return %[[W]] : tensor<8x12xf32> + return %0 : tensor<8x12xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 { disable_multi_reduction_to_contract_patterns, disable_transfer_permutation_map_lowering_patterns } : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + // CHECK-LABEL: contraction_batch_matmul func.func @contraction_batch_matmul(%A: memref<1584x1584x1584xf32>, %B: memref<1584x1584x1584xf32>, %C: memref<1584x1584x1584xf32>) { // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<1584x1584x1584x1584xf32> @@ -115,6 +192,265 @@ module attributes {transform.with_named_sequence} { // ----- + +// CHECK-LABEL: @float_mixed_precision_matmul_as_contract +// CHECK-COUNT-3: vector.transfer_read +// CHECK-NOT: arith.extf +// CHECK: vector.contract {{.*}} : vector<24x12xbf16>, vector<12x25xbf16> into vector<24x25xf32> +// CHECK: vector.transfer_write +func.func @float_mixed_precision_matmul_as_contract(%A: tensor<24x12xbf16>, + %B: tensor<12x25xbf16>, + %C: tensor<24x25xf32>) -> tensor<24x25xf32> { + %0 = linalg.contract + indexing_maps = [affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (k, n)>, + affine_map<(m, n, k) -> (m, n)>] + ins(%A, %B : tensor<24x12xbf16>, tensor<12x25xbf16>) + outs(%C : tensor<24x25xf32>) -> tensor<24x25xf32> + func.return %0 : tensor<24x25xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 { fold_type_extensions_into_contract } : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @test_vectorize_fill +func.func @test_vectorize_fill(%A : memref<8x16xf32>, %arg0 : f32) { + // CHECK: %[[V:.*]] = vector.broadcast 
{{.*}} : f32 to vector<8x16xf32> + // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32> + linalg.fill ins(%arg0 : f32) outs(%A : memref<8x16xf32>) + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @test_vectorize_fill +func.func @test_vectorize_fill_0d(%A : memref<f32>, %arg0 : f32) { + // CHECK-SAME: (%[[M:.*]]: memref<f32>, %[[val:.*]]: f32) + // CHECK: %[[VEC:.*]] = vector.broadcast %[[val]] : f32 to vector<f32> + // CHECK: vector.transfer_write %[[VEC]], %[[M]][] : vector<f32>, memref<f32> + linalg.fill ins(%arg0 : f32) outs(%A : memref<f32>) + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @test_vectorize_copy +func.func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) { + // CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32> + // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32> + memref.copy %A, %B : memref<8x16xf32> to memref<8x16xf32> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @test_vectorize_copy_0d +func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) { + // CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>) + // CHECK: %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32> + // CHECK: %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32> + // CHECK: %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32> + // CHECK: vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32> + memref.copy %A, %B : memref<f32> to memref<f32> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func 
@test_vectorize_copy_complex +// CHECK-NOT: vector< +func.func @test_vectorize_copy_complex(%A : memref<8x16xcomplex<f32>>, %B : memref<8x16xcomplex<f32>>) { + memref.copy %A, %B : memref<8x16xcomplex<f32>> to memref<8x16xcomplex<f32>> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// Input identical as the test in vectorization.mlir. Output is different - +// vector sizes are inferred (rather than user-specified) and hence _no_ +// masking was used. + +func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { + %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> + return %pack : tensor<4x1x32x16x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// CHECK-LABEL: func.func @test_vectorize_pack( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x8x16xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { +// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_2]] {in_bounds = [true, true, true]} : tensor<32x8x16xf32>, vector<32x8x16xf32> +// CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> +// CHECK: %[[VAL_6:.*]] = vector.transpose %[[VAL_5]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32> +// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<4x1x32x16x2xf32> +// CHECK: %[[VAL_8:.*]] = vector.transfer_write %[[VAL_6]], %[[VAL_7]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true, true, true, true]} : vector<4x1x32x16x2xf32>, tensor<4x1x32x16x2xf32> +// CHECK: return %[[VAL_8]] : tensor<4x1x32x16x2xf32> + +// ----- + +func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { + %pad = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> + return %pack : tensor<32x4x1x16x2xf32> +} + +// CHECK-LABEL: func.func @test_vectorize_padded_pack( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x7x15xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { +// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = 
vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_2]] {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32> +// CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> +// CHECK: %[[VAL_6:.*]] = vector.transpose %[[VAL_5]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> +// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32> +// CHECK: %[[VAL_8:.*]] = vector.transfer_write %[[VAL_6]], %[[VAL_7]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> +// CHECK: return %[[VAL_8]] : tensor<32x4x1x16x2xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +func.func @vectorize_map(%arg0: memref<64xf32>, + %arg1: memref<64xf32>, %arg2: memref<64xf32>) { + linalg.map ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) + outs(%arg2 : memref<64xf32>) + (%in: f32, %in_0: f32) { + %0 = arith.addf %in, %in_0 : f32 + linalg.yield %0 : f32 + } + return +} +// CHECK-LABEL: func @vectorize_map +// CHECK: %[[LHS:.*]] = vector.transfer_read +// CHECK-NEXT: %[[RHS:.*]] = vector.transfer_read +// CHECK-NEXT: arith.addf %[[LHS]], %[[RHS]] : vector<64xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.map"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +func.func @vectorize_transpose(%arg0: memref<16x32x64xf32>, + %arg1: memref<32x64x16xf32>) { + linalg.transpose ins(%arg0 : memref<16x32x64xf32>) + outs(%arg1 : memref<32x64x16xf32>) permutation = [1, 2, 0] + return +} +// CHECK-LABEL: func @vectorize_transpose +// CHECK: vector.transpose +// CHECK-SAME: [1, 2, 0] : vector<16x32x64xf32> to vector<32x64x16xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.transpose"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +func.func @vectorize_reduce(%arg0: memref<16x32x64xf32>, + %arg1: memref<16x64xf32>) { + linalg.reduce ins(%arg0 : memref<16x32x64xf32>) + outs(%arg1 : memref<16x64xf32>) dimensions = [1] + (%in: f32, %init: f32) { + %0 = arith.addf %in, %init : f32 + linalg.yield %0 : f32 + } + return +} +// CHECK-LABEL: func @vectorize_reduce +// CHECK: vector.multi_reduction <add> +// CHECK-SAME: : vector<16x32x64xf32> to 
vector<16x64xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + #matmul_trait = { indexing_maps = [ affine_map<(m, n, k) -> (m, k)>, @@ -306,27 +642,6 @@ module attributes {transform.with_named_sequence} { // ----- -// CHECK-LABEL: func @vectorization_test_2 -func.func @vectorization_test_2(%A: memref<8x16xf32>, %B: memref<16x32xf32>, - %C: memref<8x32xf32>) { - // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<8x32x16xf32> - // CHECK: vector.multi_reduction <add>, %{{.*}}, {{.*}} [2] : vector<8x32x16xf32> to vector<8x32xf32> - linalg.matmul - ins(%A, %B: memref<8x16xf32>, memref<16x32xf32>) - outs(%C: memref<8x32xf32>) - return -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 { disable_multi_reduction_to_contract_patterns } : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- // CHECK-LABEL: func @test_vectorize_scalar_input func.func @test_vectorize_scalar_input(%A : memref<8x16xf32>, %arg0 : f32) { @@ -427,104 +742,6 @@ module attributes {transform.with_named_sequence} { // ----- -// CHECK-LABEL: func @test_vectorize_fill -func.func @test_vectorize_fill(%A : memref<8x16xf32>, %arg0 : f32) { - // CHECK: %[[V:.*]] = vector.broadcast {{.*}} : f32 to vector<8x16xf32> - // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32> - linalg.fill ins(%arg0 : f32) outs(%A : memref<8x16xf32>) - return -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - -// CHECK-LABEL: func @test_vectorize_fill -func.func @test_vectorize_fill_0d(%A : memref<f32>, %arg0 : f32) { - // CHECK-SAME: (%[[M:.*]]: memref<f32>, %[[val:.*]]: f32) - // CHECK: %[[VEC:.*]] = vector.broadcast %[[val]] : f32 to vector<f32> - // CHECK: vector.transfer_write %[[VEC]], %[[M]][] : vector<f32>, memref<f32> - linalg.fill ins(%arg0 : f32) outs(%A : memref<f32>) - return -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> 
!transform.any_op - transform.yield - } -} - -// ----- - -// CHECK-LABEL: func @test_vectorize_copy -func.func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) { - // CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32> - // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32> - memref.copy %A, %B : memref<8x16xf32> to memref<8x16xf32> - return -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - -// CHECK-LABEL: func @test_vectorize_copy_0d -func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) { - // CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>) - // CHECK: %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32> - // CHECK: %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32> - // CHECK: %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32> - // CHECK: vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32> - memref.copy %A, %B : memref<f32> to memref<f32> - return -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - -// CHECK-LABEL: func @test_vectorize_copy_complex -// CHECK-NOT: vector< -func.func @test_vectorize_copy_complex(%A : memref<8x16xcomplex<f32>>, %B : memref<8x16xcomplex<f32>>) { - memref.copy %A, %B : memref<8x16xcomplex<f32>> to memref<8x16xcomplex<f32>> - return -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - // CHECK-LABEL: func @test_vectorize_trailing_index // CHECK-SAME: (%[[ARG0:.*]]: memref<1x2x4x8xindex>) func.func @test_vectorize_trailing_index(%arg0: memref<1x2x4x8xindex>) { @@ -855,40 +1072,6 @@ module attributes {transform.with_named_sequence} { // ----- -// CHECK-LABEL: func @matmul_tensors -// CHECK-SAME: (%[[ARG0:.*]]: tensor<8x4xf32>, %[[ARG1:.*]]: tensor<4x12xf32>, -// CHECK-SAME: %[[ARG2:.*]]: tensor<8x12xf32>) -> tensor<8x12xf32> -func.func @matmul_tensors( - %arg0: tensor<8x4xf32>, %arg1: tensor<4x12xf32>, %arg2: tensor<8x12xf32>) - -> tensor<8x12xf32> { - // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-DAG: %[[V0:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], {{.*}} : tensor<8x4xf32>, vector<8x12x4xf32> - // CHECK-DAG: %[[V1:.*]] = 
vector.transfer_read %[[ARG1]][%[[C0]], %[[C0]]], {{.*}} : tensor<4x12xf32>, vector<8x12x4xf32> - // CHECK-DAG: %[[V2:.*]] = vector.transfer_read %[[ARG2]][%[[C0]], %[[C0]]], {{.*}} : tensor<8x12xf32>, vector<8x12xf32> - // - // linalg matmul lowers gets expanded to a 3D reduction, canonicalization later - // convert it to a 2D contract. - // CHECK: %[[MUL:.*]] = arith.mulf %[[V0]], %[[V1]] : vector<8x12x4xf32> - // CHECK: %[[R:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[V2]] [2] : vector<8x12x4xf32> to vector<8x12xf32> - // CHECK: %[[W:.*]] = vector.transfer_write %[[R]], %[[ARG2]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x12xf32>, tensor<8x12xf32> - %0 = linalg.matmul ins(%arg0, %arg1: tensor<8x4xf32>, tensor<4x12xf32>) - outs(%arg2: tensor<8x12xf32>) - -> tensor<8x12xf32> - // CHECK: return %[[W]] : tensor<8x12xf32> - return %0 : tensor<8x12xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 { disable_multi_reduction_to_contract_patterns, disable_transfer_permutation_map_lowering_patterns } : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - // CHECK-LABEL: func @sum_exp func.func @sum_exp(%input: tensor<4x16x8xf32>, %output: tensor<4x16xf32>) -> tensor<4x16xf32> @@ -914,7 +1097,6 @@ func.func @sum_exp(%input: tensor<4x16x8xf32>, %output: tensor<4x16xf32>) return %0 : tensor<4x16xf32> } - module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %3 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op @@ -993,7 +1175,6 @@ func.func @red_maximumf_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> { return %red : tensor<4xf32> } - module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %3 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op @@ -1428,78 +1609,6 @@ module attributes {transform.with_named_sequence} { // ----- -func.func @vectorize_map(%arg0: memref<64xf32>, - %arg1: memref<64xf32>, %arg2: memref<64xf32>) { - linalg.map ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) - outs(%arg2 : memref<64xf32>) - (%in: f32, %in_0: f32) { - %0 = arith.addf %in, %in_0 : f32 - linalg.yield %0 : f32 - } - return -} -// CHECK-LABEL: func @vectorize_map -// CHECK: %[[LHS:.*]] = vector.transfer_read -// CHECK-NEXT: %[[RHS:.*]] = vector.transfer_read -// CHECK-NEXT: arith.addf %[[LHS]], %[[RHS]] : vector<64xf32> - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.map"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - -func.func @vectorize_transpose(%arg0: memref<16x32x64xf32>, - %arg1: memref<32x64x16xf32>) { - 
linalg.transpose ins(%arg0 : memref<16x32x64xf32>) - outs(%arg1 : memref<32x64x16xf32>) permutation = [1, 2, 0] - return -} -// CHECK-LABEL: func @vectorize_transpose -// CHECK: vector.transpose -// CHECK-SAME: [1, 2, 0] : vector<16x32x64xf32> to vector<32x64x16xf32> - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.transpose"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - -func.func @vectorize_reduce(%arg0: memref<16x32x64xf32>, - %arg1: memref<16x64xf32>) { - linalg.reduce ins(%arg0 : memref<16x32x64xf32>) - outs(%arg1 : memref<16x64xf32>) dimensions = [1] - (%in: f32, %init: f32) { - %0 = arith.addf %in, %init : f32 - linalg.yield %0 : f32 - } - return -} -// CHECK-LABEL: func @vectorize_reduce -// CHECK: vector.multi_reduction <add> -// CHECK-SAME: : vector<16x32x64xf32> to vector<16x64xf32> - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// ----- - // This is a regression test. This IR cannot be vectorized, but // structured.vectorize_children_and_apply_patterns should nevertheless succeed. @@ -1715,65 +1824,77 @@ module attributes {transform.with_named_sequence} { // ----- -// Input identical as the test in vectorization.mlir. Output is different - -// vector sizes are inferred (rather than user-specified) and hence _no_ -// masking was used. 
- -func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { - %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> - return %pack : tensor<4x1x32x16x2xf32> +// CHECK-LABEL: func @float_mixed_precision_matmul_as_generic +// CHECK-COUNT-3: vector.transfer_read +// CHECK-NOT: arith.extf +// CHECK: vector.contract {{.*}} : vector<8x16xbf16>, vector<16x32xbf16> into vector<8x32xf32> +// CHECK: vector.transfer_write +func.func @float_mixed_precision_matmul_as_generic(%A: memref<8x16xbf16>, %B: memref<16x32xbf16>, + %C: memref<8x32xf32>) { + linalg.generic { + indexing_maps = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (k, n)>, + affine_map<(m, n, k) -> (m, n)> + ], + iterator_types = ["parallel", "parallel", "reduction"] + } + ins(%A, %B : memref<8x16xbf16>, memref<16x32xbf16>) + outs(%C : memref<8x32xf32>) { + ^bb(%in: bf16, %in_0: bf16, %c: f32) : + %a = arith.extf %in : bf16 to f32 + %b = arith.extf %in_0 : bf16 to f32 + %d = arith.mulf %a, %b: f32 + %e = arith.addf %c, %d: f32 + linalg.yield %e : f32 + } + return } module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 { fold_type_extensions_into_contract } : (!transform.any_op) -> !transform.any_op transform.yield } } -// CHECK-LABEL: func.func @test_vectorize_pack( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x8x16xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { -// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_2]] {in_bounds = [true, true, true]} : tensor<32x8x16xf32>, vector<32x8x16xf32> -// CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> -// CHECK: %[[VAL_6:.*]] = vector.transpose %[[VAL_5]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32> -// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<4x1x32x16x2xf32> -// CHECK: %[[VAL_8:.*]] = vector.transfer_write %[[VAL_6]], %[[VAL_7]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true, true, true, true]} : vector<4x1x32x16x2xf32>, tensor<4x1x32x16x2xf32> -// CHECK: return %[[VAL_8]] : tensor<4x1x32x16x2xf32> - // ----- -// Input identical as the test in vectorization.mlir. Output is different - -// vector sizes are inferred (rather than user-specified) and hence _no_ -// masking was used. 
- -func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { - %pad = arith.constant 0.000000e+00 : f32 - %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> - return %pack : tensor<32x4x1x16x2xf32> +// CHECK-LABEL: func @integer_mixed_precision_matmul_as_generic +// CHECK-COUNT-3: vector.transfer_read +// CHECK-NOT: arith.extsi +// CHECK: vector.contract {{.*}} : vector<8x16xi8>, vector<16x32xi8> into vector<8x32xi32> +// CHECK: vector.transfer_write +func.func @integer_mixed_precision_matmul_as_generic(%A: memref<8x16xi8>, %B: memref<16x32xi8>, + %C: memref<8x32xi32>) { + linalg.generic { + indexing_maps = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (k, n)>, + affine_map<(m, n, k) -> (m, n)> + ], + iterator_types = ["parallel", "parallel", "reduction"] + } + ins(%A, %B : memref<8x16xi8>, memref<16x32xi8>) + outs(%C : memref<8x32xi32>) { + ^bb(%in: i8, %in_0: i8, %c: i32) : + %a = arith.extsi %in : i8 to i32 + %b = arith.extsi %in_0 : i8 to i32 + %d = arith.muli %a, %b: i32 + %e = arith.addi %c, %d: i32 + linalg.yield %e : i32 + } + return } -// CHECK-LABEL: func.func @test_vectorize_padded_pack( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x7x15xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { -// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_2]] {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32> -// CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> -// CHECK: %[[VAL_6:.*]] = vector.transpose %[[VAL_5]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> -// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32> -// CHECK: %[[VAL_8:.*]] = vector.transfer_write %[[VAL_6]], %[[VAL_7]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> -// CHECK: return %[[VAL_8]] : tensor<32x4x1x16x2xf32> - module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 { fold_type_extensions_into_contract } : (!transform.any_op) -> !transform.any_op transform.yield } } + diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir index d41d861..095810f 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir @@ -940,31 +940,100 @@ module attributes {transform.with_named_sequence} { 
///---------------------------------------------------------------------------------------- // CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack -// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<?x?x16x2xf32> -func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> { -// CHECK: %[[C0:.*]] = arith.constant 0 -// CHECK: %[[C01:.*]] = arith.constant 0 -// CHECK: %[[C02:.*]] = arith.constant 0 -// CHECK: %[[DIM_0:.*]] = tensor.dim %[[ARG_1]], %[[C02]] : tensor<?x?x16x2xf32> -// CHECK: %[[C1:.*]] = arith.constant 1 -// CHECK: %[[DIM6:.*]] = tensor.dim %[[ARG_1]], %[[C1]] : tensor<?x?x16x2xf32> -// CHECK: %[[CNST16:.*]] = arith.constant 16 : index -// CHECK: %[[CNST2:.*]] = arith.constant 2 : index -// CHECK: %[[readMsk0:.*]] = vector.create_mask %[[DIM_0]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1> -// CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32> -// CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32> -// CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32> -// CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1> -// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[ARG_0]] -// CHECK: return %[[write0]] - %ret = linalg.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor<?x?x16x2xf32> -> tensor<?x?xf32> - return %ret : tensor<?x?xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<?x?xf32>, +// CHECK-SAME: %[[SRC:.*]]: tensor<?x?x16x2xf32> +func.func @test_vectorize_dynamic_shapes_unpack(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> { + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[C0_1:.*]] = arith.constant 0 : index + // CHECK: %[[DIM_0:.*]] = tensor.dim %[[SRC]], %[[C0_1]] : tensor<?x?x16x2xf32> + // CHECK: %[[C1:.*]] = arith.constant 1 + // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[C1]] : tensor<?x?x16x2xf32> + // CHECK: %[[CNST16:.*]] = arith.constant 16 : index + // CHECK: %[[CNST2:.*]] = arith.constant 2 : index + // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM_0]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1> + // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32> + // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32> + // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x16xf32> to vector<4x16xf32> + // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x16xi1> + // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]] + // CHECK: return %[[WRITE]] + %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32> + return %ret : tensor<?x?xf32> +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [2, 1, 16, 2] : !transform.any_op + 
transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack_scalable_vec +// CHECK-SAME: %[[DEST:.*]]: tensor<?x?xf32>, +// CHECK-SAME: %[[SRC:.*]]: tensor<?x?x16x2xf32> +func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> { + // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 + // CHECK: %[[C01:.*]] = arith.constant 0 + // CHECK: %[[C02:.*]] = arith.constant 0 + // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor<?x?x16x2xf32> + // CHECK: %[[CNST14:.*]] = arith.constant 1 + // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[CNST14]] : tensor<?x?x16x2xf32> + // CHECK: %[[CNST16:.*]] = arith.constant 16 : index + // CHECK: %[[CNST2:.*]] = arith.constant 2 : index + // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x[16]x2xi1> + // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32> + // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x[16]x2xf32> to vector<2x2x1x[16]xf32> + // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x[16]xf32> to vector<4x[16]xf32> + // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x[16]xi1> + // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]] + // CHECK: return %[[WRITE]] + %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32> + return %ret : tensor<?x?xf32> +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2] : !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size +// CHECK-SAME: %[[DEST:.*]]: tensor<?x?xf32>, +// CHECK-SAME: %[[SRC:.*]]: tensor<?x?x?x2xf32> +func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size(%dest: tensor<?x?xf32>, %src: tensor<?x?x?x2xf32>) -> tensor<?x?xf32> { + // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 + // CHECK: %[[C01:.*]] = arith.constant 0 + // CHECK: %[[C02:.*]] = arith.constant 0 + // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor<?x?x?x2xf32> + // CHECK: %[[C1_2:.*]] = arith.constant 1 + // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[C1_2]] : tensor<?x?x?x2xf32> + // CHECK: %[[C2:.*]] = arith.constant 2 : index + // CHECK: %[[DIM_2:.*]] = tensor.dim %[[SRC]], %[[C2]] : tensor<?x?x?x2xf32> + // CHECK: %[[C2_1:.*]] = arith.constant 2 : index + // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[DIM_2]], %[[C2_1]] : vector<2x1x[16]x2xi1> + // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x?x2xf32>, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32> + // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x[16]x2xf32> to vector<2x2x1x[16]xf32> + // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x[16]xf32> to vector<4x[16]xf32> + // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x[16]xi1> + // CHECK: 
%[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]] + // CHECK: return %[[WRITE]] + + %vs = vector.vscale + %c16 = arith.constant 16 : index + %tile_size = arith.muli %vs, %c16 : index + + %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [%tile_size, 2] into %dest : tensor<?x?x?x2xf32> -> tensor<?x?xf32> + return %ret : tensor<?x?xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [4, 16] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2] : !transform.any_op transform.yield } } @@ -997,7 +1066,7 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [16, 8, 32, 16] : !transform.any_op transform.yield } } @@ -1022,7 +1091,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16] : !transform.any_op transform.yield } } @@ -1047,7 +1116,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16] : !transform.any_op transform.yield } } diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index f86fb38..4a7176e 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -1168,6 +1168,106 @@ func.func @canonicalize_broadcast_shapecast_both_possible(%arg0: vector<1xf32>) // ----- +// CHECK-LABEL: func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim +// CHECK-NOT: vector.shape_cast +// CHECK: vector.broadcast {{.+}} : vector<2xf32> to vector<32x2xf32> +func.func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim(%arg0 : vector<2xf32>) -> vector<32x2xf32> { + %0 = vector.shape_cast %arg0 : vector<2xf32> to vector<1x2xf32> + %1 = vector.broadcast %0 : vector<1x2xf32> to vector<32x2xf32> + return %1 : vector<32x2xf32> +} + +// ----- + +// CHECK-LABEL: func.func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim2( +// CHECK-SAME: %[[ARG0:.*]]: vector<2x1xf32>) -> vector<32x2x1xf32> { +// CHECK: %[[VAL_0:.*]] = vector.broadcast %[[ARG0]] : vector<2x1xf32> to vector<32x2x1xf32> +// CHECK: return %[[VAL_0]] : 
vector<32x2x1xf32> +// CHECK: } +func.func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim2(%arg0 : vector<2x1xf32>) -> vector<32x2x1xf32> { + %0 = vector.shape_cast %arg0 : vector<2x1xf32> to vector<1x2x1xf32> + %1 = vector.broadcast %0 : vector<1x2x1xf32> to vector<32x2x1xf32> + return %1 : vector<32x2x1xf32> +} + +// ----- + +// CHECK-LABEL: func.func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim3( +// CHECK-SAME: %[[ARG0:.*]]: vector<2x1xf32>) -> vector<32x2x4xf32> { +// CHECK: %[[VAL_0:.*]] = vector.broadcast %[[ARG0]] : vector<2x1xf32> to vector<32x2x4xf32> +// CHECK: return %[[VAL_0]] : vector<32x2x4xf32> +// CHECK: } +func.func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim3(%arg0 : vector<2x1xf32>) -> vector<32x2x4xf32> { + %0 = vector.shape_cast %arg0 : vector<2x1xf32> to vector<1x2x1xf32> + %1 = vector.broadcast %0 : vector<1x2x1xf32> to vector<32x2x4xf32> + return %1 : vector<32x2x4xf32> +} + +// ----- + +// CHECK-LABEL: func.func @canonicalize_shapecast_broadcast_to_broadcast_remove_leading_dim( +// CHECK-SAME: %[[ARG0:.*]]: vector<1x2xf32>) -> vector<32x2xf32> { +// CHECK: %[[VAL_0:.*]] = vector.broadcast %[[ARG0]] : vector<1x2xf32> to vector<32x2xf32> +// CHECK: return %[[VAL_0]] : vector<32x2xf32> +// CHECK: } +func.func @canonicalize_shapecast_broadcast_to_broadcast_remove_leading_dim(%arg0 : vector<1x2xf32>) -> vector<32x2xf32> { + %0 = vector.shape_cast %arg0 : vector<1x2xf32> to vector<2xf32> + %1 = vector.broadcast %0 : vector<2xf32> to vector<32x2xf32> + return %1 : vector<32x2xf32> +} + +// ----- + +// CHECK-LABEL: func @negative_canonicalize_shapecast_broadcast_invalid_shape +// CHECK: vector.shape_cast {{.+}} : vector<64xf32> to vector<4x16xf32> +// CHECK: vector.broadcast {{.+}} : vector<4x16xf32> to vector<2x4x16xf32> +func.func @negative_canonicalize_shapecast_broadcast_invalid_shape(%arg0 : vector<64xf32>) -> vector<2x4x16xf32> { + %0 = vector.shape_cast %arg0 : vector<64xf32> to vector<4x16xf32> + %1 = vector.broadcast %0 : vector<4x16xf32> to vector<2x4x16xf32> + return %1 : vector<2x4x16xf32> +} + +// ----- + +// CHECK-LABEL: func @negative_canonicalize_shapecast_broadcast_invalid_broadcasted_dims +// CHECK: vector.shape_cast {{.+}} : vector<2x1xf32> to vector<1x2xf32> +// CHECK: vector.broadcast {{.+}} : vector<1x2xf32> to vector<2x2xf32> +func.func @negative_canonicalize_shapecast_broadcast_invalid_broadcasted_dims(%arg0 : vector<2x1xf32>) -> vector<2x2xf32> { + %0 = vector.shape_cast %arg0 : vector<2x1xf32> to vector<1x2xf32> + %1 = vector.broadcast %0 : vector<1x2xf32> to vector<2x2xf32> + return %1 : vector<2x2xf32> +} + +// ----- + +// CHECK-LABEL: func.func @negative_canonicalize_shapecast_broadcast_to_broadcast_append_dim( +// CHECK-SAME: %[[ARG0:.*]]: vector<2xf32>) -> vector<2x4xf32> { +// CHECK: %[[VAL_0:.*]] = vector.shape_cast %[[ARG0]] : vector<2xf32> to vector<2x1xf32> +// CHECK: %[[VAL_1:.*]] = vector.broadcast %[[VAL_0]] : vector<2x1xf32> to vector<2x4xf32> +// CHECK: return %[[VAL_1]] : vector<2x4xf32> +// CHECK: } +func.func @negative_canonicalize_shapecast_broadcast_to_broadcast_append_dim(%arg0 : vector<2xf32>) -> vector<2x4xf32> { + %0 = vector.shape_cast %arg0 : vector<2xf32> to vector<2x1xf32> + %1 = vector.broadcast %0 : vector<2x1xf32> to vector<2x4xf32> + return %1 : vector<2x4xf32> +} + +// ----- + +// CHECK-LABEL: func.func @negative_canonicalize_shapecast_broadcast_to_broadcast_remove_trailing_dim( +// CHECK-SAME: %[[ARG0:.*]]: vector<2x1xf32>) -> vector<32x2xf32> { +// CHECK: %[[VAL_0:.*]] = 
vector.shape_cast %[[ARG0]] : vector<2x1xf32> to vector<2xf32> +// CHECK: %[[VAL_1:.*]] = vector.broadcast %[[VAL_0]] : vector<2xf32> to vector<32x2xf32> +// CHECK: return %[[VAL_1]] : vector<32x2xf32> +// CHECK: } +func.func @negative_canonicalize_shapecast_broadcast_to_broadcast_remove_trailing_dim(%arg0 : vector<2x1xf32>) -> vector<32x2xf32> { + %0 = vector.shape_cast %arg0 : vector<2x1xf32> to vector<2xf32> + %1 = vector.broadcast %0 : vector<2xf32> to vector<32x2xf32> + return %1 : vector<32x2xf32> +} + +// ----- + // CHECK-LABEL: fold_vector_transfer_masks func.func @fold_vector_transfer_masks(%A: memref<?x?xf32>) -> (vector<4x8xf32>, vector<4x[4]xf32>) { // CHECK: %[[C0:.+]] = arith.constant 0 : index diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index c21de56..211e16d 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1305,6 +1305,26 @@ func.func @store_memref_index_mismatch(%base : memref<?xf32>, %value : vector<16 // ----- +//===----------------------------------------------------------------------===// +// vector.maskedload +//===----------------------------------------------------------------------===// + +func.func @maskedload_negative_alignment(%base: memref<4xi32>, %mask: vector<32xi1>, %pass: vector<1xi32>, %index: index) { + // expected-error@below {{'vector.maskedload' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + %val = vector.maskedload %base[%index], %mask, %pass { alignment = -1 } : memref<4xi32>, vector<32xi1>, vector<1xi32> into vector<1xi32> + return +} + +// ----- + +func.func @maskedload_nonpoweroftwo_alignment(%base: memref<4xi32>, %mask: vector<32xi1>, %pass: vector<1xi32>, %index: index) { + // expected-error@below {{'vector.maskedload' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + %val = vector.maskedload %base[%index], %mask, %pass { alignment = 3 } : memref<4xi32>, vector<32xi1>, vector<1xi32> into vector<1xi32> + return +} + +// ----- + func.func @maskedload_base_type_mismatch(%base: memref<?xf64>, %mask: vector<16xi1>, %pass: vector<16xf32>) { %c0 = arith.constant 0 : index // expected-error@+1 {{'vector.maskedload' op base and result element type should match}} @@ -1336,6 +1356,26 @@ func.func @maskedload_memref_mismatch(%base: memref<?xf32>, %mask: vector<16xi1> // ----- +//===----------------------------------------------------------------------===// +// vector.maskedstore +//===----------------------------------------------------------------------===// + +func.func @maskedstore_negative_alignment(%base: memref<4xi32>, %mask: vector<32xi1>, %value: vector<1xi32>, %index: index) { + // expected-error@below {{'vector.maskedstore' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + vector.maskedstore %base[%index], %mask, %value { alignment = -1 } : memref<4xi32>, vector<32xi1>, vector<1xi32> into vector<1xi32> + return +} + +// ----- + +func.func @maskedstore_nonpoweroftwo_alignment(%base: memref<4xi32>, %mask: vector<32xi1>, %value: vector<1xi32>, %index: index) { + // expected-error@below {{'vector.maskedstore' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a 
power of two > 0}} + vector.maskedstore %base[%index], %mask, %value { alignment = 3 } : memref<4xi32>, vector<32xi1>, vector<1xi32> into vector<1xi32> + return +} + +// ----- + func.func @maskedstore_base_type_mismatch(%base: memref<?xf64>, %mask: vector<16xi1>, %value: vector<16xf32>) { %c0 = arith.constant 0 : index // expected-error@+1 {{'vector.maskedstore' op base and valueToStore element type should match}} @@ -1912,8 +1952,7 @@ func.func @vector_load(%src : memref<?xi8>) { // ----- -func.func @invalid_load_alignment(%memref: memref<4xi32>) { - %c0 = arith.constant 0 : index +func.func @invalid_load_alignment(%memref: memref<4xi32>, %c0: index) { // expected-error @below {{'vector.load' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} %val = vector.load %memref[%c0] { alignment = -1 } : memref<4xi32>, vector<4xi32> return @@ -1921,6 +1960,14 @@ func.func @invalid_load_alignment(%memref: memref<4xi32>) { // ----- +func.func @invalid_load_alignment(%memref: memref<4xi32>, %c0: index) { + // expected-error @below {{'vector.load' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + %val = vector.load %memref[%c0] { alignment = 3 } : memref<4xi32>, vector<4xi32> + return +} + +// ----- + //===----------------------------------------------------------------------===// // vector.store //===----------------------------------------------------------------------===// @@ -1934,8 +1981,15 @@ func.func @vector_store(%dest : memref<?xi8>, %vec : vector<16x16xi8>) { // ----- -func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: vector<4xi32>) { - %c0 = arith.constant 0 : index +func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: vector<4xi32>, %c0: index) { + // expected-error @below {{'vector.store' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + vector.store %val, %memref[%c0] { alignment = -1 } : memref<4xi32>, vector<4xi32> + return +} + +// ----- + +func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: vector<4xi32>, %c0: index) { // expected-error @below {{'vector.store' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} vector.store %val, %memref[%c0] { alignment = 3 } : memref<4xi32>, vector<4xi32> return diff --git a/mlir/test/Dialect/Vector/vector-sink.mlir b/mlir/test/Dialect/Vector/vector-sink.mlir index ef881ba..577b06d 100644 --- a/mlir/test/Dialect/Vector/vector-sink.mlir +++ b/mlir/test/Dialect/Vector/vector-sink.mlir @@ -40,7 +40,7 @@ func.func @broadcast_scalar_with_bcast_scalable(%arg1: index, %arg2: index) -> v // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADD]] : index to vector<1x4xindex> // CHECK: return %[[BCAST]] : vector<1x4xindex> func.func @broadcast_scalar_with_bcast_and_splat(%arg1: index, %arg2: index) -> vector<1x4xindex> { - %0 = vector.splat %arg1 : vector<1x4xindex> + %0 = vector.broadcast %arg1 : index to vector<1x4xindex> %1 = vector.broadcast %arg2 : index to vector<1x4xindex> %2 = arith.addi %0, %1 : vector<1x4xindex> return %2 : vector<1x4xindex> @@ -53,7 +53,7 @@ func.func @broadcast_scalar_with_bcast_and_splat(%arg1: index, %arg2: index) -> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADD]] : index to 
vector<1x[4]xindex> // CHECK: return %[[BCAST]] : vector<1x[4]xindex> func.func @broadcast_scalar_with_bcast_and_splat_scalable(%arg1: index, %arg2: index) -> vector<1x[4]xindex> { - %0 = vector.splat %arg1 : vector<1x[4]xindex> + %0 = vector.broadcast %arg1 : index to vector<1x[4]xindex> %1 = vector.broadcast %arg2 : index to vector<1x[4]xindex> %2 = arith.addi %0, %1 : vector<1x[4]xindex> return %2 : vector<1x[4]xindex> @@ -94,12 +94,12 @@ func.func @broadcast_vector_scalable(%arg1: vector<[4]xf32>, %arg2: vector<[4]xf // CHECK-LABEL: func.func @broadcast_scalar_and_vec( // CHECK-SAME: %[[ARG1:.*]]: index, // CHECK-SAME: %[[ARG2:.*]]: vector<4xindex>) -> vector<1x4xindex> { -// CHECK: %[[SPLAT:.*]] = vector.splat %[[ARG1]] : vector<1x4xindex> +// CHECK: %[[SPLAT:.*]] = vector.broadcast %[[ARG1]] : index to vector<1x4xindex> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG2]] : vector<4xindex> to vector<1x4xindex> // CHECK: %[[ADD:.*]] = arith.addi %[[SPLAT]], %[[BCAST]] : vector<1x4xindex> // CHECK: return %[[ADD]] : vector<1x4xindex> func.func @broadcast_scalar_and_vec(%arg1: index, %arg2: vector<4xindex>) -> vector<1x4xindex> { - %0 = vector.splat %arg1 : vector<1x4xindex> + %0 = vector.broadcast %arg1 : index to vector<1x4xindex> %1 = vector.broadcast %arg2 : vector<4xindex> to vector<1x4xindex> %2 = arith.addi %0, %1 : vector<1x4xindex> return %2 : vector<1x4xindex> @@ -108,12 +108,12 @@ func.func @broadcast_scalar_and_vec(%arg1: index, %arg2: vector<4xindex>) -> vec // CHECK-LABEL: func.func @broadcast_scalar_and_vec_scalable( // CHECK-SAME: %[[ARG1:.*]]: index, // CHECK-SAME: %[[ARG2:.*]]: vector<[4]xindex>) -> vector<1x[4]xindex> { -// CHECK: %[[SPLAT:.*]] = vector.splat %[[ARG1]] : vector<1x[4]xindex> +// CHECK: %[[SPLAT:.*]] = vector.broadcast %[[ARG1]] : index to vector<1x[4]xindex> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG2]] : vector<[4]xindex> to vector<1x[4]xindex> // CHECK: %[[ADD:.*]] = arith.addi %[[SPLAT]], %[[BCAST]] : vector<1x[4]xindex> // CHECK: return %[[ADD]] : vector<1x[4]xindex> func.func @broadcast_scalar_and_vec_scalable(%arg1: index, %arg2: vector<[4]xindex>) -> vector<1x[4]xindex> { - %0 = vector.splat %arg1 : vector<1x[4]xindex> + %0 = vector.broadcast %arg1 : index to vector<1x[4]xindex> %1 = vector.broadcast %arg2 : vector<[4]xindex> to vector<1x[4]xindex> %2 = arith.addi %0, %1 : vector<1x[4]xindex> return %2 : vector<1x[4]xindex> @@ -787,7 +787,7 @@ func.func @negative_extract_load_scalable(%arg0: memref<?xf32>, %arg1: index) -> // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32) func.func @store_splat(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) { // CHECK: memref.store %[[ARG2]], %[[ARG0]][%[[ARG1]]] : memref<?xf32> - %0 = vector.splat %arg2 : vector<1xf32> + %0 = vector.broadcast %arg2 : f32 to vector<1xf32> vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<1xf32> return } @@ -813,9 +813,9 @@ func.func @store_broadcast_1d_to_2d(%arg0: memref<?x?xf32>, %arg1: index, %arg2: // CHECK-LABEL: @negative_store_scalable // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32) func.func @negative_store_scalable(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) { -// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<[1]xf32> +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<[1]xf32> // CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xf32>, vector<[1]xf32> - %0 = vector.splat %arg2 : vector<[1]xf32> + %0 = vector.broadcast %arg2 : f32 to vector<[1]xf32> 
vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<[1]xf32> return } @@ -823,9 +823,9 @@ func.func @negative_store_scalable(%arg0: memref<?xf32>, %arg1: index, %arg2: f3 // CHECK-LABEL: @negative_store_memref_of_vec // CHECK-SAME: (%[[ARG0:.*]]: memref<?xvector<1xf32>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32) func.func @negative_store_memref_of_vec(%arg0: memref<?xvector<1xf32>>, %arg1: index, %arg2: f32) { -// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<1xf32> +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<1xf32> // CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xvector<1xf32>>, vector<1xf32> - %0 = vector.splat %arg2 : vector<1xf32> + %0 = vector.broadcast %arg2 : f32 to vector<1xf32> vector.store %0, %arg0[%arg1] : memref<?xvector<1xf32>>, vector<1xf32> return } @@ -833,9 +833,9 @@ func.func @negative_store_memref_of_vec(%arg0: memref<?xvector<1xf32>>, %arg1: i // CHECK-LABEL: @negative_store_more_than_one_element // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32) func.func @negative_store_more_than_one_element(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) { -// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<4xf32> +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<4xf32> // CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xf32>, vector<4xf32> - %0 = vector.splat %arg2 : vector<4xf32> + %0 = vector.broadcast %arg2 : f32 to vector<4xf32> vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<4xf32> return } @@ -843,10 +843,10 @@ func.func @negative_store_more_than_one_element(%arg0: memref<?xf32>, %arg1: ind // CHECK-LABEL: @negative_store_no_single_use // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32) func.func @negative_store_no_single_use(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) -> vector<1xf32> { -// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<1xf32> +// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<1xf32> // CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xf32>, vector<1xf32> // CHECK: return %[[RES:.*]] : vector<1xf32> - %0 = vector.splat %arg2 : vector<1xf32> + %0 = vector.broadcast %arg2 : f32 to vector<1xf32> vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<1xf32> return %0 : vector<1xf32> } diff --git a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir index 1b54d54..45afbff 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir @@ -285,19 +285,19 @@ func.func @transfer_read_permutations(%mem_0 : memref<?x?xf32>, %mem_1 : memref< %c0 = arith.constant 0 : index // CHECK: %[[MASK0:.*]] = vector.broadcast %{{.*}} : i1 to vector<14x7xi1> - %mask0 = vector.splat %m : vector<14x7xi1> + %mask0 = vector.broadcast %m : i1 to vector<14x7xi1> %0 = vector.transfer_read %mem_1[%c0, %c0, %c0, %c0], %cst, %mask0 {in_bounds = [true, false, true, true], permutation_map = #map0} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32> // CHECK: vector.transfer_read {{.*}} %[[MASK0]] {in_bounds = [false, true, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<14x7x8x16xf32> // CHECK: vector.transpose %{{.*}}, [1, 0, 2, 3] : vector<14x7x8x16xf32> to vector<7x14x8x16xf32> // CHECK: %[[MASK1:.*]] = vector.broadcast %{{.*}} : i1 to vector<16x14xi1> - %mask1 = vector.splat %m : vector<16x14xi1> + %mask1 = 
vector.broadcast %m : i1 to vector<16x14xi1> %1 = vector.transfer_read %mem_1[%c0, %c0, %c0, %c0], %cst, %mask1 {in_bounds = [true, false, true, false], permutation_map = #map1} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32> // CHECK: vector.transfer_read {{.*}} %[[MASK1]] {in_bounds = [false, false, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<16x14x7x8xf32> // CHECK: vector.transpose %{{.*}}, [2, 1, 3, 0] : vector<16x14x7x8xf32> to vector<7x14x8x16xf32> // CHECK: %[[MASK3:.*]] = vector.broadcast %{{.*}} : i1 to vector<14x7xi1> - %mask2 = vector.splat %m : vector<14x7xi1> + %mask2 = vector.broadcast %m : i1 to vector<14x7xi1> %2 = vector.transfer_read %mem_1[%c0, %c0, %c0, %c0], %cst, %mask2 {in_bounds = [true, false, true, true], permutation_map = #map2} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32> // CHECK: vector.transfer_read {{.*}} %[[MASK3]] {in_bounds = [false, true, true], permutation_map = #[[$MAP1]]} : memref<?x?x?x?xf32>, vector<14x16x7xf32> // CHECK: vector.broadcast %{{.*}} : vector<14x16x7xf32> to vector<8x14x16x7xf32> @@ -337,7 +337,7 @@ func.func @transfer_write_permutations_tensor_masked( %c0 = arith.constant 0 : index // CHECK: %[[MASK:.*]] = vector.broadcast %[[M]] : i1 to vector<16x14x7x8xi1> - %mask0 = vector.splat %m : vector<16x14x7x8xi1> + %mask0 = vector.broadcast %m : i1 to vector<16x14x7x8xi1> %res = vector.transfer_write %vec, %dst[%c0, %c0, %c0, %c0], %mask0 {in_bounds = [true, false, false, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d1, d3, d0)>} : vector<7x14x8x16xf32>, tensor<?x?x?x?xf32> // CHECK: %[[NEW_VEC0:.*]] = vector.transpose %{{.*}} [3, 1, 0, 2] : vector<7x14x8x16xf32> to vector<16x14x7x8xf32> // CHECK: %[[NEW_RES0:.*]] = vector.transfer_write %[[NEW_VEC0]], %[[DST]][%c0, %c0, %c0, %c0], %[[MASK]] {in_bounds = [true, false, true, false]} : vector<16x14x7x8xf32>, tensor<?x?x?x?xf32> diff --git a/mlir/test/Dialect/WasmSSA/custom_parser/if.mlir b/mlir/test/Dialect/WasmSSA/custom_parser/if.mlir new file mode 100644 index 0000000..01068cb --- /dev/null +++ b/mlir/test/Dialect/WasmSSA/custom_parser/if.mlir @@ -0,0 +1,53 @@ +// RUN: mlir-opt %s | FileCheck %s + +// CHECK-LABEL: wasmssa.func nested @func_0( +// CHECK-SAME: %[[ARG0:.*]]: !wasmssa<local ref to i32>) -> i32 { +// CHECK: %[[VAL_0:.*]] = wasmssa.local_get %[[ARG0]] : ref to i32 +// CHECK: wasmssa.if %[[VAL_0]] : { +// CHECK: %[[VAL_1:.*]] = wasmssa.const 5.000000e-01 : f32 +// CHECK: wasmssa.block_return %[[VAL_1]] : f32 +// CHECK: } "else "{ +// CHECK: %[[VAL_2:.*]] = wasmssa.const 2.500000e-01 : f32 +// CHECK: wasmssa.block_return %[[VAL_2]] : f32 +// CHECK: }> ^bb1 +// CHECK: ^bb1(%[[VAL_3:.*]]: f32): +// CHECK: wasmssa.return %[[VAL_3]] : f32 +wasmssa.func nested @func_0(%arg0 : !wasmssa<local ref to i32>) -> i32 { + %cond = wasmssa.local_get %arg0 : ref to i32 + wasmssa.if %cond : { + %c0 = wasmssa.const 0.5 : f32 + wasmssa.block_return %c0 : f32 + } else { + %c1 = wasmssa.const 0.25 : f32 + wasmssa.block_return %c1 : f32 + } >^bb1 + ^bb1(%retVal: f32): + wasmssa.return %retVal : f32 +} + +// CHECK-LABEL: wasmssa.func nested @func_1( +// CHECK-SAME: %[[ARG0:.*]]: !wasmssa<local ref to i32>) -> i32 { +// CHECK: %[[VAL_0:.*]] = wasmssa.local_get %[[ARG0]] : ref to i32 +// CHECK: %[[VAL_1:.*]] = wasmssa.local of type i32 +// CHECK: %[[VAL_2:.*]] = wasmssa.const 0 : i64 +// CHECK: wasmssa.if %[[VAL_0]] : { +// CHECK: %[[VAL_3:.*]] = wasmssa.const 1 : i32 +// CHECK: wasmssa.local_set %[[VAL_1]] : ref to i32 to %[[VAL_3]] : i32 +// CHECK: 
wasmssa.block_return +// CHECK: } > ^bb1 +// CHECK: ^bb1: +// CHECK: %[[VAL_4:.*]] = wasmssa.local_get %[[VAL_1]] : ref to i32 +// CHECK: wasmssa.return %[[VAL_4]] : i32 +wasmssa.func nested @func_1(%arg0 : !wasmssa<local ref to i32>) -> i32 { + %cond = wasmssa.local_get %arg0 : ref to i32 + %var = wasmssa.local of type i32 + %zero = wasmssa.const 0 + wasmssa.if %cond : { + %c1 = wasmssa.const 1 : i32 + wasmssa.local_set %var : ref to i32 to %c1 : i32 + wasmssa.block_return + } >^bb1 + ^bb1: + %res = wasmssa.local_get %var : ref to i32 + wasmssa.return %res : i32 +} diff --git a/mlir/test/Dialect/WasmSSA/custom_parser/memory.mlir b/mlir/test/Dialect/WasmSSA/custom_parser/memory.mlir new file mode 100644 index 0000000..47551db --- /dev/null +++ b/mlir/test/Dialect/WasmSSA/custom_parser/memory.mlir @@ -0,0 +1,7 @@ +// RUN: mlir-opt %s | FileCheck %s + +// CHECK: wasmssa.memory @mem0 public !wasmssa<limit[0: 65536]> +wasmssa.memory @mem0 public !wasmssa<limit[0:65536]> + +// CHECK: wasmssa.memory @mem1 nested !wasmssa<limit[512:]> +wasmssa.memory @mem1 !wasmssa<limit[512:]> diff --git a/mlir/test/Dialect/WasmSSA/custom_parser/table.mlir b/mlir/test/Dialect/WasmSSA/custom_parser/table.mlir new file mode 100644 index 0000000..5a874f4 --- /dev/null +++ b/mlir/test/Dialect/WasmSSA/custom_parser/table.mlir @@ -0,0 +1,7 @@ +// RUN: mlir-opt %s | FileCheck %s + +// CHECK: wasmssa.table @tab0 public !wasmssa<tabletype !wasmssa.externref [0: 65536]> +wasmssa.table @tab0 public !wasmssa<tabletype !wasmssa.externref [0:65536]> + +// CHECK: wasmssa.table @tab1 nested !wasmssa<tabletype !wasmssa.funcref [348:]> +wasmssa.table @tab1 !wasmssa<tabletype !wasmssa.funcref [348:]> diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index dff3ffa..44e15dd 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -52,14 +52,14 @@ func.func @create_nd_tdesc_7(%src: memref<128x128xf32>) { // ----- func.func @create_nd_tdesc_8(%src: ui64) { - // expected-error@+1 {{'xegpu.create_nd_tdesc' op Expecting strides and shape to be present for integer source}} + // expected-error@+1 {{'xegpu.create_nd_tdesc' op expecting strides and shape to be present for integer source}} %1 = xegpu.create_nd_tdesc %src : ui64-> !xegpu.tensor_desc<128x128xf32> return } // ----- func.func @create_nd_tdesc_9(%src: ui64) { - // expected-error@+1 {{expected mixed offsets rank to match mixed sizes rank}} + // expected-error@+1 {{expecting strides and shape to be present for integer source}} %1 = xegpu.create_nd_tdesc %src[0, 0] : ui64-> !xegpu.tensor_desc<128x128xf32> return } @@ -149,7 +149,7 @@ func.func @subgroup_load_nd_offset_2(%src: memref<4x8x16xf16>, %x : index) { } // ----- -func.func @subgroup_load_nd_offset_3(%src: memref<4x8x16xf16>, %x : index) { +func.func @subgroup_load_nd_offset_3(%src: memref<4x8x16xf16>, %x : index) { %3 = xegpu.create_nd_tdesc %src: memref<4x8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %5 = xegpu.load_nd %3[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> // expected-error@+1 {{Mismatched ranks between offsets and tensor descriptor}} @@ -418,7 +418,7 @@ func.func @store_scatter_offset_wi_1(%src: memref<?xf16>) { %offsets = arith.constant dense<[0]> : vector<1xindex> %mask = arith.constant dense<1>: vector<1xi1> // expected-error@+1 {{value elements must match chunk size}} - xegpu.store %val, %src[%offsets], %mask + xegpu.store %val, %src[%offsets], %mask : vector<4xf16>, memref<?xf16>, vector<1xindex>, vector<1xi1> return } @@ 
-429,7 +429,7 @@ func.func @store_scatter_offset_wi_2(%src: memref<4x4xf16>) { %offsets = arith.constant dense<[0]> : vector<1xindex> %mask = arith.constant dense<1>: vector<1xi1> // expected-error@+1 {{Expecting the dest is a 1D memref or pointer}} - xegpu.store %val, %src[%offsets], %mask + xegpu.store %val, %src[%offsets], %mask : vector<4xf16>, memref<4x4xf16>, vector<1xindex>, vector<1xi1> return } @@ -743,3 +743,22 @@ func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) { #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2], order = [0, 1, 2]>> return } + +// ----- +#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]> +// expected-error@+1 {{repeated dim (2) in slice attribute}} +#s = #xegpu.slice<#l, dims = [2, 2]> +func.func @slice_attr_repeat_dim() { + %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex> + return +} + +// ----- +#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]> +// expected-error@+1 {{invalid dim (3) in slice attribute}} +#s = #xegpu.slice<#l, dims = [3]> +func.func @slice_attr_repeat_dim() { + %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex> + return +} + diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir index 017dacc..e4b4e22 100644 --- a/mlir/test/Dialect/XeGPU/layout.mlir +++ b/mlir/test/Dialect/XeGPU/layout.mlir @@ -50,4 +50,27 @@ gpu.func @convert_layout_wg(%a: vector<32x64xf16>) { gpu.return } +gpu.func @slice_attr() { + //CHECK: arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex> + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex> + gpu.return +} + +gpu.func @nested_slice_attr() { + //CHECK: arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>, dims = [1]>} dense<8> : vector<16xindex> + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>, dims = [1]>} dense<8> : vector<16xindex> + gpu.return +} + +gpu.func @softmax_dim_0(%arg0: vector<256x128xf32>) -> vector<256x128xf32> { + %cst = arith.constant dense<0.000000e+00> : vector<128xf32> + %0 = math.exp %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32> + //CHECK: vector.multi_reduction <add>, {{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32> + %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32> + //CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32> + %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32> + %3 = arith.divf %0, %2 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32> + gpu.return %3 : vector<256x128xf32> +} + } diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 6be2371..67c00f5 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ 
b/mlir/test/Dialect/XeGPU/ops.mlir @@ -62,28 +62,28 @@ gpu.func @create_nd_tdesc_7(%src: memref<8x24x32x48x64xf32>) { } -// CHECK: gpu.func @test_create_nd_tdesc_7(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index, %[[arg5:.*]]: memref<24x32xf32>) +// CHECK: gpu.func @test_create_nd_tdesc_7(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index, %[[arg5:.*]]: memref<24x32xf32>) gpu.func @test_create_nd_tdesc_7(%src: ui64, %w : index, %h : index, %x : index, %y : index, %src2: memref<24x32xf32>) { //CHECK: %[[C:.*]] = arith.constant 1 : index %c1 = arith.constant 1 : index - - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg5]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg5]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %3 = xegpu.create_nd_tdesc %src2 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - + gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_8(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) +// CHECK: gpu.func @test_create_nd_tdesc_8(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) gpu.func @test_create_nd_tdesc_8(%src: ui64, %w : index, %h : index, %x : index, %y : index) { - - %c1 = arith.constant 1 : index - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0], shape : [%arg2, %arg1], strides : [%arg1, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> + + %c1 = arith.constant 1 : index + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0, shape : [%arg2, %arg1], strides : [%arg1, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> %2 = xegpu.create_nd_tdesc %src, shape : [%h, %w], strides : [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> - + gpu.return } -// CHECK-LABEL: func @test_create_nd_tdesc_9({{.*}}) +// CHECK-LABEL: func @test_create_nd_tdesc_9({{.*}}) gpu.func @test_create_nd_tdesc_9(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) { @@ -94,10 +94,10 @@ gpu.func @test_create_nd_tdesc_9(%src: memref<?x?xf16>, %w : index, %h : index, gpu.return } -// CHECK-LABEL: func @test_create_nd_tdesc_10({{.*}}) -gpu.func @test_create_nd_tdesc_10(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) { +// CHECK-LABEL: func @test_create_nd_tdesc_10({{.*}}) +gpu.func @test_create_nd_tdesc_10(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) { %c1 = arith.constant 1 : index - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0], shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0, shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16> %2 = xegpu.create_nd_tdesc %src, shape:[%h, %w], strides:[%w, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16> gpu.return @@ -123,7 +123,7 @@ gpu.func @prefetch_nd_2(%src: memref<48x64xf16>) { // CHECK: gpu.func @prefetch_nd_offset_1(%[[arg0:.*]]: memref<48x64xf16>, %arg1: index, %arg2: index) { gpu.func @prefetch_nd_offset_1(%src: memref<48x64xf16>, %x : index, %y : index) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src : memref<48x64xf16> -> 
!xegpu.tensor_desc<8x16xf16> // CHECK: xegpu.prefetch_nd %[[R0]][%arg1, %arg2] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16> xegpu.prefetch_nd %1[%x, %y] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16> @@ -271,7 +271,7 @@ gpu.func @subgroup_load_nd_8(%src: memref<24x32xf32>) { // CHECK: func @subgroup_load_nd_offset_1(%[[arg0:.*]]: memref<24x32xf32>, %arg1: index, %arg2: index) { gpu.func @subgroup_load_nd_offset_1(%src: memref<24x32xf32>, %x : index, %y : index) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][%arg1, %arg2] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> %2 = xegpu.load_nd %1[%x, %y] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> @@ -290,7 +290,7 @@ gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) { // CHECK: func @simt_load_nd_offset_1(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @simt_load_nd_offset_1(%src: memref<24x32xf32>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32> %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32> @@ -323,7 +323,7 @@ gpu.func @simt_store_nd(%src: memref<24x32xf16>) { gpu.func @subgroup_store_nd_2(%dst: memref<24x32xf16>, %x : index) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16> %1 = arith.constant dense<1.0>: vector<32xf16> - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> %2 = xegpu.create_nd_tdesc %dst : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> // CHECK: xegpu.store_nd %[[C]], %[[R0]][%arg1] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<32xf16>, !xegpu.tensor_desc<32xf16> xegpu.store_nd %1, %2[%x] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<32xf16>, !xegpu.tensor_desc<32xf16> @@ -356,7 +356,7 @@ gpu.func @simt_store_nd_2(%src: memref<24x32xf16>) { gpu.func @simt_store_nd_offset_1(%src: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16> %1 = arith.constant dense<1.0>: vector<2xf16> - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> 
!xegpu.tensor_desc<32xf16> %2 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> // CHECK: xegpu.store_nd %[[C]], %[[R0]][0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<32xf16> xegpu.store_nd %1, %2[0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2xf16>, !xegpu.tensor_desc<32xf16> diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir new file mode 100644 index 0000000..547c735 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir @@ -0,0 +1,37 @@ +// RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s + +//CHECk: #map = affine_map<()[s0] -> (s0 floordiv 8)> +gpu.module @test { + gpu.func @slice_attr() -> vector<128xindex> { + //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index + //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] + //CHECK: [[c32:%.+]] = arith.constant 32 : index + //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] + //CHECK: [[c0:%.+]] = arith.constant 0 : index + //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]] + //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> + //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> + %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex> + gpu.return %step : vector<128xindex> + } + + gpu.func @nested_slice_attr() -> vector<128xindex> { + //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index + //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] + //CHECK: [[c32:%.+]] = arith.constant 32 : index + //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] + //CHECK: [[c0:%.+]] = arith.constant 0 : index + //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]] + //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> + //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> + %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 1], sg_data = [32, 32, 1]>, dims = [2]>, dims = [1]>} : vector<128xindex> + gpu.return %0 : vector<128xindex> + } + +}
\ No newline at end of file diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index 628a485..e5cc65e 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -1,5 +1,8 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s +#map = affine_map<()[s0] -> (s0 floordiv 4)> +#map1 = affine_map<()[s0] -> (s0 mod 4)> + gpu.module @test_round_robin_assignment { // CHECK-LABEL: create_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> @@ -12,6 +15,30 @@ gpu.module @test_round_robin_assignment { gpu.return } + // CHECK-LABEL: create_nd_tdesc_with_shared_data + // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32> + gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) { + //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index + //CHECK: [[IdY:%.+]] = affine.apply #map()[[[sgId]]] + //CHECK: [[IdX:%.+]] = affine.apply #map1()[[[sgId]]] + //CHECK: [[C16:%.+]] = arith.constant 16 : index + //CHECK: [[LY:%.+]] = index.mul [[IdY]], [[C16]] + //CHECK: [[C64:%.+]] = arith.constant 64 : index + //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]] + //CHECK: [[C0:%.+]] = arith.constant 0 : index + //CHECK: [[C0_1:%.+]] = arith.constant 0 : index + //CHECK: [[ADDY:%.+]] = arith.addi [[LY]], [[C0]] : index + //CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index + //CHECK: [[C128:%.+]] = arith.constant 128 : index + //CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]] + //CHECK: [[C64_2:%.+]] = arith.constant 64 : index + //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C64_2]] + //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> + %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> + -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>> + gpu.return + } + // CHECK-LABEL: load_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) { diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index d4b0037..180ba8a 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -4,34 +4,26 @@ //CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> gpu.module @test_1_1_assignment { // CHECK-LABEL: create_nd_tdesc - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id - // CHECK: %[[C8:.*]] = arith.constant 8 : index - // CHECK: %[[C32:.*]] = arith.constant 32 : index - // CHECK: %[[C4:.*]] = arith.constant 4 : index - // CHECK: %[[C32_0:.*]] = arith.constant 32 : index - // CHECK: %[[C4_1:.*]] = arith.constant 4 : index - // CHECK: %[[DIV:.*]] = affine.apply #map()[%[[SGID]]] - // CHECK: %[[REM:.*]] = affine.apply #map1()[%[[SGID]]] - // CHECK: %[[MUL1:.*]] = index.mul %[[DIV]], %[[C32]] - // CHECK: %[[MUL2:.*]] = index.mul %[[REM]], %[[C32_0]] - // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[C256:.*]] = arith.constant 256 : index - // CHECK: %[[MOD:.*]] = index.remu %[[MUL1]], %[[C256]] - // CHECK: %[[C0_2:.*]] = arith.constant 0 : index - // CHECK: %[[ADD1:.*]] = index.add %[[MOD]], %[[C0_2]] - // CHECK: %[[C0_3:.*]] = arith.constant 0 : index - // CHECK: %[[C128:.*]] = arith.constant 128 : index - // CHECK: %[[MOD1:.*]] = index.remu %[[MUL2]], 
%[[C128]] - // CHECK: %[[C0_4:.*]] = arith.constant 0 : index - // CHECK: %[[ADD2:.*]] = index.add %[[MOD1]], %[[C0_4]] - // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[ADD1]], %[[ADD2]]] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK: gpu.return - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return + //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index + //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] + //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] + //CHECK: [[C32:%.+]] = arith.constant 32 : index + //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]] + //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] + //CHECK: [[C0:%.+]] = arith.constant 0 : index + //CHECK: [[C0_1:%.+]] = arith.constant 0 : index + //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index + //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index + //CHECK: [[C256:%.+]] = arith.constant 256 : index + //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C256]] + //CHECK: [[C128:%.+]] = arith.constant 128 : index + //CHECK: [[X:%.+]] = index.remu [[UX]], [[C128]] + //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> + gpu.return } // CHECK-LABEL: load_nd_tdesc @@ -347,7 +339,7 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { // CHECK-LABEL: @subgroup_id_range_nested_if gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) { %sg_id = gpu.subgroup_id : index - %c1 = arith.constant 1 : i1 + %c1 = arith.constant 1 : i1 %c3 = arith.constant 3 : index %c32 = arith.constant 32 : index %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> diff --git a/mlir/test/Dialect/common_folders.mlir b/mlir/test/Dialect/common_folders.mlir new file mode 100644 index 0000000..92598b4 --- /dev/null +++ b/mlir/test/Dialect/common_folders.mlir @@ -0,0 +1,22 @@ +// RUN: mlir-opt %s --test-fold-type-converting-op --split-input-file | FileCheck %s + +// CHECK-LABEL: @test_fold_unary_op_f32_to_si32( +func.func @test_fold_unary_op_f32_to_si32() -> tensor<4x2xsi32> { + // CHECK-NEXT: %[[POSITIVE_ONE:.*]] = arith.constant dense<1> : tensor<4x2xsi32> + // CHECK-NEXT: return %[[POSITIVE_ONE]] : tensor<4x2xsi32> + %operand = arith.constant dense<5.1> : tensor<4x2xf32> + %sign = test.sign %operand : (tensor<4x2xf32>) -> tensor<4x2xsi32> + return %sign : tensor<4x2xsi32> +} + +// ----- + +// CHECK-LABEL: @test_fold_binary_op_f32_to_i1( +func.func @test_fold_binary_op_f32_to_i1() -> tensor<8xi1> { + // CHECK-NEXT: %[[FALSE:.*]] = arith.constant dense<false> : tensor<8xi1> + // CHECK-NEXT: return %[[FALSE]] : tensor<8xi1> + %lhs = arith.constant dense<5.1> : tensor<8xf32> + %rhs = arith.constant dense<4.2> : tensor<8xf32> + %less_than = test.less_than %lhs, %rhs : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xi1> + return %less_than : tensor<8xi1> +} diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir 
b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir index 06a6e22..9d04357 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir @@ -9,7 +9,12 @@ // RUN: FileCheck %s func.func @matmul_transpose_a(%A : tensor<?x?xf32>, %B : tensor<?x?xf32>, %C : tensor<?x?xf32>) { - %res = linalg.matmul_transpose_a ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>) + %res = linalg.matmul + indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)>] + ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>) outs(%C: tensor<?x?xf32>) -> tensor<?x?xf32> %xf = tensor.cast %res : tensor<?x?xf32> to tensor<*xf32> call @printMemrefF32(%xf) : (tensor<*xf32>) -> () @@ -56,7 +61,7 @@ func.func @main() { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module : !transform.any_op {transform.readonly}) { - %matmul_transpose_a = transform.structured.match ops{["linalg.matmul_transpose_a"]} in %module + %matmul_transpose_a = transform.structured.match ops{["linalg.matmul"]} in %module : (!transform.any_op) -> !transform.any_op // Step 1: Tile for size [4] x [4], which corresponds to SVLs x SVLs, where diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir index 0ee0166..219367a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir @@ -46,7 +46,7 @@ func.func @test_outerproduct_with_accumulator_4x4xf32() { %c0 = arith.constant 0 : index %f10 = arith.constant 10.0 : f32 - %acc = vector.splat %f10 : vector<[4]x[4]xf32> + %acc = vector.broadcast %f10 : f32 to vector<[4]x[4]xf32> %vector_i32 = llvm.intr.stepvector : vector<[4]xi32> %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32> %tile = vector.outerproduct %vector, %vector, %acc : vector<[4]xf32>, vector<[4]xf32> @@ -103,7 +103,7 @@ func.func @test_masked_outerproduct_with_accumulator_4x4xf32() { %ones = arith.constant dense<1> : vector<[4]xi32> %f10 = arith.constant 10.0 : f32 - %acc = vector.splat %f10 : vector<[4]x[4]xf32> + %acc = vector.broadcast %f10 : f32 to vector<[4]x[4]xf32> %step_vector = llvm.intr.stepvector : vector<[4]xi32> %vector_i32 = arith.addi %step_vector, %ones : vector<[4]xi32> %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32> diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir index 8e81210..059f24a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir @@ -52,7 +52,7 @@ func.func @test_outerproduct_with_accumulator_2x2xf64() { %ones = arith.constant dense<1> : vector<[2]xi32> %f10 = arith.constant 10.0 : f64 - %acc = vector.splat %f10 : vector<[2]x[2]xf64> + %acc = vector.broadcast %f10 : f64 to vector<[2]x[2]xf64> %step_vector = llvm.intr.stepvector : vector<[2]xi32> %vector_i32 = arith.addi %step_vector, %ones : vector<[2]xi32> %vector = arith.sitofp %vector_i32 : vector<[2]xi32> to vector<[2]xf64> @@ -108,7 +108,7 @@ func.func @test_masked_outerproduct_with_accumulator_2x2xf64() { %ones = arith.constant dense<1> : vector<[2]xi32> %f10 = arith.constant 10.0 : f64 - 
%acc = vector.splat %f10 : vector<[2]x[2]xf64> + %acc = vector.broadcast %f10 : f64 to vector<[2]x[2]xf64> %step_vector = llvm.intr.stepvector : vector<[2]xi32> %vector_i32 = arith.addi %step_vector, %ones : vector<[2]xi32> %vector = arith.sitofp %vector_i32 : vector<[2]xi32> to vector<[2]xf64> diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir index c3bf379..bf6900c 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir @@ -10,7 +10,7 @@ // Vector store. func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) { %c0 = arith.constant 0.0 : f32 - %zero = vector.splat %c0 : vector<[4]x[4]xf32> + %zero = vector.broadcast %c0 : f32 to vector<[4]x[4]xf32> vector.transfer_write %zero, %A[%base1, %base2] {in_bounds=[true, true]} : vector<[4]x[4]xf32>, memref<?x?xf32> return @@ -22,7 +22,7 @@ func.func @transfer_write_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: i %c2 = arith.constant 2 : index %c3 = arith.constant 3 : index %mask = vector.create_mask %c2, %c3 : vector<[4]x[4]xi1> - %zero = vector.splat %c0 : vector<[4]x[4]xf32> + %zero = vector.broadcast %c0 : f32 to vector<[4]x[4]xf32> vector.transfer_write %zero, %A[%base1, %base2], %mask {in_bounds=[true, true]} : vector<[4]x[4]xf32>, memref<?x?xf32> return diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir index c990432..192f291 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir @@ -106,7 +106,7 @@ func.func @matvec_i32() { // val = (123 * 314) * 4 * vscale // so ... %vscale = vector.vscale - %vscale_v = vector.splat %vscale : vector<3xindex> + %vscale_v = vector.broadcast %vscale : index to vector<3xindex> %vscale_i32 = arith.index_cast %vscale_v : vector<3xindex> to vector<3xi32> %mv1_div = arith.divui %mv1, %vscale_i32 : vector<3xi32> // ... 
val / vscale = 123 * 314 * 4 = 154488 diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir index d3b1fa4..2d8180a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir @@ -7,8 +7,8 @@ func.func @entry() { %f1 = arith.constant 1.0 : f32 %f2 = arith.constant 2.0 : f32 - %v1 = vector.splat %f1 : vector<[4]xf32> - %v2 = vector.splat %f2 : vector<[4]xf32> + %v1 = vector.broadcast %f1 : f32 to vector<[4]xf32> + %v2 = vector.broadcast %f2 : f32 to vector<[4]xf32> vector.print %v1 : vector<[4]xf32> vector.print %v2 : vector<[4]xf32> // diff --git a/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir index f812c25..740c742 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir @@ -6,8 +6,8 @@ func.func @entry() { %f1 = arith.constant 1.0 : f32 %f2 = arith.constant 2.0 : f32 - %v1 = vector.splat %f1 : vector<2x4xf32> - %v2 = vector.splat %f2 : vector<2x4xf32> + %v1 = vector.broadcast %f1 : f32 to vector<2x4xf32> + %v2 = vector.broadcast %f2 : f32 to vector<2x4xf32> vector.print %v1 : vector<2x4xf32> vector.print %v2 : vector<2x4xf32> // diff --git a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir index f7e2229..e25795a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir @@ -14,9 +14,9 @@ !vector_type_R = vector<7xf32> func.func @vector_outerproduct_splat_8x8(%fa: f32, %fb: f32, %fc: f32) -> !vector_type_C { - %a = vector.splat %fa: !vector_type_A - %b = vector.splat %fb: !vector_type_B - %c = vector.splat %fc: !vector_type_C + %a = vector.broadcast %fa: f32 to !vector_type_A + %b = vector.broadcast %fb: f32 to !vector_type_B + %c = vector.broadcast %fc: f32 to !vector_type_C %d = vector.outerproduct %a, %b, %c : !vector_type_A, !vector_type_B return %d: !vector_type_C } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir index a19dfa1..0675102 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir @@ -14,9 +14,9 @@ !vector_type_R = vector<7xi64> func.func @vector_outerproduct_splat_8x8(%ia: i64, %ib: i64, %ic: i64) -> !vector_type_C { - %a = vector.splat %ia: !vector_type_A - %b = vector.splat %ib: !vector_type_B - %c = vector.splat %ic: !vector_type_C + %a = vector.broadcast %ia: i64 to !vector_type_A + %b = vector.broadcast %ib: i64 to !vector_type_B + %c = vector.broadcast %ic: i64 to !vector_type_C %d = vector.outerproduct %a, %b, %c : !vector_type_A, !vector_type_B return %d: !vector_type_C } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir index 639eed4..895b881 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir @@ -137,7 +137,7 @@ func.func @transfer_read_1d_mask_in_bounds( // Non-contiguous, strided store. 
func.func @transfer_write_1d(%A : memref<?x?xf32>, %base1 : index, %base2 : index) { %fn1 = arith.constant -1.0 : f32 - %vf0 = vector.splat %fn1 : vector<7xf32> + %vf0 = vector.broadcast %fn1 : f32 to vector<7xf32> vector.transfer_write %vf0, %A[%base1, %base2] {permutation_map = affine_map<(d0, d1) -> (d0)>} : vector<7xf32>, memref<?x?xf32> @@ -147,7 +147,7 @@ func.func @transfer_write_1d(%A : memref<?x?xf32>, %base1 : index, %base2 : inde // Non-contiguous, strided store. func.func @transfer_write_1d_mask(%A : memref<?x?xf32>, %base1 : index, %base2 : index) { %fn1 = arith.constant -2.0 : f32 - %vf0 = vector.splat %fn1 : vector<7xf32> + %vf0 = vector.broadcast %fn1 : f32 to vector<7xf32> %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1]> : vector<7xi1> vector.transfer_write %vf0, %A[%base1, %base2], %mask {permutation_map = affine_map<(d0, d1) -> (d0)>} diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir index 009c137..80dff9d 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir @@ -100,7 +100,7 @@ func.func @transfer_read_2d_broadcast( // Vector store. func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) { %fn1 = arith.constant -1.0 : f32 - %vf0 = vector.splat %fn1 : vector<1x4xf32> + %vf0 = vector.broadcast %fn1 : f32 to vector<1x4xf32> vector.transfer_write %vf0, %A[%base1, %base2] {permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : vector<1x4xf32>, memref<?x?xf32> @@ -111,7 +111,7 @@ func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) func.func @transfer_write_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: index) { %fn1 = arith.constant -2.0 : f32 %mask = arith.constant dense<[[1, 0, 1, 0]]> : vector<1x4xi1> - %vf0 = vector.splat %fn1 : vector<1x4xf32> + %vf0 = vector.broadcast %fn1 : f32 to vector<1x4xf32> vector.transfer_write %vf0, %A[%base1, %base2], %mask {permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : vector<1x4xf32>, memref<?x?xf32> diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir index d41d9c9..93e6a12 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir @@ -62,7 +62,7 @@ func.func @transfer_read_3d_transposed(%A : memref<?x?x?x?xf32>, func.func @transfer_write_3d(%A : memref<?x?x?x?xf32>, %o: index, %a: index, %b: index, %c: index) { %fn1 = arith.constant -1.0 : f32 - %vf0 = vector.splat %fn1 : vector<2x9x3xf32> + %vf0 = vector.broadcast %fn1 : f32 to vector<2x9x3xf32> vector.transfer_write %vf0, %A[%o, %a, %b, %c] : vector<2x9x3xf32>, memref<?x?x?x?xf32> return diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir index d1a2790..18084e3 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir @@ -45,7 +45,7 @@ func.func @transfer_read_mask_inbounds_4(%A : memref<?xf32>, %base: index) { func.func @transfer_write_1d(%A : memref<?xf32>, %base: index) { %f0 = arith.constant 0.0 : f32 - %vf0 = vector.splat %f0 : vector<4xf32> + %vf0 = vector.broadcast %f0 : f32 to vector<4xf32> vector.transfer_write %vf0, %A[%base] {permutation_map = affine_map<(d0) -> (d0)>} : vector<4xf32>, 
memref<?xf32> diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir index def7081..2251738 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir @@ -5,7 +5,7 @@ func.func @transfer_write16_inbounds_1d(%A : memref<?xf32>, %base: index) { %f = arith.constant 16.0 : f32 - %v = vector.splat %f : vector<16xf32> + %v = vector.broadcast %f : f32 to vector<16xf32> vector.transfer_write %v, %A[%base] {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]} : vector<16xf32>, memref<?xf32> @@ -14,7 +14,7 @@ func.func @transfer_write16_inbounds_1d(%A : memref<?xf32>, %base: index) { func.func @transfer_write13_1d(%A : memref<?xf32>, %base: index) { %f = arith.constant 13.0 : f32 - %v = vector.splat %f : vector<13xf32> + %v = vector.broadcast %f : f32 to vector<13xf32> vector.transfer_write %v, %A[%base] {permutation_map = affine_map<(d0) -> (d0)>} : vector<13xf32>, memref<?xf32> @@ -23,7 +23,7 @@ func.func @transfer_write13_1d(%A : memref<?xf32>, %base: index) { func.func @transfer_write17_1d(%A : memref<?xf32>, %base: index) { %f = arith.constant 17.0 : f32 - %v = vector.splat %f : vector<17xf32> + %v = vector.broadcast %f : f32 to vector<17xf32> vector.transfer_write %v, %A[%base] {permutation_map = affine_map<(d0) -> (d0)>} : vector<17xf32>, memref<?xf32> @@ -42,7 +42,7 @@ func.func @transfer_read_1d(%A : memref<?xf32>) -> vector<32xf32> { func.func @transfer_write_inbounds_3d(%A : memref<4x4x4xf32>) { %c0 = arith.constant 0: index %f = arith.constant 0.0 : f32 - %v0 = vector.splat %f : vector<2x3x4xf32> + %v0 = vector.broadcast %f : f32 to vector<2x3x4xf32> %f1 = arith.constant 1.0 : f32 %f2 = arith.constant 2.0 : f32 %f3 = arith.constant 3.0 : f32 diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir new file mode 100644 index 0000000..7e66dee --- /dev/null +++ b/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir @@ -0,0 +1,59 @@ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @add attributes {gpu.container_module} { + memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]> + memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]> + func.func @main() { + %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32> + %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32> + %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32> + %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32> + call @printMemrefF32(%cast) : (memref<*xf32>) -> () + return + } + func.func private @printMemrefF32(memref<*xf32>) + func.func @test(%arg0: memref<2x2x2xf32>, %arg1: 
memref<2x2x2xf32>) -> memref<2x2x2xf32> { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %mem = gpu.alloc host_shared () : memref<2x2x2xf32> + memref.copy %arg1, %mem : memref<2x2x2xf32> to memref<2x2x2xf32> + %memref_0 = gpu.alloc host_shared () : memref<2x2x2xf32> + memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32> + %memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32> + %2 = gpu.wait async + %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) + args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>) + gpu.wait [%3] + %alloc = memref.alloc() : memref<2x2x2xf32> + memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32> + %4 = gpu.wait async + %5 = gpu.dealloc async [%4] %memref_2 : memref<2x2x2xf32> + %6 = gpu.dealloc async [%5] %memref_0 : memref<2x2x2xf32> + %7 = gpu.dealloc async [%6] %mem : memref<2x2x2xf32> + gpu.wait [%7] + return %alloc : memref<2x2x2xf32> + } + gpu.module @test_kernel + attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel + attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32> + %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32> + %5 = arith.addf %3, %4 : f32 + memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32> + gpu.return + } + } + // CHECK: [2.3, 4.5] + // CHECK: [7.8, 10.2] + // CHECK: [12.7, 14.9] + // CHECK: [18.2, 20.6] +} diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir new file mode 100644 index 0000000..df8fbe4 --- /dev/null +++ b/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir @@ -0,0 +1,57 @@ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @add attributes {gpu.container_module} { + memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]> + memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]> + func.func @main() { + %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64> + %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64> + %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64> + %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64> + call @printMemrefI64(%cast) : (memref<*xi64>) -> () + return + } + func.func private @printMemrefI64(memref<*xi64>) + func.func @test(%arg0: memref<3x3xi64>, %arg1: 
memref<3x3xi64>) -> memref<3x3xi64> { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %mem = gpu.alloc host_shared () : memref<3x3xi64> + memref.copy %arg1, %mem : memref<3x3xi64> to memref<3x3xi64> + %memref_0 = gpu.alloc host_shared () : memref<3x3xi64> + memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64> + %memref_2 = gpu.alloc host_shared () : memref<3x3xi64> + %2 = gpu.wait async + %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) + args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>) + gpu.wait [%3] + %alloc = memref.alloc() : memref<3x3xi64> + memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64> + %4 = gpu.wait async + %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64> + %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64> + %7 = gpu.dealloc async [%6] %mem : memref<3x3xi64> + gpu.wait [%7] + return %alloc : memref<3x3xi64> + } + gpu.module @test_kernel + attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel + attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = memref.load %arg0[%0, %1] : memref<3x3xi64> + %3 = memref.load %arg1[%0, %1] : memref<3x3xi64> + %4 = arith.addi %2, %3 : i64 + memref.store %4, %arg2[%0, %1] : memref<3x3xi64> + gpu.return + } + } + // CHECK: [2, 4100, 6], + // CHECK: [16777224, 10, 4294971404], + // CHECK: [16777230, 1103806595088, 1099511627794] +} diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir new file mode 100644 index 0000000..cd99f2c --- /dev/null +++ b/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir @@ -0,0 +1,56 @@ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(gpu-async-region),spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @add attributes {gpu.container_module} { + memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]> + memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]> + func.func @main() { + %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32> + %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32> + %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32> + %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32> + call @printMemrefF32(%cast) : (memref<*xf32>) -> () + memref.dealloc %2 : memref<2x2x2xf32> + return + } + func.func private @printMemrefF32(memref<*xf32>) + func.func @test(%arg0: 
memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %memref = gpu.alloc () : memref<2x2x2xf32> + gpu.memcpy %memref, %arg0 : memref<2x2x2xf32>, memref<2x2x2xf32> + %memref_0 = gpu.alloc () : memref<2x2x2xf32> + gpu.memcpy %memref_0, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32> + %memref_1 = gpu.alloc () : memref<2x2x2xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) + args(%memref : memref<2x2x2xf32>, %memref_0 : memref<2x2x2xf32>, %memref_1 : memref<2x2x2xf32>) + %alloc = memref.alloc() : memref<2x2x2xf32> + gpu.memcpy %alloc, %memref_1 : memref<2x2x2xf32>, memref<2x2x2xf32> + gpu.dealloc %memref_1 : memref<2x2x2xf32> + gpu.dealloc %memref_0 : memref<2x2x2xf32> + gpu.dealloc %memref : memref<2x2x2xf32> + return %alloc : memref<2x2x2xf32> + } + gpu.module @test_kernel + attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel + attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32> + %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32> + %5 = arith.addf %3, %4 : f32 + memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32> + gpu.return + } + } + // CHECK: [2.3, 4.5] + // CHECK: [7.8, 10.2] + // CHECK: [12.7, 14.9] + // CHECK: [18.2, 20.6] +} diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir new file mode 100644 index 0000000..8d022ac --- /dev/null +++ b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir @@ -0,0 +1,86 @@ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @relu attributes {gpu.container_module} { + memref.global "private" constant @__constant_4x5xf32 : memref<4x5xf32> = dense<[ + [-1.000000e-01, -2.000000e-01, -3.000000e-01, 4.000000e-01, 5.000000e-01], + [1.000000e-01, -2.000000e-01, 3.000000e-01, -4.000000e-01, 5.000000e-01], + [1.000000e-01, 2.000000e-01, 3.000000e-01, -4.000000e-01, -5.000000e-01], + [1.000000e-01, 2.000000e-01, 3.000000e-01, 4.000000e-01, 5.000000e-01] + ]> + + func.func @main() { + %c1 = arith.constant 1 : index + %c100 = arith.constant 100 : index + %c0 = arith.constant 0 : index + %0 = memref.get_global @__constant_4x5xf32 : memref<4x5xf32> + + scf.for %arg0 = %c0 to %c100 step %c1 { + %1 = func.call @test(%0) : (memref<4x5xf32>) -> memref<4x5xf32> + %cast = memref.cast %1 : memref<4x5xf32> to memref<*xf32> + func.call @printMemrefF32(%cast) : (memref<*xf32>) -> () + // CHECK: [0, 0, 0, 0.4, 0.5], + // CHECK: [0.1, 0, 0.3, 0, 0.5], + // CHECK: [0.1, 0.2, 0.3, 0, 
0], + // CHECK: [0.1, 0.2, 0.3, 0.4, 0.5] + } + return + } + + func.func private @printMemrefF32(memref<*xf32>) + func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> { + %c5 = arith.constant 5 : index + %c4 = arith.constant 4 : index + %cst = arith.constant 0.000000e+00 : f32 + %c1 = arith.constant 1 : index + %memref = gpu.alloc host_shared () : memref<4x5xf32> + memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32> + %memref_0 = gpu.alloc host_shared () : memref<4x5xi1> + %2 = gpu.wait async + %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) + args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>) + gpu.wait [%3] + %memref_1 = gpu.alloc host_shared () : memref<4x5xf32> + %4 = gpu.wait async + %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) + args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, + %memref_1 : memref<4x5xf32>) + gpu.wait [%5] + %alloc = memref.alloc() : memref<4x5xf32> + memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32> + %6 = gpu.wait async + %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32> + %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1> + %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32> + return %alloc : memref<4x5xf32> + } + gpu.module @test_kernel + attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel + attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = memref.load %arg0[%0, %1] : memref<4x5xf32> + %3 = arith.cmpf olt, %2, %arg1 : f32 + memref.store %3, %arg2[%0, %1] : memref<4x5xi1> + gpu.return + } + } + gpu.module @test_kernel_0 + attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel + attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = memref.load %arg0[%0, %1] : memref<4x5xi1> + %3 = memref.load %arg1[%0, %1] : memref<4x5xf32> + %4 = arith.select %2, %arg2, %3 : f32 + memref.store %4, %arg3[%0, %1] : memref<4x5xf32> + gpu.return + } + } +} diff --git a/mlir/test/Integration/GPU/LevelZero/lit.local.cfg b/mlir/test/Integration/GPU/LevelZero/lit.local.cfg new file mode 100644 index 0000000..36c7ad5 --- /dev/null +++ b/mlir/test/Integration/GPU/LevelZero/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.enable_levelzero_runner: + config.unsupported = True diff --git a/mlir/test/Pass/pipeline-options-parsing.mlir b/mlir/test/Pass/pipeline-options-parsing.mlir index 9385d35..03ac38e 100644 --- a/mlir/test/Pass/pipeline-options-parsing.mlir +++ b/mlir/test/Pass/pipeline-options-parsing.mlir @@ -13,6 +13,7 @@ // RUN: mlir-opt %s -verify-each=false -pass-pipeline='builtin.module(builtin.module(func.func(test-options-pass{list=3}), func.func(test-options-pass{enum=one list=1,2,3,4 string=foo"bar"baz})))' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_6 %s // RUN: mlir-opt %s 
-verify-each=false '-test-options-super-pass-pipeline=super-list={{enum=zero list=1 string=foo},{enum=one list=2 string="bar"},{enum=two list=3 string={baz}}}' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_7 %s // RUN: mlir-opt %s -verify-each=false -pass-pipeline='builtin.module(func.func(test-options-super-pass{list={{enum=zero list={1} string=foo },{enum=one list={2} string=bar },{enum=two list={3} string=baz }}}))' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_7 %s +// RUN: mlir-opt %s -verify-each=false -test-options-super-set-ab-pipeline='foo=true bar=false' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_11 %s // This test checks that lists-of-nested-options like 'option1={...},{....}' can be parsed @@ -106,3 +107,12 @@ // CHECK_10-NEXT: test-options-pass{enum=zero string= string-list={,}} // CHECK_10-NEXT: ) // CHECK_10-NEXT: ) + +// CHECK_11: builtin.module( +// CHECK_11-NEXT: func.func( +// CHECK_11-NEXT: test-options-pass-a +// CHECK_11-NEXT: ) +// CHECK_11-NEXT: func.func( +// CHECK_11-NEXT: test-options-pass-b +// CHECK_11-NEXT: ) +// CHECK_11-NEXT: ) diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic-prefer-unregistered.ll b/mlir/test/Target/LLVMIR/Import/intrinsic-prefer-unregistered.ll index 797a75c..18c9319 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic-prefer-unregistered.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic-prefer-unregistered.ll @@ -3,9 +3,9 @@ ; CHECK-LABEL: llvm.func @lifetime define void @lifetime() { %a = alloca [16 x i8] - ; CHECK: llvm.call_intrinsic "llvm.lifetime.start.p0"({{.*}}, %[[ptr:.*]]) : (i64, !llvm.ptr {llvm.nonnull}) -> () - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %a) - ; CHECK: llvm.call_intrinsic "llvm.lifetime.end.p0"({{.*}}, %[[ptr]]) : (i64, !llvm.ptr {llvm.nonnull}) -> () - call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %a) + ; CHECK: llvm.call_intrinsic "llvm.lifetime.start.p0"(%[[ptr:.*]]) : (!llvm.ptr {llvm.nonnull}) -> () + call void @llvm.lifetime.start.p0(ptr nonnull %a) + ; CHECK: llvm.call_intrinsic "llvm.lifetime.end.p0"(%[[ptr]]) : (!llvm.ptr {llvm.nonnull}) -> () + call void @llvm.lifetime.end.p0(ptr nonnull %a) ret void } diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index a419d75..9f882ad 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -876,10 +876,10 @@ define void @stack_restore(ptr %0, ptr addrspace(1) %1) { ; CHECK-LABEL: llvm.func @lifetime define void @lifetime() { %a = alloca [16 x i8] - ; CHECK: llvm.intr.lifetime.start 16, %{{.*}} : !llvm.ptr - call void @llvm.lifetime.start.p0(i64 16, ptr %a) - ; CHECK: llvm.intr.lifetime.end 32, %{{.*}} : !llvm.ptr - call void @llvm.lifetime.end.p0(i64 32, ptr %a) + ; CHECK: llvm.intr.lifetime.start %{{.*}} : !llvm.ptr + call void @llvm.lifetime.start.p0(ptr %a) + ; CHECK: llvm.intr.lifetime.end %{{.*}} : !llvm.ptr + call void @llvm.lifetime.end.p0(ptr %a) ret void } @@ -1353,8 +1353,8 @@ declare <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double>, <8 x i1>, i32) declare <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double>, <8 x i1>, i32) declare <8 x i64> @llvm.vp.ptrtoint.v8i64.v8p0(<8 x ptr>, <8 x i1>, i32) declare <8 x ptr> @llvm.vp.inttoptr.v8p0.v8i64(<8 x i64>, <8 x i1>, i32) -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) +declare void @llvm.lifetime.start.p0(ptr nocapture) +declare void @llvm.lifetime.end.p0(ptr 
nocapture) declare ptr @llvm.invariant.start.p0(i64 immarg, ptr nocapture) declare void @llvm.invariant.end.p0(ptr, i64 immarg, ptr nocapture) declare ptr @llvm.launder.invariant.group.p0(ptr nocapture) diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index eb3510c..2b420ed 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -1104,9 +1104,9 @@ llvm.func @lifetime() { %c = llvm.mlir.constant(16 : i64) : i64 %a = llvm.alloca %c x i8 : (i64) -> !llvm.ptr // CHECK: call void @llvm.lifetime.start - llvm.intr.lifetime.start 16, %a : !llvm.ptr + llvm.intr.lifetime.start %a : !llvm.ptr // CHECK: call void @llvm.lifetime.end - llvm.intr.lifetime.end 16, %a : !llvm.ptr + llvm.intr.lifetime.end %a : !llvm.ptr llvm.return } @@ -1418,8 +1418,8 @@ llvm.func @experimental_constrained_fpext(%s: f32, %v: vector<4xf32>) { // CHECK-DAG: declare <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32>, i64 immarg) // CHECK-DAG: declare { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double>) // CHECK-DAG: declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>) -// CHECK-DAG: declare void @llvm.lifetime.start.p0(i64 immarg, ptr captures(none)) -// CHECK-DAG: declare void @llvm.lifetime.end.p0(i64 immarg, ptr captures(none)) +// CHECK-DAG: declare void @llvm.lifetime.start.p0(ptr captures(none)) +// CHECK-DAG: declare void @llvm.lifetime.end.p0(ptr captures(none)) // CHECK-DAG: declare ptr @llvm.invariant.start.p0(i64 immarg, ptr captures(none)) // CHECK-DAG: declare void @llvm.invariant.end.p0(ptr, i64 immarg, ptr captures(none)) diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir index 85478cc..991222c 100644 --- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir @@ -1,5 +1,24 @@ // RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s +llvm.func @pmevent_no_id() { + // expected-error @below {{either `id` or `mask` must be set}} + nvvm.pmevent +} + +// ----- + +llvm.func @pmevent_bigger15() { + // expected-error @below {{`id` must be between 0 and 15}} + nvvm.pmevent id = 141 +} + +// ----- + +llvm.func @pmevent_many_ids() { + // expected-error @below {{`id` and `mask` cannot be set at the same time}} + nvvm.pmevent id = 1 mask = 1 +} + // ----- llvm.func @kernel_func(%numberOfThreads : i32) { diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 5c2cfa4..b1800e8 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -918,3 +918,14 @@ llvm.func @nvvm_dot_accumulate_2way(%a: vector<2xi16>, %b: vector<4xi8>, %c: i32 %7 = nvvm.dot.accumulate.2way %a <signed>, %b <signed>, %c {b_hi = true}: vector<2xi16>, vector<4xi8> llvm.return } + +// ----- + +// CHECK-LABEL: @nvvm_pmevent +llvm.func @nvvm_pmevent() { + // CHECK: call void @llvm.nvvm.pm.event.mask(i16 15000) + nvvm.pmevent mask = 15000 + // CHECK: call void @llvm.nvvm.pm.event.mask(i16 4) + nvvm.pmevent mask = 4 + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir b/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir new file mode 100644 index 0000000..3553907 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir @@ -0,0 +1,44 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | 
FileCheck %s + +// CHECK: atomicrmw add ptr %loadgep_, i32 1 monotonic, align 4, !amdgpu.no.remote.memory !{{.*}} + +module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<1> = dense<64> : vector<4xi64>, !llvm.ptr<2> = dense<32> : vector<4xi64>, !llvm.ptr<3> = dense<32> : vector<4xi64>, !llvm.ptr<4> = dense<64> : vector<4xi64>, !llvm.ptr<5> = dense<32> : vector<4xi64>, !llvm.ptr<6> = dense<32> : vector<4xi64>, !llvm.ptr<7> = dense<[160, 256, 256, 32]> : vector<4xi64>, !llvm.ptr<8> = dense<[128, 128, 128, 48]> : vector<4xi64>, !llvm.ptr<9> = dense<[192, 256, 256, 32]> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.legal_int_widths" = array<i32: 32, 64>, "dlti.stack_alignment" = 32 : i64, "dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, fir.atomic_fine_grained_memory, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "generic-hsa", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<openmp_device_version = 31>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target_triples = [], omp.version = #omp.version<version = 31>} { + llvm.func @_QQmain() attributes {fir.bindc_name = "TEST", omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "generic-hsa"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "threads"} : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + %3 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %3 x i32 {bindc_name = "capture"} : (i64) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr<5> + %8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr + %9 = llvm.mlir.constant(0 : i32) : i32 + %10 = llvm.mlir.constant(128 : i32) : i32 + %11 = llvm.mlir.constant(1 : i64) : i64 + %12 = llvm.mlir.constant(1 : i64) : i64 + %13 = llvm.mlir.constant(1 : i64) : i64 + llvm.store %10, %2 : i32, !llvm.ptr + llvm.store %9, %8 : i32, !llvm.ptr + %14 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "threads"} + %15 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "capture"} + %16 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "a"} + omp.target map_entries(%14 -> %arg0, %15 -> %arg1, %16 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %17 = llvm.mlir.constant(1 : i32) : i32 + %18 = llvm.load %arg0 : !llvm.ptr -> i32 + omp.parallel num_threads(%18 : i32) { + omp.atomic.capture { + omp.atomic.read %arg1 = %arg2 : !llvm.ptr, !llvm.ptr, i32 + omp.atomic.update %arg2 : !llvm.ptr { + ^bb0(%arg3: i32): + %19 = llvm.add %arg3, %17 : i32 + 
omp.yield(%19 : i32) + } {atomic_control = #omp.atomic_control<fine_grained_memory = true>} + } + omp.terminator + } + omp.terminator + } + llvm.return + } +} diff --git a/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir b/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir new file mode 100644 index 0000000..3b0005b --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir @@ -0,0 +1,36 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// CHECK: atomicrmw add ptr %loadgep_, i32 1 monotonic, align 4, !amdgpu.ignore.denormal.mode !{{.*}}, !amdgpu.no.fine.grained.memory !{{.*}}, !amdgpu.no.remote.memory !{{.*}} + +module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<1> = dense<64> : vector<4xi64>, !llvm.ptr<2> = dense<32> : vector<4xi64>, !llvm.ptr<3> = dense<32> : vector<4xi64>, !llvm.ptr<4> = dense<64> : vector<4xi64>, !llvm.ptr<5> = dense<32> : vector<4xi64>, !llvm.ptr<6> = dense<32> : vector<4xi64>, !llvm.ptr<7> = dense<[160, 256, 256, 32]> : vector<4xi64>, !llvm.ptr<8> = dense<[128, 128, 128, 48]> : vector<4xi64>, !llvm.ptr<9> = dense<[192, 256, 256, 32]> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.legal_int_widths" = array<i32: 32, 64>, "dlti.stack_alignment" = 32 : i64, "dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, fir.atomic_ignore_denormal_mode, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "generic-hsa", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<openmp_device_version = 31>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target_triples = [], omp.version = #omp.version<version = 31>} { + llvm.func @_QQmain() attributes {fir.bindc_name = "TEST", omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "generic-hsa"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "threads"} : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + %3 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %3 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %6 = llvm.mlir.constant(0 : i32) : i32 + %7 = llvm.mlir.constant(128 : i32) : i32 + %8 = llvm.mlir.constant(1 : i64) : i64 + %9 = llvm.mlir.constant(1 : i64) : i64 + llvm.store %7, %2 : i32, !llvm.ptr + llvm.store %6, %5 : i32, !llvm.ptr + %10 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "threads"} + %11 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "a"} + omp.target map_entries(%10 -> %arg0, %11 -> %arg1 : !llvm.ptr, !llvm.ptr) { + %12 = llvm.mlir.constant(1 : i32) : i32 + %13 = llvm.load %arg0 : !llvm.ptr -> i32 + omp.parallel num_threads(%13 : i32) { 
+ omp.atomic.update %arg1 : !llvm.ptr { + ^bb0(%arg2: i32): + %14 = llvm.add %arg2, %12 : i32 + omp.yield(%14 : i32) + } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>} + omp.terminator + } + omp.terminator + } + llvm.return + } +} diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 740990a..ce43941 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -125,6 +125,23 @@ llvm.func @rocdl.ballot64(%pred : i1) -> i64 { llvm.return %0 : i64 } +llvm.func @rocdl.readfirstlane(%src0 : f32, %src1: f64, %src2: i32, %src3: vector<2 x f32>) -> f32 { + // CHECK-LABEL: rocdl.readfirstlane + // CHECK: call float @llvm.amdgcn.readfirstlane.f32(float %{{.*}}) + %0 = rocdl.readfirstlane %src0 : f32 + + // CHECK: call double @llvm.amdgcn.readfirstlane.f64(double %{{.*}}) + %1 = rocdl.readfirstlane %src1 : f64 + + // CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %{{.*}}) + %2 = rocdl.readfirstlane %src2 : i32 + + // CHECK: call <2 x float> @llvm.amdgcn.readfirstlane.v2f32(<2 x float> %{{.*}}) + %3 = rocdl.readfirstlane %src3 : vector<2 x f32> + + llvm.return %0 : f32 +} + llvm.func @rocdl.readlane(%src0 : f32, %src1: f64, %src2: i32, %src3: vector<2 x f32>) -> f32 { %idx = llvm.mlir.constant(0 : i32) : i32 diff --git a/mlir/test/Target/SPIRV/arm-tensor-constant.mlir b/mlir/test/Target/SPIRV/arm-tensor-constant.mlir index 275e586..7fb8af1 100644 --- a/mlir/test/Target/SPIRV/arm-tensor-constant.mlir +++ b/mlir/test/Target/SPIRV/arm-tensor-constant.mlir @@ -1,17 +1,36 @@ // RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s -// DISABLED: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv %s | spirv-val %} - -// FIXME(#152012): Fix arm tensor constant validation errors and reenable spirv-val tests. 
+// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv %s | spirv-val %} spirv.module Logical Vulkan requires #spirv.vce<v1.3, [VulkanMemoryModel, Shader, TensorsARM, Linkage], [SPV_KHR_vulkan_memory_model, SPV_ARM_tensors]> { - // CHECK-LABEL: @arm_tensor_of_i32 - spirv.func @arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3xi32>) "None" { + // CHECK-LABEL: @rank_1_arm_tensor_of_i32 + spirv.func @rank_1_arm_tensor_of_i32() -> (!spirv.arm.tensor<3xi32>) "None" { + // CHECK: {{%.*}} = spirv.Constant dense<[1, 2, 3]> : !spirv.arm.tensor<3xi32> + %0 = spirv.Constant dense<[1, 2, 3]> : !spirv.arm.tensor<3xi32> + spirv.ReturnValue %0 : !spirv.arm.tensor<3xi32> + } + + // CHECK-LABEL: @rank_2_arm_tensor_of_i32 + spirv.func @rank_2_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3xi32>) "None" { // CHECK: {{%.*}} = spirv.Constant dense<{{\[}}[1, 2, 3], [4, 5, 6]]> : !spirv.arm.tensor<2x3xi32> %0 = spirv.Constant dense<[[1, 2, 3], [4, 5, 6]]> : !spirv.arm.tensor<2x3xi32> spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xi32> } + // CHECK-LABEL: @rank_3_arm_tensor_of_i32 + spirv.func @rank_3_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x2x3xi32>) "None" { + // CHECK: {{%.*}} = spirv.Constant dense<{{\[}}{{\[}}[1, 2, 3], [4, 5, 6]], {{\[}}[7, 8, 9], [10, 11, 12]]]> : !spirv.arm.tensor<2x2x3xi32> + %0 = spirv.Constant dense<[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]> : !spirv.arm.tensor<2x2x3xi32> + spirv.ReturnValue %0 : !spirv.arm.tensor<2x2x3xi32> + } + + // CHECK-LABEL: @rank_4_arm_tensor_of_i32 + spirv.func @rank_4_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3x4x5xi32>) "None" { + // CHECK: {{%.*}} = spirv.Constant dense<5> : !spirv.arm.tensor<2x3x4x5xi32> + %0 = spirv.Constant dense<5> : !spirv.arm.tensor<2x3x4x5xi32> + spirv.ReturnValue %0 : !spirv.arm.tensor<2x3x4x5xi32> + } + // CHECK-LABEL: @splat_arm_tensor_of_i32 spirv.func @splat_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3xi32>) "None" { // CHECK: {{%.*}} = spirv.Constant dense<2> : !spirv.arm.tensor<2x3xi32> @@ -19,13 +38,34 @@ spirv.module Logical Vulkan requires #spirv.vce<v1.3, spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xi32> } - // CHECK-LABEL: @arm_tensor_of_f32 - spirv.func @arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3xf32>) "None" { + // CHECK-LABEL: @rank_1_arm_tensor_of_f32 + spirv.func @rank_1_arm_tensor_of_f32() -> (!spirv.arm.tensor<3xf32>) "None" { + // CHECK: {{%.*}} = spirv.Constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> : !spirv.arm.tensor<3xf32> + %0 = spirv.Constant dense<[1.0, 2.0, 3.0]> : !spirv.arm.tensor<3xf32> + spirv.ReturnValue %0 : !spirv.arm.tensor<3xf32> + } + + // CHECK-LABEL: @rank_2_arm_tensor_of_f32 + spirv.func @rank_2_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3xf32>) "None" { // CHECK: {{%.*}} = spirv.Constant dense<{{\[}}[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : !spirv.arm.tensor<2x3xf32> - %0 = spirv.Constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]>: !spirv.arm.tensor<2x3xf32> + %0 = spirv.Constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : !spirv.arm.tensor<2x3xf32> spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xf32> } + // CHECK-LABEL: @rank_3_arm_tensor_of_f32 + spirv.func @rank_3_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x2x3xf32>) "None" { + // CHECK: {{%.*}} = spirv.Constant dense<{{\[}}{{\[}}[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]], {{\[}}[7.000000e+00, 8.000000e+00, 9.000000e+00], [1.000000e+01, 1.100000e+01, 1.200000e+01]]]> : !spirv.arm.tensor<2x2x3xf32> + %0 
= spirv.Constant dense<[[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]]> : !spirv.arm.tensor<2x2x3xf32> + spirv.ReturnValue %0 : !spirv.arm.tensor<2x2x3xf32> + } + + // CHECK-LABEL: @rank_4_arm_tensor_of_f32 + spirv.func @rank_4_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3x4x5xf32>) "None" { + // CHECK: {{%.*}} = spirv.Constant dense<5.000000e+00> : !spirv.arm.tensor<2x3x4x5xf32> + %0 = spirv.Constant dense<5.0> : !spirv.arm.tensor<2x3x4x5xf32> + spirv.ReturnValue %0 : !spirv.arm.tensor<2x3x4x5xf32> + } + // CHECK-LABEL: @splat_arm_tensor_of_f32 spirv.func @splat_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3xf32>) "None" { // CHECK: {{%.*}} = spirv.Constant dense<2.000000e+00> : !spirv.arm.tensor<2x3xf32> diff --git a/mlir/test/Target/SPIRV/decorations.mlir b/mlir/test/Target/SPIRV/decorations.mlir index ee7ad81..90ba690e 100644 --- a/mlir/test/Target/SPIRV/decorations.mlir +++ b/mlir/test/Target/SPIRV/decorations.mlir @@ -58,6 +58,20 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { // ----- +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Tessellation, Linkage], []> { + // CHECK: patch + spirv.GlobalVariable @var {patch} : !spirv.ptr<vector<4xf32>, Input> +} + +// ----- + +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> { + // CHECK: invariant + spirv.GlobalVariable @var {invariant} : !spirv.ptr<vector<2xf32>, Output> +} + +// ----- + spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> { // CHECK: linkage_attributes = #spirv.linkage_attributes<linkage_name = "outSideGlobalVar1", linkage_type = <Import>> spirv.GlobalVariable @var1 { diff --git a/mlir/test/Transforms/test-legalizer-fold-after.mlir b/mlir/test/Transforms/test-legalizer-fold-after.mlir new file mode 100644 index 0000000..7f80252 --- /dev/null +++ b/mlir/test/Transforms/test-legalizer-fold-after.mlir @@ -0,0 +1,9 @@ +// RUN: mlir-opt %s -test-legalize-patterns="test-legalize-folding-mode=after-patterns" | FileCheck %s + +// CHECK-LABEL: @fold_legalization +func.func @fold_legalization() -> i32 { + // CHECK-NOT: op_in_place_self_fold + // CHECK: 97 + %1 = "test.op_in_place_self_fold"() : () -> (i32) + "test.return"(%1) : (i32) -> () +} diff --git a/mlir/test/Transforms/test-legalizer-fold-before.mlir b/mlir/test/Transforms/test-legalizer-fold-before.mlir new file mode 100644 index 0000000..fe6e293 --- /dev/null +++ b/mlir/test/Transforms/test-legalizer-fold-before.mlir @@ -0,0 +1,9 @@ +// RUN: mlir-opt %s -test-legalize-patterns="test-legalize-folding-mode=before-patterns" | FileCheck %s + +// CHECK-LABEL: @fold_legalization +func.func @fold_legalization() -> i32 { + // CHECK: op_in_place_self_fold + // CHECK-SAME: folded + %1 = "test.op_in_place_self_fold"() : () -> (i32) + "test.return"(%1) : (i32) -> () +} diff --git a/mlir/test/Transforms/test-legalizer-no-fold.mlir b/mlir/test/Transforms/test-legalizer-no-fold.mlir new file mode 100644 index 0000000..720d17f --- /dev/null +++ b/mlir/test/Transforms/test-legalizer-no-fold.mlir @@ -0,0 +1,12 @@ +// RUN: mlir-opt %s -allow-unregistered-dialect -test-legalize-patterns="test-legalize-folding-mode=never" | FileCheck %s + +// CHECK-LABEL: @remove_foldable_op( +func.func @remove_foldable_op(%arg0 : i32) -> (i32) { + // Check that op was not folded. 
+ // CHECK: "test.op_with_region_fold" + %0 = "test.op_with_region_fold"(%arg0) ({ + "foo.op_with_region_terminator"() : () -> () + }) : (i32) -> (i32) + "test.return"(%0) : (i32) -> () +} + diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index e4406e6..5630d15 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -415,3 +415,20 @@ func.func @test_multiple_1_to_n_replacement() { %0 = "test.multiple_1_to_n_replacement"() : () -> (f16) "test.invalid"(%0) : (f16) -> () } + +// ----- + +// CHECK-LABEL: func @test_lookup_without_converter +// CHECK: %[[producer:.*]] = "test.valid_producer"() : () -> i16 +// CHECK: %[[cast:.*]] = "test.cast"(%[[producer]]) : (i16) -> f64 +// CHECK: "test.valid_consumer"(%[[cast]]) : (f64) -> () +// CHECK: "test.valid_consumer"(%[[producer]]) : (i16) -> () +func.func @test_lookup_without_converter() { + %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64) + "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> () + // Make sure that the second "replace_with_valid_consumer" lowering does not + // lookup the materialization that was created for the above op. + "test.replace_with_valid_consumer"(%0) : (i64) -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 2eaad55..231400e 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -1169,6 +1169,26 @@ def OpP : TEST_Op<"op_p"> { let results = (outs I32); } +// Test constant-folding a pattern that maps `(F32) -> SI32`. +def SignOp : TEST_Op<"sign", [SameOperandsAndResultShape]> { + let arguments = (ins RankedTensorOf<[F32]>:$operand); + let results = (outs RankedTensorOf<[SI32]>:$result); + + let assemblyFormat = [{ + $operand attr-dict `:` functional-type(operands, results) + }]; +} + +// Test constant-folding a pattern that maps `(F32, F32) -> I1`. +def LessThanOp : TEST_Op<"less_than", [SameOperandsAndResultShape]> { + let arguments = (ins RankedTensorOf<[F32]>:$lhs, RankedTensorOf<[F32]>:$rhs); + let results = (outs RankedTensorOf<[I1]>:$result); + + let assemblyFormat = [{ + $lhs `,` $rhs attr-dict `:` functional-type(operands, results) + }]; +} + // Test same operand name enforces equality condition check. def TestEqualArgsPattern : Pat<(OpN $a, $a), (OpO $a)>; @@ -1478,6 +1498,8 @@ def TestOpInPlaceSelfFold : TEST_Op<"op_in_place_self_fold"> { let results = (outs I32); let hasFolder = 1; } +def : Pat<(TestOpInPlaceSelfFold:$op $_), + (TestOpConstant ConstantAttr<I32Attr, "97">)>; // Test op that simply returns success. 
def TestOpInPlaceFoldSuccess : TEST_Op<"op_in_place_fold_success"> { @@ -2104,6 +2126,10 @@ def TestInvalidOp : TEST_Op<"invalid", [Terminator]>, Arguments<(ins Variadic<AnyType>)>; def TestTypeProducerOp : TEST_Op<"type_producer">, Results<(outs AnyType)>; +def TestValidProducerOp : TEST_Op<"valid_producer">, + Results<(outs AnyType)>; +def TestValidConsumerOp : TEST_Op<"valid_consumer">, + Arguments<(ins AnyType)>; def TestAnotherTypeProducerOp : TEST_Op<"another_type_producer">, Results<(outs AnyType)>; def TestTypeConsumerOp : TEST_Op<"type_consumer">, diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index eda618f..ff958d9 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -10,6 +10,7 @@ #include "TestOps.h" #include "TestTypes.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/CommonFolders.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -202,6 +203,66 @@ struct HoistEligibleOps : public OpRewritePattern<test::OneRegionOp> { } }; +struct FoldSignOpF32ToSI32 : public OpRewritePattern<test::SignOp> { + using OpRewritePattern<test::SignOp>::OpRewritePattern; + + LogicalResult matchAndRewrite(test::SignOp op, + PatternRewriter &rewriter) const override { + if (op->getNumOperands() != 1 || op->getNumResults() != 1) + return failure(); + + TypedAttr operandAttr; + matchPattern(op->getOperand(0), m_Constant(&operandAttr)); + if (!operandAttr) + return failure(); + + TypedAttr res = cast_or_null<TypedAttr>( + constFoldUnaryOp<FloatAttr, FloatAttr::ValueType, void, IntegerAttr>( + operandAttr, op.getType(), [](APFloat operand) -> APSInt { + static const APFloat zero(0.0f); + int operandSign = 0; + if (operand != zero) + operandSign = (operand < zero) ? -1 : +1; + return APSInt(APInt(32, operandSign), false); + })); + if (!res) + return failure(); + + rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, res); + return success(); + } +}; + +struct FoldLessThanOpF32ToI1 : public OpRewritePattern<test::LessThanOp> { + using OpRewritePattern<test::LessThanOp>::OpRewritePattern; + + LogicalResult matchAndRewrite(test::LessThanOp op, + PatternRewriter &rewriter) const override { + if (op->getNumOperands() != 2 || op->getNumResults() != 1) + return failure(); + + TypedAttr lhsAttr; + TypedAttr rhsAttr; + matchPattern(op->getOperand(0), m_Constant(&lhsAttr)); + matchPattern(op->getOperand(1), m_Constant(&rhsAttr)); + + if (!lhsAttr || !rhsAttr) + return failure(); + + Attribute operandAttrs[2] = {lhsAttr, rhsAttr}; + TypedAttr res = cast_or_null<TypedAttr>( + constFoldBinaryOp<FloatAttr, FloatAttr::ValueType, void, IntegerAttr>( + operandAttrs, op.getType(), [](APFloat lhs, APFloat rhs) -> APInt { + return APInt(1, lhs < rhs); + })); + if (!res) + return failure(); + + rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, res); + return success(); + } +}; + /// This pattern moves "test.move_before_parent_op" before the parent op. struct MoveBeforeParentOp : public RewritePattern { MoveBeforeParentOp(MLIRContext *context) @@ -1198,6 +1259,47 @@ public: } }; +/// Pattern that replaces test.replace_with_valid_producer with +/// test.valid_producer and the specified type. 
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index eda618f..ff958d9 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -10,6 +10,7 @@
 #include "TestOps.h"
 #include "TestTypes.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/CommonFolders.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Func/Transforms/FuncConversions.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -202,6 +203,66 @@ struct HoistEligibleOps : public OpRewritePattern<test::OneRegionOp> {
   }
 };
 
+struct FoldSignOpF32ToSI32 : public OpRewritePattern<test::SignOp> {
+  using OpRewritePattern<test::SignOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(test::SignOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op->getNumOperands() != 1 || op->getNumResults() != 1)
+      return failure();
+
+    TypedAttr operandAttr;
+    matchPattern(op->getOperand(0), m_Constant(&operandAttr));
+    if (!operandAttr)
+      return failure();
+
+    TypedAttr res = cast_or_null<TypedAttr>(
+        constFoldUnaryOp<FloatAttr, FloatAttr::ValueType, void, IntegerAttr>(
+            operandAttr, op.getType(), [](APFloat operand) -> APSInt {
+              static const APFloat zero(0.0f);
+              int operandSign = 0;
+              if (operand != zero)
+                operandSign = (operand < zero) ? -1 : +1;
+              return APSInt(APInt(32, operandSign), false);
+            }));
+    if (!res)
+      return failure();
+
+    rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, res);
+    return success();
+  }
+};
+
+struct FoldLessThanOpF32ToI1 : public OpRewritePattern<test::LessThanOp> {
+  using OpRewritePattern<test::LessThanOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(test::LessThanOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op->getNumOperands() != 2 || op->getNumResults() != 1)
+      return failure();
+
+    TypedAttr lhsAttr;
+    TypedAttr rhsAttr;
+    matchPattern(op->getOperand(0), m_Constant(&lhsAttr));
+    matchPattern(op->getOperand(1), m_Constant(&rhsAttr));
+
+    if (!lhsAttr || !rhsAttr)
+      return failure();
+
+    Attribute operandAttrs[2] = {lhsAttr, rhsAttr};
+    TypedAttr res = cast_or_null<TypedAttr>(
+        constFoldBinaryOp<FloatAttr, FloatAttr::ValueType, void, IntegerAttr>(
+            operandAttrs, op.getType(), [](APFloat lhs, APFloat rhs) -> APInt {
+              return APInt(1, lhs < rhs);
+            }));
+    if (!res)
+      return failure();
+
+    rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, res);
+    return success();
+  }
+};
+
 /// This pattern moves "test.move_before_parent_op" before the parent op.
 struct MoveBeforeParentOp : public RewritePattern {
   MoveBeforeParentOp(MLIRContext *context)
@@ -1198,6 +1259,47 @@ public:
   }
 };
 
+/// Pattern that replaces test.replace_with_valid_producer with
+/// test.valid_producer and the specified type.
+class TestReplaceWithValidProducer : public ConversionPattern {
+public:
+  TestReplaceWithValidProducer(MLIRContext *ctx)
+      : ConversionPattern("test.replace_with_valid_producer", 1, ctx) {}
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    auto attr = op->getAttrOfType<TypeAttr>("type");
+    if (!attr)
+      return failure();
+    rewriter.replaceOpWithNewOp<TestValidProducerOp>(op, attr.getValue());
+    return success();
+  }
+};
+
+/// Pattern that replaces test.replace_with_valid_consumer with
+/// test.valid_consumer. Can be used with and without a type converter.
+class TestReplaceWithValidConsumer : public ConversionPattern {
+public:
+  TestReplaceWithValidConsumer(MLIRContext *ctx, const TypeConverter &converter)
+      : ConversionPattern(converter, "test.replace_with_valid_consumer", 1,
+                          ctx) {}
+  TestReplaceWithValidConsumer(MLIRContext *ctx)
+      : ConversionPattern("test.replace_with_valid_consumer", 1, ctx) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    // with_converter present: pattern must have been initialized with a type
+    // converter.
+    // with_converter absent: pattern must have been initialized without a type
+    // converter.
+    if (op->hasAttr("with_converter") != static_cast<bool>(getTypeConverter()))
+      return failure();
+    rewriter.replaceOpWithNewOp<TestValidConsumerOp>(op, operands[0]);
+    return success();
+  }
+};
+
 /// This pattern matches a test.convert_block_args op. It either:
 /// a) Duplicates all block arguments,
 /// b) or: drops all block arguments and replaces each with 2x the first
@@ -1314,6 +1416,7 @@ struct TestTypeConverter : public TypeConverter {
   TestTypeConverter() {
     addConversion(convertType);
     addSourceMaterialization(materializeCast);
+    addTargetMaterialization(materializeCast);
   }
 
   static LogicalResult convertType(Type t, SmallVectorImpl<Type> &results) {
@@ -1389,10 +1492,12 @@ struct TestLegalizePatternDriver
         TestBoundedRecursiveRewrite, TestNestedOpCreationUndoRewrite,
         TestReplaceEraseOp, TestCreateUnregisteredOp, TestUndoMoveOpBefore,
         TestUndoPropertiesModification, TestEraseOp,
+        TestReplaceWithValidProducer, TestReplaceWithValidConsumer,
         TestRepetitive1ToNConsumer>(&getContext());
     patterns.add<TestDropOpSignatureConversion, TestDropAndReplaceInvalidOp,
                  TestPassthroughInvalidOp, TestMultiple1ToNReplacement,
-                 TestBlockArgReplace>(&getContext(), converter);
+                 TestBlockArgReplace, TestReplaceWithValidConsumer>(
+        &getContext(), converter);
     patterns.add<TestConvertBlockArgs>(converter, &getContext());
     mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns,
                                                               converter);
@@ -1402,7 +1507,8 @@ struct TestLegalizePatternDriver
     ConversionTarget target(getContext());
     target.addLegalOp<ModuleOp>();
     target.addLegalOp<LegalOpA, LegalOpB, LegalOpC, TestCastOp, TestValidOp,
-                      TerminatorOp, OneRegionOp>();
+                      TerminatorOp, TestOpConstant, OneRegionOp,
+                      TestValidProducerOp, TestValidConsumerOp>();
     target.addLegalOp(OperationName("test.legal_op", &getContext()));
     target
         .addIllegalOp<ILLegalOpF, TestRegionBuilderOp, TestOpWithRegionFold>();
@@ -1457,6 +1563,7 @@ struct TestLegalizePatternDriver
       DumpNotifications dumpNotifications;
       config.listener = &dumpNotifications;
      config.unlegalizedOps = &unlegalizedOps;
+      config.foldingMode = foldingMode;
      if (failed(applyPartialConversion(getOperation(), target,
                                        std::move(patterns), config))) {
        getOperation()->emitRemark() << "applyPartialConversion failed";
@@ -1476,6 +1583,7 @@ struct TestLegalizePatternDriver
     ConversionConfig config;
     DumpNotifications dumpNotifications;
+    config.foldingMode = foldingMode;
     config.listener = &dumpNotifications;
     if (failed(applyFullConversion(getOperation(), target, std::move(patterns),
                                    config))) {
@@ -1490,6 +1598,7 @@ struct TestLegalizePatternDriver
     // Analyze the convertible operations.
     DenseSet<Operation *> legalizedOps;
     ConversionConfig config;
+    config.foldingMode = foldingMode;
     config.legalizableOps = &legalizedOps;
     if (failed(applyAnalysisConversion(getOperation(), target,
                                        std::move(patterns), config)))
@@ -1510,6 +1619,21 @@ struct TestLegalizePatternDriver
           clEnumValN(ConversionMode::Full, "full", "Perform a full conversion"),
           clEnumValN(ConversionMode::Partial, "partial",
                      "Perform a partial conversion"))};
+
+  Option<DialectConversionFoldingMode> foldingMode{
+      *this, "test-legalize-folding-mode",
+      llvm::cl::desc("The folding mode to use with the test driver"),
+      llvm::cl::init(DialectConversionFoldingMode::BeforePatterns),
+      llvm::cl::values(clEnumValN(DialectConversionFoldingMode::Never, "never",
+                                  "Never attempt to fold"),
+                       clEnumValN(DialectConversionFoldingMode::BeforePatterns,
+                                  "before-patterns",
+                                  "Only attempt to fold not legal operations "
+                                  "before applying patterns"),
+                       clEnumValN(DialectConversionFoldingMode::AfterPatterns,
+                                  "after-patterns",
+                                  "Only attempt to fold not legal operations "
+                                  "after applying patterns"))};
 };
 } // namespace
@@ -2181,6 +2305,24 @@ struct TestSelectiveReplacementPatternDriver
     (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
+
+struct TestFoldTypeConvertingOp
+    : public PassWrapper<TestFoldTypeConvertingOp, OperationPass<>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestFoldTypeConvertingOp)
+
+  StringRef getArgument() const final { return "test-fold-type-converting-op"; }
+  StringRef getDescription() const final {
+    return "Test helper functions for folding ops whose input and output types "
+           "differ, e.g. float comparisons of the form `(f32, f32) -> i1`.";
+  }
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    mlir::RewritePatternSet patterns(context);
+    patterns.add<FoldSignOpF32ToSI32, FoldLessThanOpF32ToI1>(context);
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
+      signalPassFailure();
+  }
+};
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -2211,6 +2353,8 @@ void registerPatternsTestPass() {
   PassRegistration<TestMergeBlocksPatternDriver>();
   PassRegistration<TestSelectiveReplacementPatternDriver>();
+
+  PassRegistration<TestFoldTypeConvertingOp>();
 }
 } // namespace test
 } // namespace mlir
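A sketch of the kind of test the new `test-fold-type-converting-op` pass enables; the constants and CHECK lines are assumptions for illustration, not part of this patch.

// RUN: mlir-opt %s -test-fold-type-converting-op | FileCheck %s

// CHECK-LABEL: func @fold_less_than
func.func @fold_less_than() -> tensor<i1> {
  %lhs = arith.constant dense<1.0> : tensor<f32>
  %rhs = arith.constant dense<2.0> : tensor<f32>
  // The f32 comparison folds to an i1 constant even though the operand and
  // result element types differ.
  // CHECK: %[[CST:.*]] = arith.constant dense<true> : tensor<i1>
  // CHECK: return %[[CST]]
  %0 = test.less_than %lhs, %rhs : (tensor<f32>, tensor<f32>) -> tensor<i1>
  return %0 : tensor<i1>
}
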
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index c6245b6..3bea8ef 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -7,11 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 using namespace mlir;
@@ -147,12 +150,118 @@ struct TestXeGPUUnrollingPatterns
   }
 };
 
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "test-xegpu-layout-interface"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
+// Test pattern for distributing vector::StepOp from workgroup to subgroup.
+// Validates LayoutTrait interfaces for offset computation abstraction between
+// LayoutAttr and SliceAttr.
+class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
+  using OpConversionPattern<vector::StepOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+
+    auto layoutName = xegpu::getLayoutName(op->getResult(0));
+    auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
+    if (!sliceAttr || sliceAttr.getRank() != 1)
+      return failure();
+
+    std::optional<SmallVector<int64_t>> sgShape = sliceAttr.getSgDataAsInt();
+    if (!sgShape)
+      return failure();
+
+    Location loc = op.getLoc();
+    VectorType type = op.getResult().getType();
+    auto wgShape = type.getShape();
+
+    Value sgId =
+        gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
+    auto maybeOffsets = sliceAttr.getOffsets(rewriter, loc, sgId, wgShape);
+    if (failed(maybeOffsets))
+      return failure();
+
+    VectorType newTy = type.cloneWith(*sgShape, type.getElementType());
+    Value base = vector::StepOp::create(rewriter, loc, newTy);
+    SmallVector<Value> newOps;
+    for (auto offsets : *maybeOffsets) {
+      Value bcast =
+          vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]);
+      Value add = arith::AddIOp::create(rewriter, loc, base, bcast);
+      newOps.push_back(add);
+    }
+    rewriter.replaceOpWithMultiple(op, {newOps});
+    return success();
+  }
+};
+
+struct TestXeGPULayoutInterface
+    : public PassWrapper<TestXeGPULayoutInterface,
+                         OperationPass<gpu::GPUModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPULayoutInterface)
+
+  StringRef getArgument() const final { return "test-xegpu-layout-interface"; }
+
+  StringRef getDescription() const final {
+    return "Test the implementation of XeGPU Layout interfaces";
+  }
+
+  void getDependentDialects(::mlir::DialectRegistry &registry) const override {
+    registry.insert<arith::ArithDialect>();
+    registry.insert<memref::MemRefDialect>();
+    registry.insert<xegpu::XeGPUDialect>();
+    registry.insert<vector::VectorDialect>();
+    registry.insert<index::IndexDialect>();
+  }
+
+  TestXeGPULayoutInterface() = default;
+  TestXeGPULayoutInterface(const TestXeGPULayoutInterface &pass)
+      : PassWrapper(pass) {}
+
+  void runOnOperation() override {
+    MLIRContext *ctx = &getContext();
+
+    TypeConverter typeConverter;
+    auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+                               mlir::ValueRange inputs,
+                               mlir::Location loc) -> mlir::Value {
+      return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
+          .getResult(0);
+    };
+    typeConverter.addSourceMaterialization(materializeCast);
+    typeConverter.addTargetMaterialization(materializeCast);
+
+    RewritePatternSet patterns(ctx);
+    patterns.add<TestStepOpPattern>(typeConverter, ctx);
+
+    ConversionTarget target(*ctx);
+    auto isLegal = [&](xegpu::SliceAttr layout) -> bool {
+      return !layout || !layout.isWgLayout();
+    };
+
+    target.addDynamicallyLegalOp<vector::StepOp>(
+        [&](vector::StepOp op) -> bool {
+          auto layoutName = xegpu::getLayoutName(op->getResult(0));
+          auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
+          return isLegal(sliceAttr);
+        });
+
+    target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
+
+    (void)applyPartialConversion(getOperation(), target, std::move(patterns));
+  }
+};
+
 } // namespace
 
 namespace mlir {
 namespace test {
 void registerTestXeGPULowerings() {
   PassRegistration<TestXeGPUUnrollingPatterns>();
+  PassRegistration<TestXeGPULayoutInterface>();
 }
 } // namespace test
 } // namespace mlir
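For reference, the kind of IR this pass is meant to rewrite looks roughly like the sketch below; the layout attribute name and the #xegpu.slice / #xegpu.layout syntax are approximations, not taken from this patch.

// Approximate input for -test-xegpu-layout-interface: a workgroup-level
// vector.step carrying a rank-1 sliced layout. The pass distributes it into a
// per-subgroup vector.step plus a broadcast offset and an add.
gpu.module @kernels {
  gpu.func @wg_step() kernel {
    %0 = vector.step {layout_result_0 = #xegpu.slice<
             #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}
         : vector<128xindex>
    gpu.return
  }
}
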
diff --git a/mlir/test/lib/Pass/TestPassManager.cpp b/mlir/test/lib/Pass/TestPassManager.cpp
index 25c8e53..df2736b 100644
--- a/mlir/test/lib/Pass/TestPassManager.cpp
+++ b/mlir/test/lib/Pass/TestPassManager.cpp
@@ -133,6 +133,51 @@ struct TestOptionsSuperPass
                      llvm::cl::desc("Example list of PassPipelineOptions option")};
 };
 
+struct TestOptionsPassA
+    : public PassWrapper<TestOptionsPassA, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOptionsPassA)
+
+  struct Options : public PassPipelineOptions<Options> {
+    Option<bool> foo{*this, "foo", llvm::cl::desc("Example boolean option")};
+  };
+
+  TestOptionsPassA() = default;
+  TestOptionsPassA(const TestOptionsPassA &) : PassWrapper() {}
+  TestOptionsPassA(const Options &options) { this->options.foo = options.foo; }
+
+  void runOnOperation() final {}
+  StringRef getArgument() const final { return "test-options-pass-a"; }
+  StringRef getDescription() const final {
+    return "Test superset options parsing capabilities - subset A";
+  }
+
+  Options options;
+};
+
+struct TestOptionsPassB
+    : public PassWrapper<TestOptionsPassB, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOptionsPassB)
+
+  struct Options : public PassPipelineOptions<Options> {
+    Option<bool> bar{*this, "bar", llvm::cl::desc("Example boolean option")};
+  };
+
+  TestOptionsPassB() = default;
+  TestOptionsPassB(const TestOptionsPassB &) : PassWrapper() {}
+  TestOptionsPassB(const Options &options) { this->options.bar = options.bar; }
+
+  void runOnOperation() final {}
+  StringRef getArgument() const final { return "test-options-pass-b"; }
+  StringRef getDescription() const final {
+    return "Test superset options parsing capabilities - subset B";
+  }
+
+  Options options;
+};
+
+struct TestPipelineOptionsSuperSetAB : TestOptionsPassA::Options,
+                                       TestOptionsPassB::Options {};
+
 /// A test pass that always aborts to enable testing the crash recovery
 /// mechanism of the pass manager.
 struct TestCrashRecoveryPass
@@ -270,6 +315,9 @@ void registerPassManagerTestPass() {
   PassRegistration<TestOptionsPass>();
   PassRegistration<TestOptionsSuperPass>();
 
+  PassRegistration<TestOptionsPassA>();
+  PassRegistration<TestOptionsPassB>();
+
   PassRegistration<TestModulePass>();
 
   PassRegistration<TestFunctionPass>();
@@ -306,5 +354,16 @@ void registerPassManagerTestPass() {
       [](OpPassManager &pm, const TestOptionsSuperPass::Options &options) {
         pm.addPass(std::make_unique<TestOptionsSuperPass>(options));
       });
+
+  PassPipelineRegistration<TestPipelineOptionsSuperSetAB>
+      registerPipelineOptionsSuperSetABPipeline(
+          "test-options-super-set-ab-pipeline",
+          "Parses options of PassPipelineOptions using pass pipeline "
+          "registration",
+          [](OpPassManager &pm, const TestPipelineOptionsSuperSetAB &options) {
+            // Pass superset AB options to subset options A and B
+            pm.addPass(std::make_unique<TestOptionsPassA>(options));
+            pm.addPass(std::make_unique<TestOptionsPassB>(options));
+          });
 }
 } // namespace mlir
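The superset pipeline forwards one merged option struct to both passes, so a single option string can configure A and B at once. A hedged usage sketch follows; the exact quoting of the option string is an assumption.

// Hypothetical RUN line exercising the superset pipeline registration.
// RUN: mlir-opt %s \
// RUN:   -test-options-super-set-ab-pipeline="foo=true bar=false" \
// RUN:   | FileCheck %s
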
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index feaf5fb..f392bda 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -224,6 +224,9 @@ if config.enable_cuda_runner:
 if config.enable_sycl_runner:
     tools.extend([add_runtime("mlir_sycl_runtime")])
 
+if config.enable_levelzero_runner:
+    tools.extend([add_runtime("mlir_levelzero_runtime")])
+
 if config.enable_spirv_cpu_runner:
     tools.extend([add_runtime("mlir_spirv_cpu_runtime")])
 
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index b1185e1..d904780 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -34,6 +34,7 @@ config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
 config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
 config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
+config.enable_levelzero_runner = @MLIR_ENABLE_LEVELZERO_RUNNER@
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
 config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@
diff --git a/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py b/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py
index ee76b6d..bc273bf 100644
--- a/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py
+++ b/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py
@@ -1,7 +1,7 @@
 # RUN: %PYTHON -m mlir.dialects.linalg.opdsl.dump_oplib .ops.core_named_ops | FileCheck %s
 
 # Just verify that at least one known op is generated.
-# CHECK: name: matmul
+# CHECK: name: copy
 
 # verify some special cases: negf->NegFOp, powf->PowFOp
 # CHECK cpp_class_name: NegFOp