Diffstat (limited to 'mlir/test')
-rw-r--r--  mlir/test/CMakeLists.txt | 4
-rw-r--r--  mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir | 22
-rw-r--r--  mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 14
-rw-r--r--  mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir | 40
-rw-r--r--  mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir | 91
-rw-r--r--  mlir/test/Dialect/AMDGPU/invalid.mlir | 8
-rw-r--r--  mlir/test/Dialect/AMDGPU/ops.mlir | 4
-rw-r--r--  mlir/test/Dialect/Arith/canonicalize.mlir | 14
-rw-r--r--  mlir/test/Dialect/GPU/ops.mlir | 12
-rw-r--r--  mlir/test/Dialect/GPU/outlining.mlir | 30
-rw-r--r--  mlir/test/Dialect/LLVMIR/call-intrin.mlir | 9
-rw-r--r--  mlir/test/Dialect/LLVMIR/inlining.mlir | 18
-rw-r--r--  mlir/test/Dialect/LLVMIR/mem2reg.mlir | 35
-rw-r--r--  mlir/test/Dialect/LLVMIR/rocdl.mlir | 7
-rw-r--r--  mlir/test/Dialect/LLVMIR/roundtrip.mlir | 8
-rw-r--r--  mlir/test/Dialect/LLVMIR/sroa.mlir | 2
-rw-r--r--  mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir | 50
-rw-r--r--  mlir/test/Dialect/Linalg/block-pack-matmul.mlir | 144
-rw-r--r--  mlir/test/Dialect/Linalg/elementwise/named-to-elementwise.mlir | 56
-rw-r--r--  mlir/test/Dialect/Linalg/fold-add-into-dest.mlir | 30
-rw-r--r--  mlir/test/Dialect/Linalg/linalg-morph-category-ops.mlir | 15
-rw-r--r--  mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir | 14
-rw-r--r--  mlir/test/Dialect/Linalg/named-ops.mlir | 44
-rw-r--r--  mlir/test/Dialect/Linalg/rank-reduce-contraction-ops.mlir | 85
-rw-r--r--  mlir/test/Dialect/Linalg/tile-to-forall.mlir | 2
-rw-r--r--  mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir | 12
-rw-r--r--  mlir/test/Dialect/Linalg/transform-op-pad.mlir | 6
-rw-r--r--  mlir/test/Dialect/Linalg/transform-op-specialize-matmul.mlir | 89
-rw-r--r--  mlir/test/Dialect/Linalg/transpose-matmul.mlir | 38
-rw-r--r--  mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir | 665
-rw-r--r--  mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir | 117
-rw-r--r--  mlir/test/Dialect/Vector/canonicalize.mlir | 100
-rw-r--r--  mlir/test/Dialect/Vector/invalid.mlir | 62
-rw-r--r--  mlir/test/Dialect/Vector/vector-sink.mlir | 30
-rw-r--r--  mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir | 8
-rw-r--r--  mlir/test/Dialect/WasmSSA/custom_parser/if.mlir | 53
-rw-r--r--  mlir/test/Dialect/WasmSSA/custom_parser/memory.mlir | 7
-rw-r--r--  mlir/test/Dialect/WasmSSA/custom_parser/table.mlir | 7
-rw-r--r--  mlir/test/Dialect/XeGPU/invalid.mlir | 29
-rw-r--r--  mlir/test/Dialect/XeGPU/layout.mlir | 23
-rw-r--r--  mlir/test/Dialect/XeGPU/ops.mlir | 36
-rw-r--r--  mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir | 37
-rw-r--r--  mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 27
-rw-r--r--  mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 48
-rw-r--r--  mlir/test/Dialect/common_folders.mlir | 22
-rw-r--r--  mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir | 9
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir | 4
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir | 4
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir | 4
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir | 2
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir | 4
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir | 4
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir | 6
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir | 6
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir | 4
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir | 4
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir | 2
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir | 2
-rw-r--r--  mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir | 8
-rw-r--r--  mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir | 59
-rw-r--r--  mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir | 57
-rw-r--r--  mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir | 56
-rw-r--r--  mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir | 86
-rw-r--r--  mlir/test/Integration/GPU/LevelZero/lit.local.cfg | 2
-rw-r--r--  mlir/test/Pass/pipeline-options-parsing.mlir | 10
-rw-r--r--  mlir/test/Target/LLVMIR/Import/intrinsic-prefer-unregistered.ll | 8
-rw-r--r--  mlir/test/Target/LLVMIR/Import/intrinsic.ll | 12
-rw-r--r--  mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir | 8
-rw-r--r--  mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 19
-rw-r--r--  mlir/test/Target/LLVMIR/nvvmir.mlir | 11
-rw-r--r--  mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir | 44
-rw-r--r--  mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir | 36
-rw-r--r--  mlir/test/Target/LLVMIR/rocdl.mlir | 17
-rw-r--r--  mlir/test/Target/SPIRV/arm-tensor-constant.mlir | 56
-rw-r--r--  mlir/test/Target/SPIRV/decorations.mlir | 14
-rw-r--r--  mlir/test/Transforms/test-legalizer-fold-after.mlir | 9
-rw-r--r--  mlir/test/Transforms/test-legalizer-fold-before.mlir | 9
-rw-r--r--  mlir/test/Transforms/test-legalizer-no-fold.mlir | 12
-rw-r--r--  mlir/test/Transforms/test-legalizer.mlir | 17
-rw-r--r--  mlir/test/lib/Dialect/Test/TestOps.td | 26
-rw-r--r--  mlir/test/lib/Dialect/Test/TestPatterns.cpp | 148
-rw-r--r--  mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 109
-rw-r--r--  mlir/test/lib/Pass/TestPassManager.cpp | 59
-rw-r--r--  mlir/test/lit.cfg.py | 3
-rw-r--r--  mlir/test/lit.site.cfg.py.in | 1
-rw-r--r--  mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py | 2
86 files changed, 2170 insertions(+), 957 deletions(-)
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 89568e7..a4a942d 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -167,6 +167,10 @@ if(MLIR_ENABLE_SYCL_RUNNER)
list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime)
endif()
+if(MLIR_ENABLE_LEVELZERO_RUNNER)
+ list(APPEND MLIR_TEST_DEPENDS mlir_levelzero_runtime)
+endif()
+
if (MLIR_RUN_ARM_SME_TESTS AND NOT ARM_SME_ABI_ROUTINES_SHLIB)
list(APPEND MLIR_TEST_DEPENDS mlir_arm_sme_abi_stubs)
endif()
diff --git a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir
index 2a7be0b..e6321e9 100644
--- a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir
+++ b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir
@@ -85,6 +85,28 @@ func.func @load_i1(%src: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i :
return %0: i1
}
+// CHECK-LABEL: func @load_aligned
+// CHECK-SAME: (%[[SRC:.+]]: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %[[IDX:.+]]: index)
+func.func @load_aligned(%src: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i : index) -> i1 {
+ // CHECK: spirv.Load "StorageBuffer" {{.*}} ["Aligned", 32] : i8
+ %0 = memref.load %src[%i] { alignment = 32 } : memref<4xi1, #spirv.storage_class<StorageBuffer>>
+ return %0: i1
+}
+
+// CHECK-LABEL: func @load_aligned_nontemporal
+func.func @load_aligned_nontemporal(%src: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i : index) -> i1 {
+ // CHECK: spirv.Load "StorageBuffer" {{.*}} ["Aligned|Nontemporal", 32] : i8
+ %0 = memref.load %src[%i] { alignment = 32, nontemporal = true } : memref<4xi1, #spirv.storage_class<StorageBuffer>>
+ return %0: i1
+}
+
+// CHECK-LABEL: func @load_aligned_psb
+func.func @load_aligned_psb(%src: memref<4xi1, #spirv.storage_class<PhysicalStorageBuffer>>, %i : index) -> i1 {
+ // CHECK: %[[VAL:.+]] = spirv.Load "PhysicalStorageBuffer" {{.*}} ["Aligned", 32] : i8
+ %0 = memref.load %src[%i] { alignment = 32 } : memref<4xi1, #spirv.storage_class<PhysicalStorageBuffer>>
+ return %0: i1
+}
+
// CHECK-LABEL: func @store_i1
// CHECK-SAME: %[[DST:.+]]: memref<4xi1, #spirv.storage_class<StorageBuffer>>,
// CHECK-SAME: %[[IDX:.+]]: index
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 580b09d..e505767 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -681,3 +681,17 @@ llvm.func @ex2(%input : f32, %pred : i1) {
%1 = nvvm.inline_ptx "ex2.approx.ftz.f32 $0, $1;" (%input), predicate = %pred : f32, i1 -> f32
llvm.return
}
+
+// -----
+
+// CHECK-LABEL: @nvvm_pmevent
+llvm.func @nvvm_pmevent() {
+ // CHECK: %[[S0:.+]] = llvm.mlir.constant(10 : i32) : i32
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "pmevent $0;", "n" %[[S0]] : (i32) -> ()
+
+ nvvm.pmevent id = 10
+ // CHECK: %[[S1:.+]] = llvm.mlir.constant(4 : i32) : i32
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "pmevent $0;", "n" %[[S1]] : (i32) -> ()
+ nvvm.pmevent id = 4
+ llvm.return
+}
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir
index 31e17fb..5a424a8 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir
@@ -1679,6 +1679,16 @@ func.func @load_0d(%memref : memref<200x100xf32>, %i : index, %j : index) -> vec
// -----
+func.func @load_with_alignment(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<8xf32> {
+ %0 = vector.load %memref[%i, %j] { alignment = 8 } : memref<200x100xf32>, vector<8xf32>
+ return %0 : vector<8xf32>
+}
+
+// CHECK-LABEL: func @load_with_alignment
+// CHECK: llvm.load {{.*}} {alignment = 8 : i64} : !llvm.ptr -> vector<8xf32>
+
+// -----
+
//===----------------------------------------------------------------------===//
// vector.store
//===----------------------------------------------------------------------===//
@@ -1785,6 +1795,16 @@ func.func @store_0d(%memref : memref<200x100xf32>, %i : index, %j : index) {
// -----
+func.func @store_with_alignment(%memref : memref<200x100xf32>, %i : index, %j : index, %val : vector<4xf32>) {
+ vector.store %val, %memref[%i, %j] {alignment = 8} : memref<200x100xf32>, vector<4xf32>
+ return
+}
+
+// CHECK-LABEL: func @store_with_alignment
+// CHECK: llvm.store %{{.*}} {alignment = 8 : i64} : vector<4xf32>, !llvm.ptr
+
+// -----
+
//===----------------------------------------------------------------------===//
// vector.maskedload
//===----------------------------------------------------------------------===//
@@ -1839,6 +1859,16 @@ func.func @masked_load_index_scalable(%arg0: memref<?xindex>, %arg1: vector<[16]
// -----
+func.func @masked_load_with_alignment(%arg0: memref<?xf32>, %arg1: vector<16xi1>, %arg2: vector<16xf32>, %arg3: index) -> vector<16xf32> {
+ %0 = vector.maskedload %arg0[%arg3], %arg1, %arg2 { alignment = 2 } : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+ return %0 : vector<16xf32>
+}
+
+// CHECK-LABEL: func @masked_load_with_alignment
+// CHECK: llvm.intr.masked.load %{{.*}} {alignment = 2 : i32} : (!llvm.ptr, vector<16xi1>, vector<16xf32>) -> vector<16xf32>
+
+// -----
+
//===----------------------------------------------------------------------===//
// vector.maskedstore
//===----------------------------------------------------------------------===//
@@ -1891,6 +1921,16 @@ func.func @masked_store_index_scalable(%arg0: memref<?xindex>, %arg1: vector<[16
// -----
+func.func @masked_store_with_alignment(%arg0: memref<?xf32>, %arg1: vector<16xi1>, %arg2: vector<16xf32>, %arg3: index) {
+ vector.maskedstore %arg0[%arg3], %arg1, %arg2 { alignment = 2 } : memref<?xf32>, vector<16xi1>, vector<16xf32>
+ return
+}
+
+// CHECK-LABEL: func @masked_store_with_alignment
+// CHECK: llvm.intr.masked.store %{{.*}} {alignment = 2 : i32} : vector<16xf32>, vector<16xi1> into !llvm.ptr
+
+// -----
+
//===----------------------------------------------------------------------===//
// vector.gather
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index 57afa12..8ca3dd6 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -54,18 +54,20 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
// CHECK: func @test_expand_shape
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
- // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
- // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
- // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]]
- // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, 3>
+ // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
+ // CHECK: %[[IDXL:.*]] = affine.linearize_index [%[[C0]], %[[C0]]] by (64, 64) : index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDXM]]], %[[LOCAL]][%[[IDXL]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3>
- %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<8192xf16>
- %expand = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16> into memref<64x128xf16>
+ %expand_mem = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16> into memref<64x128xf16>
+ %expand_alloc = memref.expand_shape %alloc [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
%c0 = arith.constant 0 : index
- amdgpu.gather_to_lds %expand[%offset_i, %offset_j], %alloc[%c0, %c0]
+ amdgpu.gather_to_lds %expand_mem[%offset_i, %offset_j], %expand_alloc[%c0, %c0]
: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu_lds_addrspace>
func.return
}
@@ -80,15 +82,82 @@ func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
// CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
- // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
- // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]]
+ // CHECK: %[[INDICES_MEM:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
+ // CHECK: %[[INDICES_LDS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (64, 64) : index, index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES_MEM]]#0, %[[INDICES_MEM]]#1], %[[LOCAL]][%[[INDICES_LDS]]#0, %[[INDICES_LDS]]#1]
// CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %collapse_alloc = memref.collapse_shape %alloc [[0, 1]] : memref<64x64xf16, #gpu_lds_addrspace> into memref<4096xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<64x128xf16>
- %collapse = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16>
+ %collapse_mem = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16>
%c0 = arith.constant 0 : index
- amdgpu.gather_to_lds %collapse[%offset_i], %alloc[%c0, %c0]
+ amdgpu.gather_to_lds %collapse_mem[%offset_i], %collapse_alloc[%offset_j]
+ : vector<8xf16>, memref<8192xf16>, memref<4096xf16, #gpu_lds_addrspace>
+ func.return
+}
+
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+
+// CHECK: func @test_expand_shape_src_raw_buffer
+// CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
+func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG1]], %[[ARG2]]] by (64, 128) : index
+ // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[IDXM]]], %[[LOCAL]][%[[C0]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
+ %expand_mem = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>> into memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>
+
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %expand_mem[%offset_i, %offset_j], %alloc[%c0]
+ : vector<8xf16>, memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu_lds_addrspace>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_expand_shape_dst_only
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[IDX_LDS:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (64, 64) : index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]]], %[[LOCAL]][%[[IDX_LDS]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
+ %mem = memref.alloc() : memref<8192xf16>
+ %expand_alloc = memref.expand_shape %alloc [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
+
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %mem[%offset_i], %expand_alloc[%offset_j, %c0]
: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, #gpu_lds_addrspace>
func.return
}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_nop
+// CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
+func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[ARG1]]], %[[LOCAL]][%[[ARG2]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
+ amdgpu.gather_to_lds %mem[%offset_i], %alloc[%offset_j]
+ : vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu_lds_addrspace>
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 0d2fd24..66e7dd4 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -230,3 +230,11 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 :
amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16>
func.return
}
+
+// -----
+
+func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16, strided<[?]>, #gpu.address_space<workgroup>>) {
+ // expected-error@+1 {{'amdgpu.gather_to_lds' op destination type inner most dim must be contiguous}}
+ amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16, strided<[?]>, #gpu.address_space<workgroup>>
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index fe78b53..87e11c0 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -539,13 +539,15 @@ func.func @transpose_load(%idx1 : index, %idx2 : index, %mem : memref<128x32xf16
}
// CHECK-LABEL: func @gather_to_lds
-func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>) {
+func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>, %smem3 : memref<?x?xf16, strided<[?, 1]>, #gpu.address_space<workgroup>>) {
// CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
// CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}]
// CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
+ // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32x32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem1[%idx1] : vector<2xf16>, memref<32x32xf16>, memref<32xf16, #gpu.address_space<workgroup>>
amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
+ amdgpu.gather_to_lds %mem1[%idx1], %smem3[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<?x?xf16, strided<[?, 1]>, #gpu.address_space<workgroup>>
func.return
}
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index 3d5a46d..78f6782 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -654,7 +654,7 @@ func.func @signExtendConstant() -> i16 {
// CHECK: return %[[cres]]
func.func @signExtendConstantSplat() -> vector<4xi16> {
%c-2 = arith.constant -2 : i8
- %splat = vector.splat %c-2 : vector<4xi8>
+ %splat = vector.broadcast %c-2 : i8 to vector<4xi8>
%ext = arith.extsi %splat : vector<4xi8> to vector<4xi16>
return %ext : vector<4xi16>
}
@@ -682,7 +682,7 @@ func.func @unsignedExtendConstant() -> i16 {
// CHECK: return %[[cres]]
func.func @unsignedExtendConstantSplat() -> vector<4xi16> {
%c2 = arith.constant 2 : i8
- %splat = vector.splat %c2 : vector<4xi8>
+ %splat = vector.broadcast %c2 : i8 to vector<4xi8>
%ext = arith.extui %splat : vector<4xi8> to vector<4xi16>
return %ext : vector<4xi16>
}
@@ -866,7 +866,7 @@ func.func @truncExtsiVector(%arg0: vector<2xi32>) -> vector<2xi16> {
// CHECK: return %[[cres]]
func.func @truncConstantSplat() -> vector<4xi8> {
%c-2 = arith.constant -2 : i16
- %splat = vector.splat %c-2 : vector<4xi16>
+ %splat = vector.broadcast %c-2 : i16 to vector<4xi16>
%trunc = arith.trunci %splat : vector<4xi16> to vector<4xi8>
return %trunc : vector<4xi8>
}
@@ -2334,7 +2334,7 @@ func.func @constant_FPtoUI_splat() -> vector<4xi32> {
// CHECK: %[[C0:.+]] = arith.constant dense<2> : vector<4xi32>
// CHECK: return %[[C0]]
%c0 = arith.constant 2.0 : f32
- %splat = vector.splat %c0 : vector<4xf32>
+ %splat = vector.broadcast %c0 : f32 to vector<4xf32>
%res = arith.fptoui %splat : vector<4xf32> to vector<4xi32>
return %res : vector<4xi32>
}
@@ -2374,7 +2374,7 @@ func.func @constant_FPtoSI_splat() -> vector<4xi32> {
// CHECK: %[[C0:.+]] = arith.constant dense<-2> : vector<4xi32>
// CHECK: return %[[C0]]
%c0 = arith.constant -2.0 : f32
- %splat = vector.splat %c0 : vector<4xf32>
+ %splat = vector.broadcast %c0 : f32 to vector<4xf32>
%res = arith.fptosi %splat : vector<4xf32> to vector<4xi32>
return %res : vector<4xi32>
}
@@ -2413,7 +2413,7 @@ func.func @constant_SItoFP_splat() -> vector<4xf32> {
// CHECK: %[[C0:.+]] = arith.constant dense<2.000000e+00> : vector<4xf32>
// CHECK: return %[[C0]]
%c0 = arith.constant 2 : i32
- %splat = vector.splat %c0 : vector<4xi32>
+ %splat = vector.broadcast %c0 : i32 to vector<4xi32>
%res = arith.sitofp %splat : vector<4xi32> to vector<4xf32>
return %res : vector<4xf32>
}
@@ -2442,7 +2442,7 @@ func.func @constant_UItoFP_splat() -> vector<4xf32> {
// CHECK: %[[C0:.+]] = arith.constant dense<2.000000e+00> : vector<4xf32>
// CHECK: return %[[C0]]
%c0 = arith.constant 2 : i32
- %splat = vector.splat %c0 : vector<4xi32>
+ %splat = vector.broadcast %c0 : i32 to vector<4xi32>
%res = arith.uitofp %splat : vector<4xi32> to vector<4xf32>
return %res : vector<4xf32>
}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index ee1fdfa..9cc0bf8 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -17,6 +17,18 @@ module attributes {gpu.container_module} {
return
}
+ // CHECK-LABEL:func @launch_with_module_func_attr(%{{.*}}: index)
+ func.func @launch_with_module_func_attr(%sz : index) {
+ // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) module(@test_module) function(@test_kernel_func)
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
+ threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz)
+ module(@test_module) function(@test_kernel_func) {
+ // CHECK: gpu.terminator
+ gpu.terminator
+ }
+ return
+ }
+
// CHECK-LABEL:func @args(%{{.*}}: index, %{{.*}}: index, %{{.*}}: f32, %{{.*}}: memref<?xf32, 1>) {
func.func @args(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32,1>) {
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index d48fa05..0490118 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -509,7 +509,7 @@ func.func @launch_cluster() {
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
// -----
-// This test tests the two optional attributes kernelModule and kernelFunc for gpu.launch
+// This test tests the two optional attributes `module` and `function` for gpu.launch
// CHECK-LABEL: func.func @testKernelAttributes()
// CHECK: gpu.launch_func @test_module::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
// CHECK: gpu.module @test_module
@@ -523,15 +523,16 @@ func.func @testKernelAttributes() {
%bDimZ = arith.constant 8 : index
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
- threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
+ threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ)
+ module(@test_module) function(@test_kernel_func) {
"some_op"(%bx, %tx) : (index, index) -> ()
gpu.terminator
- } {kernelModule = @test_module, kernelFunc = @test_kernel_func}
+ }
return
}
// -----
-// This test tests the two optional attributes kernelModule and kernelFunc for gpu.launch, when kernelModule already exists.
+// This test tests the two optional attributes `module` and `function` for gpu.launch, when kernelModule already exists.
// CHECK-LABEL: gpu.module @existing_module
// CHECK: gpu.func @test_kernel_func()
@@ -556,15 +557,16 @@ func.func @testExistingModule() {
%bDimZ = arith.constant 8 : index
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
- threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
+ threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ)
+ module(@existing_module) function(@test_kernel_func) {
"some_op"(%bx, %tx) : (index, index) -> ()
gpu.terminator
- } {kernelModule = @existing_module, kernelFunc = @test_kernel_func}
+ }
return
}
// -----
-// This test tests the optional attribute kernelModule for gpu.launch.
+// This test tests the optional attribute `module` for gpu.launch.
// CHECK-LABEL: func.func @testKernelModuleOnly()
// CHECK: gpu.launch_func @test_module::@testKernelModuleOnly_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
// CHECK: gpu.module @test_module
@@ -578,15 +580,16 @@ func.func @testKernelModuleOnly() {
%bDimZ = arith.constant 8 : index
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
- threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
+ threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ)
+ module(@test_module) {
"some_op"(%bx, %tx) : (index, index) -> ()
gpu.terminator
- } {kernelModule = @test_module}
+ }
return
}
// -----
-// This test tests the optional attribute kernelFunc for gpu.launch.
+// This test tests the optional attribute `function` for gpu.launch.
// CHECK-LABEL: func.func @testKernelFuncOnly()
// CHECK: gpu.launch_func @test_kernel_func::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
@@ -601,15 +604,16 @@ func.func @testKernelFuncOnly() {
%bDimZ = arith.constant 8 : index
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
- threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
+ threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ)
+ function(@test_kernel_func) {
"some_op"(%bx, %tx) : (index, index) -> ()
gpu.terminator
- } {kernelFunc = @test_kernel_func}
+ }
return
}
// -----
-// This test tests gpu.launch when optional attributes kernelModule and kernelFunc are not specified.
+// This test tests gpu.launch when optional attributes `module` and `function` are not specified.
// CHECK-LABEL: func.func @testNoAttributes()
// CHECK: gpu.launch_func @testNoAttributes_kernel::@testNoAttributes_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
diff --git a/mlir/test/Dialect/LLVMIR/call-intrin.mlir b/mlir/test/Dialect/LLVMIR/call-intrin.mlir
index b8d845d..bf11e07 100644
--- a/mlir/test/Dialect/LLVMIR/call-intrin.mlir
+++ b/mlir/test/Dialect/LLVMIR/call-intrin.mlir
@@ -27,14 +27,13 @@ llvm.func @round_overloaded() -> f32 {
// CHECK: define void @lifetime_start() {
// CHECK: %1 = alloca float, i8 1, align 4
-// CHECK: call void @llvm.lifetime.start.p0(i64 4, ptr %1)
+// CHECK: call void @llvm.lifetime.start.p0(ptr %1)
// CHECK: ret void
// CHECK: }
llvm.func @lifetime_start() {
- %0 = llvm.mlir.constant(4 : i64) : i64
- %1 = llvm.mlir.constant(1 : i8) : i8
- %2 = llvm.alloca %1 x f32 : (i8) -> !llvm.ptr
- llvm.call_intrinsic "llvm.lifetime.start"(%0, %2) {} : (i64, !llvm.ptr) -> ()
+ %0 = llvm.mlir.constant(1 : i8) : i8
+ %1 = llvm.alloca %0 x f32 : (i8) -> !llvm.ptr
+ llvm.call_intrinsic "llvm.lifetime.start"(%1) {} : (!llvm.ptr) -> ()
llvm.return
}
diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir
index 551e0c9..8e292f4 100644
--- a/mlir/test/Dialect/LLVMIR/inlining.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining.mlir
@@ -299,7 +299,7 @@ llvm.func @test_inline(%cond0 : i1, %cond1 : i1, %funcArg : f32) -> f32 {
^bb1:
// Make sure the lifetime begin intrinsic has been inserted where the call
// used to be, even though the alloca has been moved to the entry block.
- // CHECK-NEXT: llvm.intr.lifetime.start 4, %[[PTR]]
+ // CHECK-NEXT: llvm.intr.lifetime.start %[[PTR]]
%0 = llvm.call @static_alloca(%cond1) : (i1) -> f32
// CHECK: llvm.cond_br %{{.+}}, ^[[BB2:.+]], ^[[BB3:.+]]
llvm.br ^bb3(%0: f32)
@@ -307,9 +307,9 @@ llvm.func @test_inline(%cond0 : i1, %cond1 : i1, %funcArg : f32) -> f32 {
// return sites of the callee.
// CHECK: ^[[BB2]]:
// CHECK-NEXT: llvm.load
- // CHECK-NEXT: llvm.intr.lifetime.end 4, %[[PTR]]
+ // CHECK-NEXT: llvm.intr.lifetime.end %[[PTR]]
// CHECK: ^[[BB3]]:
- // CHECK-NEXT: llvm.intr.lifetime.end 4, %[[PTR]]
+ // CHECK-NEXT: llvm.intr.lifetime.end %[[PTR]]
^bb2:
llvm.br ^bb3(%funcArg: f32)
^bb3(%blockArg: f32):
@@ -334,9 +334,9 @@ llvm.func @test_inline(%cond0 : i1) {
// CHECK: "test.one_region_op"() ({
"test.one_region_op"() ({
%0 = llvm.call @static_alloca() : () -> f32
- // CHECK-NEXT: llvm.intr.lifetime.start 4, %[[ALLOCA]]
+ // CHECK-NEXT: llvm.intr.lifetime.start %[[ALLOCA]]
// CHECK-NEXT: %[[RES:.+]] = llvm.load %[[ALLOCA]]
- // CHECK-NEXT: llvm.intr.lifetime.end 4, %[[ALLOCA]]
+ // CHECK-NEXT: llvm.intr.lifetime.end %[[ALLOCA]]
// CHECK-NEXT: test.region_yield %[[RES]]
test.region_yield %0 : f32
}) : () -> ()
@@ -368,9 +368,9 @@ llvm.func @test_inline(%cond0 : i1) {
llvm.func @alloca_with_lifetime(%cond: i1) -> f32 {
%0 = llvm.mlir.constant(4 : i32) : i32
%1 = llvm.alloca %0 x f32 : (i32) -> !llvm.ptr
- llvm.intr.lifetime.start 4, %1 : !llvm.ptr
+ llvm.intr.lifetime.start %1 : !llvm.ptr
%2 = llvm.load %1 : !llvm.ptr -> f32
- llvm.intr.lifetime.end 4, %1 : !llvm.ptr
+ llvm.intr.lifetime.end %1 : !llvm.ptr
%3 = llvm.fadd %2, %2 : f32
llvm.return %3 : f32
}
@@ -385,9 +385,9 @@ llvm.func @test_inline(%cond0 : i1, %cond1 : i1, %funcArg : f32) -> f32 {
^bb1:
// Make sure the original lifetime intrinsic has been preserved, rather than
// inserting a new one with a larger scope.
- // CHECK: llvm.intr.lifetime.start 4, %[[PTR]]
+ // CHECK: llvm.intr.lifetime.start %[[PTR]]
// CHECK-NEXT: llvm.load %[[PTR]]
- // CHECK-NEXT: llvm.intr.lifetime.end 4, %[[PTR]]
+ // CHECK-NEXT: llvm.intr.lifetime.end %[[PTR]]
// CHECK: llvm.fadd
// CHECK-NOT: llvm.intr.lifetime.end
%0 = llvm.call @alloca_with_lifetime(%cond1) : (i1) -> f32
diff --git a/mlir/test/Dialect/LLVMIR/mem2reg.mlir b/mlir/test/Dialect/LLVMIR/mem2reg.mlir
index 56634cf..716a586 100644
--- a/mlir/test/Dialect/LLVMIR/mem2reg.mlir
+++ b/mlir/test/Dialect/LLVMIR/mem2reg.mlir
@@ -304,10 +304,9 @@ llvm.func @g()
// CHECK-NOT: = llvm.alloca
llvm.func amdgpu_kernelcc @addrspace_discard() {
%0 = llvm.mlir.constant(1 : i32) : i32
- %1 = llvm.mlir.constant(2 : i64) : i64
- %2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
- %3 = llvm.addrspacecast %2 : !llvm.ptr<5> to !llvm.ptr
- llvm.intr.lifetime.start 2, %3 : !llvm.ptr
+ %1 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
+ %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+ llvm.intr.lifetime.start %2 : !llvm.ptr
llvm.return
}
@@ -406,9 +405,9 @@ llvm.func @unreachable_jumps_to_merge_point(%arg0: i1) -> i32 {
llvm.func @ignore_lifetime() {
%0 = llvm.mlir.constant(1 : i32) : i32
%1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr
- llvm.intr.lifetime.start 2, %1 : !llvm.ptr
+ llvm.intr.lifetime.start %1 : !llvm.ptr
llvm.store %0, %1 {alignment = 4 : i64} : i32, !llvm.ptr
- llvm.intr.lifetime.end 2, %1 : !llvm.ptr
+ llvm.intr.lifetime.end %1 : !llvm.ptr
llvm.return
}
@@ -437,9 +436,9 @@ llvm.func @ignore_discardable_tree() {
%5 = llvm.insertvalue %1, %4[1] : !llvm.struct<(i8, i16)>
%6 = llvm.alloca %0 x !llvm.struct<(i8, i16)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
%7 = llvm.getelementptr %6[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i8, i16)>
- llvm.intr.lifetime.start 2, %7 : !llvm.ptr
+ llvm.intr.lifetime.start %7 : !llvm.ptr
llvm.store %5, %6 {alignment = 2 : i64} : !llvm.struct<(i8, i16)>, !llvm.ptr
- llvm.intr.lifetime.end 2, %7 : !llvm.ptr
+ llvm.intr.lifetime.end %7 : !llvm.ptr
llvm.return
}
@@ -517,8 +516,8 @@ llvm.func @discardable_use_tree() {
%2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr
%3 = llvm.bitcast %2 : !llvm.ptr to !llvm.ptr
%4 = llvm.bitcast %3 : !llvm.ptr to !llvm.ptr
- llvm.intr.lifetime.start 2, %3 : !llvm.ptr
- llvm.intr.lifetime.start 2, %4 : !llvm.ptr
+ llvm.intr.lifetime.start %3 : !llvm.ptr
+ llvm.intr.lifetime.start %4 : !llvm.ptr
%5 = llvm.intr.invariant.start 2, %3 : !llvm.ptr
llvm.intr.invariant.end %5, 2, %3 : !llvm.ptr
llvm.return
@@ -534,8 +533,8 @@ llvm.func @non_discardable_use_tree() {
%2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr
%3 = llvm.bitcast %2 : !llvm.ptr to !llvm.ptr
%4 = llvm.bitcast %3 : !llvm.ptr to !llvm.ptr
- llvm.intr.lifetime.start 2, %3 : !llvm.ptr
- llvm.intr.lifetime.start 2, %4 : !llvm.ptr
+ llvm.intr.lifetime.start %3 : !llvm.ptr
+ llvm.intr.lifetime.start %4 : !llvm.ptr
llvm.call @use(%4) : (!llvm.ptr) -> i1
llvm.return
}
@@ -551,8 +550,8 @@ llvm.func @trivial_get_element_ptr() {
%2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr
%3 = llvm.bitcast %2 : !llvm.ptr to !llvm.ptr
%4 = llvm.getelementptr %3[0] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.intr.lifetime.start 2, %3 : !llvm.ptr
- llvm.intr.lifetime.start 2, %4 : !llvm.ptr
+ llvm.intr.lifetime.start %3 : !llvm.ptr
+ llvm.intr.lifetime.start %4 : !llvm.ptr
llvm.return
}
@@ -565,8 +564,8 @@ llvm.func @nontrivial_get_element_ptr() {
// CHECK: = llvm.alloca
%2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr
%4 = llvm.getelementptr %2[1] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.intr.lifetime.start 2, %2 : !llvm.ptr
- llvm.intr.lifetime.start 2, %4 : !llvm.ptr
+ llvm.intr.lifetime.start %2 : !llvm.ptr
+ llvm.intr.lifetime.start %4 : !llvm.ptr
llvm.return
}
@@ -580,8 +579,8 @@ llvm.func @dynamic_get_element_ptr() {
%2 = llvm.alloca %0 x i8 {alignment = 8 : i64} : (i32) -> !llvm.ptr
%3 = llvm.bitcast %2 : !llvm.ptr to !llvm.ptr
%4 = llvm.getelementptr %3[%0] : (!llvm.ptr, i32) -> !llvm.ptr, i8
- llvm.intr.lifetime.start 2, %3 : !llvm.ptr
- llvm.intr.lifetime.start 2, %4 : !llvm.ptr
+ llvm.intr.lifetime.start %3 : !llvm.ptr
+ llvm.intr.lifetime.start %4 : !llvm.ptr
llvm.return
}
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index a2b2f84..db5271c 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -981,6 +981,13 @@ llvm.func @rocdl.s.wait.expcnt() {
// -----
+llvm.func @rocdl.readfirstlane(%src : f32) -> f32 {
+ // CHECK-LABEL: rocdl.readfirstlane
+ // CHECK: rocdl.readfirstlane %{{.*}} : f32
+ %ret = rocdl.readfirstlane %src : f32
+ llvm.return %ret : f32
+}
+
llvm.func @rocdl.readlane(%src : f32) -> f32 {
%cst0 = llvm.mlir.constant(0 : i32) : i32
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index a0273fb..7344797 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -685,10 +685,10 @@ func.func @fastmathFlags(%arg0: f32, %arg1: f32, %arg2: i32, %arg3: vector<2 x f
// CHECK-LABEL: @lifetime
// CHECK-SAME: %[[P:.*]]: !llvm.ptr
llvm.func @lifetime(%p: !llvm.ptr) {
- // CHECK: llvm.intr.lifetime.start 16, %[[P]]
- llvm.intr.lifetime.start 16, %p : !llvm.ptr
- // CHECK: llvm.intr.lifetime.end 16, %[[P]]
- llvm.intr.lifetime.end 16, %p : !llvm.ptr
+ // CHECK: llvm.intr.lifetime.start %[[P]]
+ llvm.intr.lifetime.start %p : !llvm.ptr
+ // CHECK: llvm.intr.lifetime.end %[[P]]
+ llvm.intr.lifetime.end %p : !llvm.ptr
llvm.return
}
diff --git a/mlir/test/Dialect/LLVMIR/sroa.mlir b/mlir/test/Dialect/LLVMIR/sroa.mlir
index fe1531d..1674bbd 100644
--- a/mlir/test/Dialect/LLVMIR/sroa.mlir
+++ b/mlir/test/Dialect/LLVMIR/sroa.mlir
@@ -177,7 +177,7 @@ llvm.func @direct_promotable_use_is_fine() -> i32 {
// CHECK: %[[RES:.*]] = llvm.load %[[ALLOCA]]
%3 = llvm.load %2 : !llvm.ptr -> i32
// This is a direct use of the slot but it can be removed because it implements PromotableOpInterface.
- llvm.intr.lifetime.start 2, %1 : !llvm.ptr
+ llvm.intr.lifetime.start %1 : !llvm.ptr
// CHECK: llvm.return %[[RES]] : i32
llvm.return %3 : i32
}
diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir
index 4ba4b09..2f30e8b 100644
--- a/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir
+++ b/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir
@@ -20,20 +20,6 @@ func.func @block_matmul(
return %0 : tensor<64x64xf32>
}
-func.func @block_matmul_transpose_a(
- %A: tensor<128x64xf32>, %B: tensor<128x64xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
- %0 = linalg.matmul_transpose_a ins(%A, %B : tensor<128x64xf32>, tensor<128x64xf32>)
- outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32>
- return %0 : tensor<64x64xf32>
-}
-
-func.func @block_matmul_transpose_b(
- %A: tensor<64x128xf32>, %B: tensor<64x128xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
- %0 = linalg.matmul_transpose_b ins(%A, %B : tensor<64x128xf32>, tensor<64x128xf32>)
- outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32>
- return %0 : tensor<64x64xf32>
-}
-
// MMT4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
// MMT4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
// MMT4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
@@ -43,18 +29,6 @@ func.func @block_matmul_transpose_b(
// MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
// MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
// MMT4D-COUNT-1: linalg.unpack
-// MMT4D-LABEL: func @block_matmul_transpose_a
-// MMT4D-COUNT-3: linalg.pack
-// MMT4D: linalg.generic
-// MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// MMT4D-COUNT-1: linalg.unpack
-// MMT4D-LABEL: func @block_matmul_transpose_b
-// MMT4D-COUNT-3: linalg.pack
-// MMT4D: linalg.generic
-// MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// MMT4D-COUNT-1: linalg.unpack
// MM4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
// MM4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>
@@ -65,18 +39,6 @@ func.func @block_matmul_transpose_b(
// MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
// MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
// MM4D-COUNT-1: linalg.unpack
-// MM4D-LABEL: func @block_matmul_transpose_a
-// MM4D-COUNT-3: linalg.pack
-// MM4D: linalg.generic
-// MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// MM4D-COUNT-1: linalg.unpack
-// MM4D-LABEL: func @block_matmul_transpose_b
-// MM4D-COUNT-3: linalg.pack
-// MM4D: linalg.generic
-// MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// MM4D-COUNT-1: linalg.unpack
// MTM4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d5, d3)>
// MTM4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>
@@ -87,15 +49,3 @@ func.func @block_matmul_transpose_b(
// MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
// MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
// MTM4D-COUNT-1: linalg.unpack
-// MTM4D-LABEL: func @block_matmul_transpose_a
-// MTM4D-COUNT-3: linalg.pack
-// MTM4D: linalg.generic
-// MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// MTM4D-COUNT-1: linalg.unpack
-// MTM4D-LABEL: func @block_matmul_transpose_b
-// MTM4D-COUNT-3: linalg.pack
-// MTM4D: linalg.generic
-// MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// MTM4D-COUNT-1: linalg.unpack
diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
index aa860db..e16af1f 100644
--- a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
+++ b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
@@ -197,150 +197,6 @@ func.func @block_batch_matmul(
// -----
-func.func @block_matmul_transpose_a(
- %A: tensor<128x64xf32>, %B: tensor<128x64xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
- %0 = linalg.matmul_transpose_a ins(%A, %B : tensor<128x64xf32>, tensor<128x64xf32>)
- outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32>
- return %0 : tensor<64x64xf32>
-}
-
-// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
-
-// CHECK-LABEL: func @block_matmul_transpose_a(
-// CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<128x64xf32>, %[[B:[0-9a-z]+]]: tensor<128x64xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32>
-// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32>
-// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]]
-// CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 64]
-// CHECK-SAME: into %[[PACK_DST_0]] : tensor<128x64xf32> -> tensor<2x2x32x64xf32>
-// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32>
-// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]]
-// CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64]
-// CHECK-SAME: into %[[PACK_DST_1]] : tensor<128x64xf32> -> tensor<4x2x16x64xf32>
-// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32>
-// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]]
-// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16]
-// CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32>
-// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
-// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>)
-// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]]
-// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16]
-// CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32>
-// CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32>
-
-// -----
-
-func.func @block_batch_matmul_transpose_a(
- %A: tensor<512x128x64xf32>, %B: tensor<512x128x64xf32>, %C: tensor<512x64x64xf32>) -> tensor<512x64x64xf32> {
- %0 = linalg.batch_matmul_transpose_a ins(%A, %B : tensor<512x128x64xf32>, tensor<512x128x64xf32>)
- outs(%C : tensor<512x64x64xf32>) -> tensor<512x64x64xf32>
- return %0 : tensor<512x64x64xf32>
-}
-
-// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d4, d6)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d5, d6)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5)>
-
-// CHECK-LABEL: func @block_batch_matmul_transpose_a(
-// CHECK-SAME: %[[A:.+]]: tensor<512x128x64xf32>, %[[B:.+]]: tensor<512x128x64xf32>, %[[C:.+]]: tensor<512x64x64xf32>
-// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32>
-// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]]
-// CHECK-SAME: outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [32, 64]
-// CHECK-SAME: into %[[PACK_DST_0]] : tensor<512x128x64xf32> -> tensor<512x2x2x32x64xf32>
-// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32>
-// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]]
-// CHECK-SAME: outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 64]
-// CHECK-SAME: into %[[PACK_DST_1]] : tensor<512x128x64xf32> -> tensor<512x4x2x16x64xf32>
-// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32>
-// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]]
-// CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16]
-// CHECK-SAME: into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32>
-// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
-// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>)
-// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]]
-// CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16]
-// CHECK-SAME: into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32>
-// CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32>
-
-// -----
-
-func.func @block_matmul_transpose_b(
- %A: tensor<64x128xf32>, %B: tensor<64x128xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
- %0 = linalg.matmul_transpose_b ins(%A, %B : tensor<64x128xf32>, tensor<64x128xf32>)
- outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32>
- return %0 : tensor<64x64xf32>
-}
-
-// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
-
-// CHECK-LABEL: func @block_matmul_transpose_b(
-// CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<64x128xf32>, %[[B:[0-9a-z]+]]: tensor<64x128xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32>
-// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32>
-// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]]
-// CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64]
-// CHECK-SAME: into %[[PACK_DST_0]] : tensor<64x128xf32> -> tensor<2x2x32x64xf32>
-// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32>
-// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]]
-// CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 64]
-// CHECK-SAME: into %[[PACK_DST_1]] : tensor<64x128xf32> -> tensor<4x2x16x64xf32>
-// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32>
-// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]]
-// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16]
-// CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32>
-// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
-// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>)
-// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]]
-// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16]
-// CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32>
-// CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32>
-
-// -----
-
-func.func @block_batch_matmul_transpose_b(
- %A: tensor<512x64x128xf32>, %B: tensor<512x64x128xf32>, %C: tensor<512x64x64xf32>) -> tensor<512x64x64xf32> {
- %0 = linalg.batch_matmul_transpose_b ins(%A, %B : tensor<512x64x128xf32>, tensor<512x64x128xf32>)
- outs(%C : tensor<512x64x64xf32>) -> tensor<512x64x64xf32>
- return %0 : tensor<512x64x64xf32>
-}
-
-// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d4, d6)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d5, d6)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5)>
-
-// CHECK-LABEL: func @block_batch_matmul_transpose_b(
-// CHECK-SAME: %[[A:.+]]: tensor<512x64x128xf32>, %[[B:.+]]: tensor<512x64x128xf32>, %[[C:.+]]: tensor<512x64x64xf32>
-// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32>
-// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]]
-// CHECK-SAME: outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [32, 64]
-// CHECK-SAME: into %[[PACK_DST_0]] : tensor<512x64x128xf32> -> tensor<512x2x2x32x64xf32>
-// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32>
-// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]]
-// CHECK-SAME: outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 64]
-// CHECK-SAME: into %[[PACK_DST_1]] : tensor<512x64x128xf32> -> tensor<512x4x2x16x64xf32>
-// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32>
-// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]]
-// CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16]
-// CHECK-SAME: into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32>
-// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
-// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
-// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>)
-// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]]
-// CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16]
-// CHECK-SAME: into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32>
-// CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32>
-
-// -----
-
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
diff --git a/mlir/test/Dialect/Linalg/elementwise/named-to-elementwise.mlir b/mlir/test/Dialect/Linalg/elementwise/named-to-elementwise.mlir
new file mode 100644
index 0000000..2332b28
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/elementwise/named-to-elementwise.mlir
@@ -0,0 +1,56 @@
+// RUN: mlir-opt %s -linalg-morph-ops=named-to-category -split-input-file | FileCheck %s
+
+// CHECK: @exp(%[[A:.+]]: tensor<16x8xf32>, %[[B:.+]]: tensor<16x8xf32>) -> tensor<16x8xf32> {
+// CHECK: {{.*}} = linalg.elementwise
+// CHECK-SAME: kind=#linalg.elementwise_kind<exp>
+// CHECK-SAME: ins(%[[A]] : tensor<16x8xf32>)
+// CHECK-SAME: outs(%[[B]] : tensor<16x8xf32>) -> tensor<16x8xf32>
+//
+func.func @exp(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> {
+ %exp = linalg.exp ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ return %exp : tensor<16x8xf32>
+}
+
+// ----
+
+// CHECK: @add(%[[A:.+]]: tensor<16x8xf32>, %[[B:.+]]: tensor<16x8xf32>, %[[C:.+]]: tensor<16x8xf32>) -> tensor<16x8xf32> {
+// CHECK: {{.*}} = linalg.elementwise
+// CHECK-SAME: kind=#linalg.elementwise_kind<add>
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xf32>, tensor<16x8xf32>)
+// CHECK-SAME: outs(%[[C]] : tensor<16x8xf32>) -> tensor<16x8xf32>
+//
+func.func @add(%A : tensor<16x8xf32>, %B: tensor<16x8xf32>, %C : tensor<16x8xf32>) -> tensor<16x8xf32> {
+ %add = linalg.add ins(%A, %B : tensor<16x8xf32>, tensor<16x8xf32>) outs(%C : tensor<16x8xf32>) -> tensor<16x8xf32>
+ return %add : tensor<16x8xf32>
+}
+
+// ----
+
+// CHECK: @sub(%[[A:.+]]: tensor<16x8xf32>, %[[B:.+]]: tensor<16x8xf32>, %[[C:.+]]: tensor<16x8xf32>) -> tensor<16x8xf32> {
+// CHECK: {{.*}} = linalg.elementwise
+// CHECK-SAME: kind=#linalg.elementwise_kind<sub>
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xf32>, tensor<16x8xf32>)
+// CHECK-SAME: outs(%[[C]] : tensor<16x8xf32>)
+//
+func.func @sub(%A : tensor<16x8xf32>, %B: tensor<16x8xf32>, %C : tensor<16x8xf32>) -> tensor<16x8xf32> {
+ %sub = linalg.sub ins(%A, %B : tensor<16x8xf32>, tensor<16x8xf32>) outs(%C : tensor<16x8xf32>) -> tensor<16x8xf32>
+ return %sub : tensor<16x8xf32>
+}
+
+// ----
+
+// CHECK: @ternary_select(%[[A:.+]]: tensor<4x8x16xi1>, %[[B:.+]]: tensor<4x8x16xf32>, %[[C:.+]]: tensor<4x8x16xf32>)
+// CHECK: %[[E:.+]] = tensor.empty() : tensor<4x8x16xf32>
+// CHECK: {{.*}} = linalg.elementwise
+// CHECK-SAME: kind=#linalg.elementwise_kind<select>
+// CHECK-SAME: ins(%[[A]], %[[B]], %[[C]] : tensor<4x8x16xi1>, tensor<4x8x16xf32>, tensor<4x8x16xf32>)
+// CHECK-SAME: outs(%[[E]] : tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+//
+func.func @ternary_select(%A: tensor<4x8x16xi1>, %B: tensor<4x8x16xf32>, %C: tensor<4x8x16xf32>)
+ -> tensor<4x8x16xf32> {
+ %empty = tensor.empty() : tensor<4x8x16xf32>
+ %select = linalg.select
+ ins(%A, %B, %C : tensor<4x8x16xi1>, tensor<4x8x16xf32>, tensor<4x8x16xf32>)
+ outs(%empty: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %select : tensor<4x8x16xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/fold-add-into-dest.mlir b/mlir/test/Dialect/Linalg/fold-add-into-dest.mlir
index d8e92e4..e90247d 100644
--- a/mlir/test/Dialect/Linalg/fold-add-into-dest.mlir
+++ b/mlir/test/Dialect/Linalg/fold-add-into-dest.mlir
@@ -158,36 +158,6 @@ module attributes {transform.with_named_sequence} {
// -----
!type = tensor<2048x2048xf32>
-func.func @fold_add_on_transposed_matmuls(%arg0: !type, %arg1: !type) -> !type {
- %0 = arith.constant dense<1.111111e+00> : !type
- %cst = arith.constant 0.000000e+00 : f32
- %1 = tensor.empty() : !type
- %2 = linalg.fill ins(%cst : f32) outs(%1 : !type) -> !type
- %3 = linalg.matmul_transpose_a ins(%arg0, %0 : !type, !type) outs(%2 : !type) -> !type
- %4 = linalg.matmul_transpose_b ins(%arg1, %0 : !type, !type) outs(%2 : !type) -> !type
- %5 = linalg.add ins(%3, %4 : !type, !type) outs(%1 : !type) -> !type
- return %5 : !type
-}
-
-// CHECK-LABEL: func.func @fold_add_on_transposed_matmuls
-// CHECK: %[[ACC:.+]] = linalg.matmul_transpose_a
-// CHECK-NEXT: %[[RES:.+]] = linalg.matmul_transpose_b ins({{.+}}) outs(%[[ACC]]
-// CHECK-NOT: linalg.add
-// CHECK-NEXT: return %[[RES]]
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.linalg.fold_add_into_dest
- } : !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-!type = tensor<2048x2048xf32>
func.func @expect_no_fold_of_add_as_dominated_op_is_not_a_contraction(%arg0: !type, %arg1: !type) -> !type {
%0 = arith.constant dense<1.111111e+00> : !type
%cst = arith.constant 0.000000e+00 : f32
diff --git a/mlir/test/Dialect/Linalg/linalg-morph-category-ops.mlir b/mlir/test/Dialect/Linalg/linalg-morph-category-ops.mlir
new file mode 100644
index 0000000..00602c4
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/linalg-morph-category-ops.mlir
@@ -0,0 +1,15 @@
+// Forward path `named -> category -> generic`
+// RUN: mlir-opt %s -linalg-morph-ops=named-to-category | FileCheck %s --check-prefix=NAMED_TO_CATEGORY
+
+// RUN: mlir-opt %s -linalg-morph-ops=named-to-category | \
+// RUN:   mlir-opt -linalg-morph-ops=category-to-generic | FileCheck %s --check-prefix=CATEGORY_TO_GENERIC
+
+func.func @exp(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> {
+ %exp = linalg.exp ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ return %exp : tensor<16x8xf32>
+}
+// NAMED_TO_CATEGORY: linalg.elementwise
+// NAMED_TO_CATEGORY-NOT: linalg.exp
+
+// CATEGORY_TO_GENERIC: linalg.generic
+// CATEGORY_TO_GENERIC-NOT: linalg.elementwise
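+
+// As a reference for readers, a hedged sketch (assumed form, not verified tool
+// output) of the intermediate category op that `named-to-category` is expected
+// to produce for @exp, before `category-to-generic` rewrites it further:
+//
+//   %0 = linalg.elementwise kind=#linalg.elementwise_kind<exp>
+//          ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>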
diff --git a/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir b/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir
new file mode 100644
index 0000000..bdd29b9
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir
@@ -0,0 +1,14 @@
+// RUN: mlir-opt %s -linalg-morph-ops=named-to-generic | FileCheck %s --check-prefix=NAMED_TO_GENERIC
+// RUN: mlir-opt %s -linalg-morph-ops=named-to-generic | mlir-opt -linalg-morph-ops=generic-to-named | \
+// RUN: FileCheck %s --check-prefix=ROUND_TRIP
+
+func.func @exp(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> {
+ %exp = linalg.exp ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ return %exp : tensor<16x8xf32>
+}
+
+// NAMED_TO_GENERIC: linalg.generic
+// NAMED_TO_GENERIC-NOT: linalg.exp
+
+// ROUND_TRIP: linalg.exp
+// ROUND_TRIP-NOT: linalg.generic
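+
+// As a reference for readers, a hedged sketch (assumed form, not verified tool
+// output) of the generic op that `named-to-generic` is expected to produce for
+// @exp, which `generic-to-named` then raises back to `linalg.exp`:
+//
+//   #map = affine_map<(d0, d1) -> (d0, d1)>
+//   %0 = linalg.generic {indexing_maps = [#map, #map],
+//                        iterator_types = ["parallel", "parallel"]}
+//        ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) {
+//   ^bb0(%in: f32, %out: f32):
+//     %1 = math.exp %in : f32
+//     linalg.yield %1 : f32
+//   } -> tensor<16x8xf32>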
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 412f40d..a93e979 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -1222,17 +1222,6 @@ func.func @batch_reduce_matmul(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32
// -----
-// CHECK-LABEL: func @matmul_transpose_a
-// CHECK: linalg.matmul_transpose_a
-// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5x3xf32>, memref<5x7xf32>)
-// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>)
-func.func @matmul_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) {
- linalg.matmul_transpose_a ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>)
- return
-}
-
-// -----
-
// CHECK-LABEL: func @matmul_transpose_a_explicit
// CHECK: linalg.matmul
// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5x3xf32>, memref<5x7xf32>)
@@ -1478,17 +1467,6 @@ func.func @matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5xf3
// -----
-// CHECK-LABEL: func @matmul_transpose_b
-// CHECK: linalg.matmul_transpose_b
-// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<7x5xf32>)
-// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>)
-func.func @matmul_transpose_b(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) {
- linalg.matmul_transpose_b ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) outs(%arg2: memref<3x7xf32>)
- return
-}
-
-// -----
-
// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)>
// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
@@ -1806,28 +1784,6 @@ func.func @bcast_A_transpose_B(%A: memref<3x5xf32>, %B: memref<2x7x5xf32>, %C: m
// -----
-// CHECK-LABEL: func @batchmatmul_transpose_a
-// CHECK: linalg.batch_matmul_transpose_a
-// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<2x5x3xf32>, memref<2x5x7xf32>)
-// CHECK-SAME: outs(%{{.+}} : memref<2x3x7xf32>)
-func.func @batchmatmul_transpose_a(%arg0: memref<2x5x3xf32>, %arg1: memref<2x5x7xf32>, %arg2: memref<2x3x7xf32>) {
- linalg.batch_matmul_transpose_a ins(%arg0, %arg1 : memref<2x5x3xf32>, memref<2x5x7xf32>) outs(%arg2: memref<2x3x7xf32>)
- return
-}
-
-// -----
-
-// CHECK-LABEL: func @batchmatmul_transpose_b
-// CHECK: linalg.batch_matmul_transpose_b
-// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<2x3x5xf32>, memref<2x7x5xf32>)
-// CHECK-SAME: outs(%{{.+}} : memref<2x3x7xf32>)
-func.func @batchmatmul_transpose_b(%arg0: memref<2x3x5xf32>, %arg1: memref<2x7x5xf32>, %arg2: memref<2x3x7xf32>) {
- linalg.batch_matmul_transpose_b ins(%arg0, %arg1 : memref<2x3x5xf32>, memref<2x7x5xf32>) outs(%arg2: memref<2x3x7xf32>)
- return
-}
-
-// -----
-
// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
diff --git a/mlir/test/Dialect/Linalg/rank-reduce-contraction-ops.mlir b/mlir/test/Dialect/Linalg/rank-reduce-contraction-ops.mlir
index 43bddb0..704576d 100644
--- a/mlir/test/Dialect/Linalg/rank-reduce-contraction-ops.mlir
+++ b/mlir/test/Dialect/Linalg/rank-reduce-contraction-ops.mlir
@@ -92,38 +92,6 @@ func.func @singleton_batch_vecmat(%arg0 : tensor<1x?xf32>, %arg1 : tensor<1x?x?x
// -----
-func.func @singleton_batchmatmul_transpose_a(%arg0: memref<1x5x3xf32>, %arg1: memref<1x5x7xf32>, %arg2: memref<1x3x7xf32>) {
- // CHECK-LABEL: @singleton_batchmatmul_transpose_a
- // CHECK-SAME: %[[LHS:[a-zA-Z0-9]+]]: memref<1x5x3xf32>
- // CHECK-SAME: %[[RHS:[a-zA-Z0-9]+]]: memref<1x5x7xf32>
- // CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: memref<1x3x7xf32>
- // CHECK-NEXT: %[[COLLAPSED_LHS:.*]] = memref.collapse_shape %[[LHS]] {{\[}}[0, 1], [2]]
- // CHECK-NEXT: %[[COLLAPSED_RHS:.*]] = memref.collapse_shape %[[RHS]] {{\[}}[0, 1], [2]]
- // CHECK-NEXT: %[[COLLAPSED_INIT:.*]] = memref.collapse_shape %[[INIT]] {{\[}}[0, 1], [2]]
- // CHECK-NEXT: linalg.matmul_transpose_a ins(%[[COLLAPSED_LHS]], %[[COLLAPSED_RHS]] : memref<5x3xf32>, memref<5x7xf32>) outs(%[[COLLAPSED_INIT]] : memref<3x7xf32>)
- // CHECK-NEXT: return
- linalg.batch_matmul_transpose_a ins(%arg0, %arg1 : memref<1x5x3xf32>, memref<1x5x7xf32>) outs(%arg2: memref<1x3x7xf32>)
- return
-}
-
-// -----
-
-func.func @singleton_batchmatmul_transpose_b(%arg0: memref<1x3x5xf32>, %arg1: memref<1x7x5xf32>, %arg2: memref<1x3x7xf32>) {
- // CHECK-LABEL: @singleton_batchmatmul_transpose_b
- // CHECK-SAME: %[[LHS:[a-zA-Z0-9]+]]: memref<1x3x5xf32>
- // CHECK-SAME: %[[RHS:[a-zA-Z0-9]+]]: memref<1x7x5xf32>
- // CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: memref<1x3x7xf32>
- // CHECK-NEXT: %[[COLLAPSED_LHS:.*]] = memref.collapse_shape %[[LHS]] {{\[}}[0, 1], [2]]
- // CHECK-NEXT: %[[COLLAPSED_RHS:.*]] = memref.collapse_shape %[[RHS]] {{\[}}[0, 1], [2]]
- // CHECK-NEXT: %[[COLLAPSED_INIT:.*]] = memref.collapse_shape %[[INIT]] {{\[}}[0, 1], [2]]
- // CHECK-NEXT: linalg.matmul_transpose_b ins(%[[COLLAPSED_LHS]], %[[COLLAPSED_RHS]] : memref<3x5xf32>, memref<7x5xf32>) outs(%[[COLLAPSED_INIT]] : memref<3x7xf32>)
- // CHECK-NEXT: return
- linalg.batch_matmul_transpose_b ins(%arg0, %arg1 : memref<1x3x5xf32>, memref<1x7x5xf32>) outs(%arg2: memref<1x3x7xf32>)
- return
-}
-
-// -----
-
func.func @matmul_to_matvec_tensor(%arg0: tensor<?x?xf32>, %arg1: tensor<?x1xf32>, %arg2: tensor<?x1xf32>) -> tensor<?x1xf32> {
// CHECK-LABEL: @matmul_to_matvec_tensor
// CHECK-SAME: %[[LHS:[a-zA-Z0-9]+]]: tensor<?x?xf32>
@@ -226,59 +194,6 @@ func.func @matvec_to_dot_tensor(%arg0: tensor<1x?xf32>, %arg1: tensor<?xf32>, %a
// -----
-func.func @matmul_transpose_a_to_vecmat(%arg0: tensor<256x1xf32>, %arg1: tensor<256x512xf32>, %arg2: tensor<1x512xf32>) -> tensor<1x512xf32> {
- // CHECK-LABEL: @matmul_transpose_a_to_vecmat
- // CHECK: collapse_shape {{.*}} into tensor<256xf32>
- // CHECK: collapse_shape {{.*}} into tensor<512xf32>
- // CHECK: linalg.vecmat
- // CHECK: expand_shape {{.*}} into tensor<1x512xf32>
- %0 = linalg.matmul_transpose_a ins(%arg0, %arg1: tensor<256x1xf32>, tensor<256x512xf32>) outs(%arg2: tensor<1x512xf32>) -> tensor<1x512xf32>
- return %0 : tensor<1x512xf32>
-}
-
-// -----
-
-func.func @batch_matmul_transpose_a_to_batch_vecmat(%arg0: tensor<64x256x1xf32>, %arg1: tensor<64x256x512xf32>, %arg2: tensor<64x1x512xf32>) -> tensor<64x1x512xf32> {
- // CHECK-LABEL: @batch_matmul_transpose_a_to_batch_vecmat
- // CHECK: collapse_shape {{.*}} into tensor<64x256xf32>
- // CHECK: collapse_shape {{.*}} into tensor<64x512xf32>
- // CHECK: linalg.batch_vecmat
- // CHECK: expand_shape {{.*}} into tensor<64x1x512xf32>
- %0 = linalg.batch_matmul_transpose_a ins(%arg0, %arg1: tensor<64x256x1xf32>, tensor<64x256x512xf32>) outs(%arg2: tensor<64x1x512xf32>) -> tensor<64x1x512xf32>
- return %0 : tensor<64x1x512xf32>
-}
-
-// -----
-
-func.func @matmul_transpose_b_to_matvec(%arg0: memref<?x?xf32>, %arg1: memref<1x?xf32>, %arg2: memref<?x1xf32>) {
- // CHECK-LABEL: @matmul_transpose_b_to_matvec
- // CHECK: linalg.matvec
- linalg.matmul_transpose_b ins(%arg0, %arg1: memref<?x?xf32>, memref<1x?xf32>) outs(%arg2: memref<?x1xf32>)
- return
-}
-
-// -----
-
-func.func @batchmatmul_transpose_b_to_batchmatvec_tensor(%arg0: tensor<64x128x256xf32>, %arg1: tensor<64x1x256xf32>, %arg2: tensor<64x128x1xf32>) -> tensor<64x128x1xf32> {
- // CHECK: collapse_shape {{.*}} into tensor<64x256xf32>
- // CHECK: collapse_shape {{.*}} into tensor<64x128xf32>
- // CHECK: linalg.batch_matvec
- // CHECK: expand_shape {{.*}} into tensor<64x128x1xf32>
- %0 = linalg.batch_matmul_transpose_b ins(%arg0, %arg1: tensor<64x128x256xf32>, tensor<64x1x256xf32>) outs(%arg2: tensor<64x128x1xf32>) -> tensor<64x128x1xf32>
- return %0 : tensor<64x128x1xf32>
-}
-
-// -----
-
-func.func @batchmatmul_transpose_b_to_to_dot(%arg0: tensor<1x1x?xf32>, %arg1: tensor<1x1x?xf32>, %arg2: tensor<1x1x1xf32>) -> tensor<1x1x1xf32> {
- // CHECK-LABEL: @batchmatmul_transpose_b_to_to_dot
- // CHECK: linalg.dot
- %0 = linalg.batch_matmul_transpose_b ins(%arg0, %arg1: tensor<1x1x?xf32>, tensor<1x1x?xf32>) outs(%arg2: tensor<1x1x1xf32>) -> tensor<1x1x1xf32>
- return %0 : tensor<1x1x1xf32>
-}
-
-// -----
-
func.func @nonsingleton_batch_matmul(%arg0 : tensor<2x?x?xf32>, %arg1 : tensor<2x?x?xf32>, %arg2: tensor<2x?x?xf32>) -> tensor<2x?x?xf32> {
// CHECK-LABEL: @nonsingleton_batch_matmul
// CHECK-NOT: collapse_shape
diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir
index 778d5bb..1b0bade 100644
--- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir
+++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir
@@ -504,7 +504,7 @@ func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
%c10 = transform.param.constant 10 : i64 -> !transform.param<i64>
%c20 = transform.param.constant 20 : i64 -> !transform.param<i64>
%sz = transform.merge_handles %c10, %c20 : !transform.param<i64>
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
index f741876..9a3dcf0 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
@@ -14,11 +14,11 @@ module attributes {transform.with_named_sequence} {
: (!transform.any_op) -> !transform.any_op
// Tile to 5 then pad to 8
- %fill_l1, %loops_l1 = transform.structured.tile_using_for %fill tile_sizes [5]
+ %fill_l1, %loops_l1 = transform.structured.tile_using_for %fill tile_sizes [5]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%fill_padded, %_ = transform.structured.pad_tiling_interface %fill_l1 to padding_sizes [8] {
- padding_values=[0.0 : f32, 0.0 : f32]
+ padding_values= [#ub.poison, 0.0 : f32]
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.yield
@@ -33,9 +33,9 @@ func.func @pad_lhs(
-> tensor<24x25xf32>
{
// CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>)
- // CHECK: tensor.pad %{{.*}}
+ // CHECK: tensor.pad %{{.*}}
// CHECK: : tensor<?x12xf32> to tensor<8x12xf32>
- // CHECK: tensor.pad %{{.*}}
+ // CHECK: tensor.pad %{{.*}}
// CHECK: : tensor<?x25xf32> to tensor<8x25xf32>
// CHECK: linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<8x12xf32>, tensor<12x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
// CHECK: tensor.extract_slice %{{.*}}[0, 0] [%{{.*}}, 25] [1, 1]
@@ -92,7 +92,7 @@ module {
%padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [8, 0, 14] {
padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.yield
+ transform.yield
}
}
}
@@ -147,7 +147,7 @@ module {
%padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [8, 0, 14] {
padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.yield
+ transform.yield
}
}
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-pad.mlir
index f91eb9c..51bf4a2 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad.mlir
@@ -465,14 +465,14 @@ module attributes {transform.with_named_sequence} {
// CHECK: %[[RHS:.*]] = tensor.pad
// CHECK: scf.for
// CHECK-DAG: tensor.extract_slice %[[LHS]][0, %{{.*}}] [%{{.*}}, 32]
-// CHECK-DAG: tensor.extract_slice %[[RHS]][0, %{{.*}}] [%{{.*}}, 32]
+// CHECK-DAG: tensor.extract_slice %[[RHS]][%{{.*}}, 0] [32, %{{.*}}]
func.func @dyn_pad_tiling(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
- %0 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %0 = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op
%padded, %pad, %copy = transform.structured.pad %0 pad_to_multiple_of [32] use_prescribed_tensor_shapes {padding_dimensions = [2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
%tiled_linalg_op, %loops = transform.structured.tile_using_for %padded tile_sizes [0, 0, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%1 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
diff --git a/mlir/test/Dialect/Linalg/transform-op-specialize-matmul.mlir b/mlir/test/Dialect/Linalg/transform-op-specialize-matmul.mlir
index f64953b..bd4c655 100644
--- a/mlir/test/Dialect/Linalg/transform-op-specialize-matmul.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-specialize-matmul.mlir
@@ -30,66 +30,6 @@ module attributes {transform.with_named_sequence} {
// -----
-#map = affine_map<(d0, d1, d2) -> (d2, d0)>
-#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @matmul_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) {
- linalg.generic
- {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
- ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) outs(%arg2 : memref<3x7xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %0 = arith.mulf %in, %in_0 : f32
- %1 = arith.addf %out, %0 : f32
- linalg.yield %1 : f32
- }
- return
-}
-
-// CHECK-LABEL: @matmul_transpose_a
-// CHECK-SAME: %[[ARG0:.+]]: memref<5x3xf32>, %[[ARG1:.+]]: memref<5x7xf32>, %[[ARG2:.+]]: memref<3x7xf32>) {
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul_transpose_a ins(%[[ARG0]], %[[ARG1]] : memref<5x3xf32>, memref<5x7xf32>) outs(%[[ARG2]] : memref<3x7xf32>)
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match interface{LinalgOp} in %arg0 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.specialize %0 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-#map = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @matmul_transpose_b(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
- %0 = linalg.generic
- {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
- ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %1 = arith.mulf %in, %in_0 : f32
- %2 = arith.addf %out, %1 : f32
- linalg.yield %2 : f32
- } -> tensor<?x?xf32>
- return %0 : tensor<?x?xf32>
-}
-
-// CHECK-LABEL: @matmul_transpose_b
-// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?xf32>, %[[ARG1:.+]]: tensor<?x?xf32>, %[[ARG2:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul_transpose_b ins(%[[ARG0]], %[[ARG1]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[ARG2]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match interface{LinalgOp} in %arg0 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.specialize %0 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
@@ -117,32 +57,3 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-
-// -----
-#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @batch_matmul_transpose_b(%arg0: tensor<?x?x?xf32>, %arg1: tensor<?x?x?xf32>, %arg2: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
- %0 = linalg.generic
- {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
- ins(%arg0, %arg1 : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%arg2 : tensor<?x?x?xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %1 = arith.mulf %in, %in_0 : f32
- %2 = arith.addf %out, %1 : f32
- linalg.yield %2 : f32
- } -> tensor<?x?x?xf32>
- return %0 : tensor<?x?x?xf32>
-}
-
-// CHECK-LABEL: @batch_matmul_transpose_b
-// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?x?xf32>, %[[ARG1:.+]]: tensor<?x?x?xf32>, %[[ARG2:.+]]: tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul_transpose_b ins(%[[ARG0]], %[[ARG1]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[ARG2]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match interface{LinalgOp} in %arg0 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.specialize %0 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
diff --git a/mlir/test/Dialect/Linalg/transpose-matmul.mlir b/mlir/test/Dialect/Linalg/transpose-matmul.mlir
index d2b7e9f..4ee87fb 100644
--- a/mlir/test/Dialect/Linalg/transpose-matmul.mlir
+++ b/mlir/test/Dialect/Linalg/transpose-matmul.mlir
@@ -1,6 +1,20 @@
// RUN: mlir-opt -transform-preload-library='transform-library-paths=%p/transpose-matmul-a.mlir' -transform-interpreter -split-input-file %s | FileCheck %s --check-prefixes=CHECK,TRANSPOSE-A
// RUN: mlir-opt -transform-preload-library='transform-library-paths=%p/transpose-matmul-b.mlir' -transform-interpreter -split-input-file %s | FileCheck %s --check-prefixes=CHECK,TRANSPOSE-B
+// TRANSPOSE-A-DAG: #[[$MA:.*]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// TRANSPOSE-A-DAG: #[[$MB:.*]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// TRANSPOSE-A-DAG: #[[$MC:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// TRANSPOSE-A-DAG: #[[$BMA:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
+// TRANSPOSE-A-DAG: #[[$BMB:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// TRANSPOSE-A-DAG: #[[$BMC:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
+// TRANSPOSE-B-DAG: #[[$MA:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// TRANSPOSE-B-DAG: #[[$MB:.*]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// TRANSPOSE-B-DAG: #[[$MC:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// TRANSPOSE-B-DAG: #[[$BMA:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// TRANSPOSE-B-DAG: #[[$BMB:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// TRANSPOSE-B-DAG: #[[$BMC:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
// CHECK-LABEL: func.func @matmul_static(
// CHECK-SAME: %[[A:.*]]: tensor<16x8xf32>,
// CHECK-SAME: %[[B:.*]]: tensor<8x16xf32>) -> tensor<16x16xf32> {
@@ -9,10 +23,10 @@
// CHECK: %[[C_ZERO:.*]] = linalg.fill ins(%[[C0_F32]] : f32) outs(%[[C_INIT]] : tensor<16x16xf32>) -> tensor<16x16xf32>
// TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty() : tensor<8x16xf32>
// TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<16x8xf32>) outs(%[[A_TRANSP_INIT]] : tensor<8x16xf32>) permutation = [1, 0]
-// TRANSPOSE-A: %[[C:.*]] = linalg.matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<8x16xf32>, tensor<8x16xf32>) outs(%[[C_ZERO]] : tensor<16x16xf32>) -> tensor<16x16xf32>
+// TRANSPOSE-A: %[[C:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A_TRANSP]], %[[B]] : tensor<8x16xf32>, tensor<8x16xf32>) outs(%[[C_ZERO]] : tensor<16x16xf32>) -> tensor<16x16xf32>
// TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty() : tensor<16x8xf32>
// TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<8x16xf32>) outs(%[[B_TRANSP_INIT]] : tensor<16x8xf32>) permutation = [1, 0]
-// TRANSPOSE-B: %[[C:.*]] = linalg.matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<16x8xf32>, tensor<16x8xf32>) outs(%[[C_ZERO]] : tensor<16x16xf32>) -> tensor<16x16xf32>
+// TRANSPOSE-B: %[[C:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<16x8xf32>, tensor<16x8xf32>) outs(%[[C_ZERO]] : tensor<16x16xf32>) -> tensor<16x16xf32>
// CHECK: return %[[C]] : tensor<16x16xf32>
// CHECK: }
func.func @matmul_static(%A: tensor<16x8xf32>, %B: tensor<8x16xf32>) -> (tensor<16x16xf32>) {
@@ -38,11 +52,11 @@ func.func @matmul_static(%A: tensor<16x8xf32>, %B: tensor<8x16xf32>) -> (tensor<
// TRANSPOSE-A: %[[A_DIM1:.*]] = tensor.dim %[[A]], %[[C1]] : tensor<?x?xf32>
// TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty(%[[A_DIM1]], %[[A_DIM0]]) : tensor<?x?xf32>
// TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<?x?xf32>) outs(%[[A_TRANSP_INIT]] : tensor<?x?xf32>) permutation = [1, 0]
-// TRANSPOSE-A: %[[C:.*]] = linalg.matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// TRANSPOSE-A: %[[C:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A_TRANSP]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?xf32>) -> tensor<?x?xf32>
// TRANSPOSE-B: %[[B_DIM0:.*]] = tensor.dim %[[B]], %[[C0]] : tensor<?x?xf32>
// TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty(%[[B_DIM1]], %[[B_DIM0]]) : tensor<?x?xf32>
// TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<?x?xf32>) outs(%[[B_TRANSP_INIT]] : tensor<?x?xf32>) permutation = [1, 0]
-// TRANSPOSE-B: %[[C:.*]] = linalg.matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// TRANSPOSE-B: %[[C:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?xf32>) -> tensor<?x?xf32>
// CHECK: return %[[C]] : tensor<?x?xf32>
// CHECK: }
func.func @matmul_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>) -> (tensor<?x?xf32>) {
@@ -69,10 +83,10 @@ func.func @matmul_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>) -> (tensor<?
// CHECK: %[[C_ZERO:.*]] = linalg.fill ins(%[[C0_F32]] : f32) outs(%[[C_INIT]] : tensor<?x16xf32>) -> tensor<?x16xf32>
// TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty(%[[A_DIM0]]) : tensor<8x?xf32>
// TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<?x8xf32>) outs(%[[A_TRANSP_INIT]] : tensor<8x?xf32>) permutation = [1, 0]
-// TRANSPOSE-A: %[[B0:.*]] = linalg.matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<8x?xf32>, tensor<8x16xf32>) outs(%[[C_ZERO]] : tensor<?x16xf32>) -> tensor<?x16xf32>
+// TRANSPOSE-A: %[[B0:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A_TRANSP]], %[[B]] : tensor<8x?xf32>, tensor<8x16xf32>) outs(%[[C_ZERO]] : tensor<?x16xf32>) -> tensor<?x16xf32>
// TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty() : tensor<16x8xf32>
// TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<8x16xf32>) outs(%[[B_TRANSP_INIT]] : tensor<16x8xf32>) permutation = [1, 0]
-// TRANSPOSE-B: %[[B0:.*]] = linalg.matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<?x8xf32>, tensor<16x8xf32>) outs(%[[C_ZERO]] : tensor<?x16xf32>) -> tensor<?x16xf32>
+// TRANSPOSE-B: %[[B0:.*]] = linalg.matmul indexing_maps = [#[[$MA]], #[[$MB]], #[[$MC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<?x8xf32>, tensor<16x8xf32>) outs(%[[C_ZERO]] : tensor<?x16xf32>) -> tensor<?x16xf32>
// CHECK: return %[[B0]] : tensor<?x16xf32>
// CHECK: }
func.func @matmul_mixed(%A: tensor<?x8xf32>, %B: tensor<8x16xf32>) -> (tensor<?x16xf32>) {
@@ -96,10 +110,10 @@ func.func @matmul_mixed(%A: tensor<?x8xf32>, %B: tensor<8x16xf32>) -> (tensor<?x
// CHECK: %[[C_ZERO:.*]] = linalg.fill ins(%[[C0_F32]] : f32) outs(%[[C_INIT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
// TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty() : tensor<2x8x16xf32>
// TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<2x16x8xf32>) outs(%[[A_TRANSP_INIT]] : tensor<2x8x16xf32>) permutation = [0, 2, 1]
-// TRANSPOSE-A: %[[C:.*]] = linalg.batch_matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<2x8x16xf32>, tensor<2x8x16xf32>) outs(%[[C_ZERO]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+// TRANSPOSE-A: %[[C:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] ins(%[[A_TRANSP]], %[[B]] : tensor<2x8x16xf32>, tensor<2x8x16xf32>) outs(%[[C_ZERO]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
// TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty() : tensor<2x16x8xf32>
// TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<2x8x16xf32>) outs(%[[B_TRANSP_INIT]] : tensor<2x16x8xf32>) permutation = [0, 2, 1]
-// TRANSPOSE-B: %[[C:.*]] = linalg.batch_matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<2x16x8xf32>, tensor<2x16x8xf32>) outs(%[[C_ZERO]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+// TRANSPOSE-B: %[[C:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<2x16x8xf32>, tensor<2x16x8xf32>) outs(%[[C_ZERO]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
// CHECK: return %[[C]] : tensor<2x16x16xf32>
// CHECK: }
func.func @batch_matmul_static(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>) -> (tensor<2x16x16xf32>) {
@@ -127,12 +141,12 @@ func.func @batch_matmul_static(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>) -
// TRANSPOSE-A: %[[A_DIM2:.*]] = tensor.dim %[[A]], %[[C2]] : tensor<?x?x?xf32>
// TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty(%[[A_DIM0]], %[[A_DIM2]], %[[A_DIM1]]) : tensor<?x?x?xf32>
// TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<?x?x?xf32>) outs(%[[A_TRANSP_INIT]] : tensor<?x?x?xf32>) permutation = [0, 2, 1]
-// TRANSPOSE-A: %[[C:.*]] = linalg.batch_matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// TRANSPOSE-A: %[[C:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] ins(%[[A_TRANSP]], %[[B]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
// TRANSPOSE-B: %[[B_DIM0:.*]] = tensor.dim %[[B]], %[[C0]] : tensor<?x?x?xf32>
// TRANSPOSE-B: %[[B_DIM1:.*]] = tensor.dim %[[B]], %[[C1]] : tensor<?x?x?xf32>
// TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty(%[[B_DIM0]], %[[B_DIM2]], %[[B_DIM1]]) : tensor<?x?x?xf32>
// TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<?x?x?xf32>) outs(%[[B_TRANSP_INIT]] : tensor<?x?x?xf32>) permutation = [0, 2, 1]
-// TRANSPOSE-B: %[[C:.*]] = linalg.batch_matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// TRANSPOSE-B: %[[C:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[C_ZERO]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
// CHECK: return %[[C]] : tensor<?x?x?xf32>
// CHECK: }
func.func @batch_matmul_dynamic(%A: tensor<?x?x?xf32>, %B: tensor<?x?x?xf32>) -> (tensor<?x?x?xf32>) {
@@ -161,10 +175,10 @@ func.func @batch_matmul_dynamic(%A: tensor<?x?x?xf32>, %B: tensor<?x?x?xf32>) ->
// CHECK: %[[C_ZERO:.*]] = linalg.fill ins(%[[C0_F32]] : f32) outs(%[[C_INIT]] : tensor<2x?x16xf32>) -> tensor<2x?x16xf32>
// TRANSPOSE-A: %[[A_TRANSP_INIT:.*]] = tensor.empty(%[[A_DIM1]]) : tensor<2x8x?xf32>
// TRANSPOSE-A: %[[A_TRANSP:.*]] = linalg.transpose ins(%[[A]] : tensor<2x?x8xf32>) outs(%[[A_TRANSP_INIT]] : tensor<2x8x?xf32>) permutation = [0, 2, 1]
-// TRANSPOSE-A: %[[B0:.*]] = linalg.batch_matmul_transpose_a ins(%[[A_TRANSP]], %[[B]] : tensor<2x8x?xf32>, tensor<2x8x16xf32>) outs(%[[C_ZERO]] : tensor<2x?x16xf32>) -> tensor<2x?x16xf32>
+// TRANSPOSE-A: %[[B0:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] ins(%[[A_TRANSP]], %[[B]] : tensor<2x8x?xf32>, tensor<2x8x16xf32>) outs(%[[C_ZERO]] : tensor<2x?x16xf32>) -> tensor<2x?x16xf32>
// TRANSPOSE-B: %[[B_TRANSP_INIT:.*]] = tensor.empty() : tensor<2x16x8xf32>
// TRANSPOSE-B: %[[B_TRANSP:.*]] = linalg.transpose ins(%[[B]] : tensor<2x8x16xf32>) outs(%[[B_TRANSP_INIT]] : tensor<2x16x8xf32>) permutation = [0, 2, 1]
-// TRANSPOSE-B: %[[B0:.*]] = linalg.batch_matmul_transpose_b ins(%[[A]], %[[B_TRANSP]] : tensor<2x?x8xf32>, tensor<2x16x8xf32>) outs(%[[C_ZERO]] : tensor<2x?x16xf32>) -> tensor<2x?x16xf32>
+// TRANSPOSE-B: %[[B0:.*]] = linalg.batch_matmul indexing_maps = [#[[$BMA]], #[[$BMB]], #[[$BMC]]] ins(%[[A]], %[[B_TRANSP]] : tensor<2x?x8xf32>, tensor<2x16x8xf32>) outs(%[[C_ZERO]] : tensor<2x?x16xf32>) -> tensor<2x?x16xf32>
// CHECK: return %[[B0]] : tensor<2x?x16xf32>
// CHECK: }
func.func @batch_matmul_mixed(%A: tensor<2x?x8xf32>, %B: tensor<2x8x16xf32>) -> (tensor<2x?x16xf32>) {
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
index 4eeae4c..25cbceb 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
@@ -61,6 +61,83 @@ module attributes {transform.with_named_sequence} {
// -----
+// CHECK-LABEL: @float_mixed_precision_matmul
+// CHECK-COUNT-3: vector.transfer_read
+// CHECK-NOT: arith.extf
+// CHECK: vector.contract {{.*}} : vector<1584x1584xbf16>, vector<1584x1584xbf16> into vector<1584x1584xf32>
+func.func @float_mixed_precision_matmul(%A: memref<1584x1584xbf16>, %B: memref<1584x1584xbf16>, %C: memref<1584x1584xf32>) {
+ linalg.matmul ins(%A, %B: memref<1584x1584xbf16>, memref<1584x1584xbf16>)
+ outs(%C: memref<1584x1584xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 { fold_type_extensions_into_contract } : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @vectorization_test_2
+func.func @vectorization_test_2(%A: memref<8x16xf32>, %B: memref<16x32xf32>,
+ %C: memref<8x32xf32>) {
+ // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<8x32x16xf32>
+ // CHECK: vector.multi_reduction <add>, %{{.*}}, {{.*}} [2] : vector<8x32x16xf32> to vector<8x32xf32>
+ linalg.matmul
+ ins(%A, %B: memref<8x16xf32>, memref<16x32xf32>)
+ outs(%C: memref<8x32xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 { disable_multi_reduction_to_contract_patterns } : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @matmul_tensors
+// CHECK-SAME: (%[[ARG0:.*]]: tensor<8x4xf32>, %[[ARG1:.*]]: tensor<4x12xf32>,
+// CHECK-SAME: %[[ARG2:.*]]: tensor<8x12xf32>) -> tensor<8x12xf32>
+func.func @matmul_tensors(
+ %arg0: tensor<8x4xf32>, %arg1: tensor<4x12xf32>, %arg2: tensor<8x12xf32>)
+ -> tensor<8x12xf32> {
+ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK-DAG: %[[V0:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], {{.*}} : tensor<8x4xf32>, vector<8x12x4xf32>
+ // CHECK-DAG: %[[V1:.*]] = vector.transfer_read %[[ARG1]][%[[C0]], %[[C0]]], {{.*}} : tensor<4x12xf32>, vector<8x12x4xf32>
+ // CHECK-DAG: %[[V2:.*]] = vector.transfer_read %[[ARG2]][%[[C0]], %[[C0]]], {{.*}} : tensor<8x12xf32>, vector<8x12xf32>
+ //
+  // linalg.matmul gets expanded to a 3D reduction; canonicalization later
+  // converts it to a 2D contract.
+ // CHECK: %[[MUL:.*]] = arith.mulf %[[V0]], %[[V1]] : vector<8x12x4xf32>
+ // CHECK: %[[R:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[V2]] [2] : vector<8x12x4xf32> to vector<8x12xf32>
+ // CHECK: %[[W:.*]] = vector.transfer_write %[[R]], %[[ARG2]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x12xf32>, tensor<8x12xf32>
+ %0 = linalg.matmul ins(%arg0, %arg1: tensor<8x4xf32>, tensor<4x12xf32>)
+ outs(%arg2: tensor<8x12xf32>)
+ -> tensor<8x12xf32>
+ // CHECK: return %[[W]] : tensor<8x12xf32>
+ return %0 : tensor<8x12xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 { disable_multi_reduction_to_contract_patterns, disable_transfer_permutation_map_lowering_patterns } : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
// CHECK-LABEL: contraction_batch_matmul
func.func @contraction_batch_matmul(%A: memref<1584x1584x1584xf32>, %B: memref<1584x1584x1584xf32>, %C: memref<1584x1584x1584xf32>) {
// CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<1584x1584x1584x1584xf32>
@@ -115,6 +192,265 @@ module attributes {transform.with_named_sequence} {
// -----
+// CHECK-LABEL: @float_mixed_precision_matmul_as_contract
+// CHECK-COUNT-3: vector.transfer_read
+// CHECK-NOT: arith.extf
+// CHECK: vector.contract {{.*}} : vector<24x12xbf16>, vector<12x25xbf16> into vector<24x25xf32>
+// CHECK: vector.transfer_write
+func.func @float_mixed_precision_matmul_as_contract(%A: tensor<24x12xbf16>,
+ %B: tensor<12x25xbf16>,
+ %C: tensor<24x25xf32>) -> tensor<24x25xf32> {
+ %0 = linalg.contract
+ indexing_maps = [affine_map<(m, n, k) -> (m, k)>,
+ affine_map<(m, n, k) -> (k, n)>,
+ affine_map<(m, n, k) -> (m, n)>]
+ ins(%A, %B : tensor<24x12xbf16>, tensor<12x25xbf16>)
+ outs(%C : tensor<24x25xf32>) -> tensor<24x25xf32>
+ func.return %0 : tensor<24x25xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 { fold_type_extensions_into_contract } : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_fill
+func.func @test_vectorize_fill(%A : memref<8x16xf32>, %arg0 : f32) {
+ // CHECK: %[[V:.*]] = vector.broadcast {{.*}} : f32 to vector<8x16xf32>
+ // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+ linalg.fill ins(%arg0 : f32) outs(%A : memref<8x16xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_fill_0d
+func.func @test_vectorize_fill_0d(%A : memref<f32>, %arg0 : f32) {
+ // CHECK-SAME: (%[[M:.*]]: memref<f32>, %[[val:.*]]: f32)
+ // CHECK: %[[VEC:.*]] = vector.broadcast %[[val]] : f32 to vector<f32>
+ // CHECK: vector.transfer_write %[[VEC]], %[[M]][] : vector<f32>, memref<f32>
+ linalg.fill ins(%arg0 : f32) outs(%A : memref<f32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_copy
+func.func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) {
+ // CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32>
+ // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+ memref.copy %A, %B : memref<8x16xf32> to memref<8x16xf32>
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_copy_0d
+func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) {
+ // CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>)
+ // CHECK: %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32>
+ // CHECK: %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32>
+ // CHECK: %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32>
+ // CHECK: vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32>
+ memref.copy %A, %B : memref<f32> to memref<f32>
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_copy_complex
+// CHECK-NOT: vector<
+func.func @test_vectorize_copy_complex(%A : memref<8x16xcomplex<f32>>, %B : memref<8x16xcomplex<f32>>) {
+ memref.copy %A, %B : memref<8x16xcomplex<f32>> to memref<8x16xcomplex<f32>>
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// The input is identical to the test in vectorization.mlir. The output is
+// different - vector sizes are inferred (rather than user-specified) and
+// hence _no_ masking is used.
+
+func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
+ %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
+ return %pack : tensor<4x1x32x16x2xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// CHECK-LABEL: func.func @test_vectorize_pack(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x8x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
+// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_4:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_2]] {in_bounds = [true, true, true]} : tensor<32x8x16xf32>, vector<32x8x16xf32>
+// CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
+// CHECK: %[[VAL_6:.*]] = vector.transpose %[[VAL_5]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32>
+// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<4x1x32x16x2xf32>
+// CHECK: %[[VAL_8:.*]] = vector.transfer_write %[[VAL_6]], %[[VAL_7]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true, true, true, true]} : vector<4x1x32x16x2xf32>, tensor<4x1x32x16x2xf32>
+// CHECK: return %[[VAL_8]] : tensor<4x1x32x16x2xf32>
+
+// -----
+
+func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+ %pad = arith.constant 0.000000e+00 : f32
+ %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
+ return %pack : tensor<32x4x1x16x2xf32>
+}
+
+// CHECK-LABEL: func.func @test_vectorize_padded_pack(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x7x15xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_4:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_2]] {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
+// CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
+// CHECK: %[[VAL_6:.*]] = vector.transpose %[[VAL_5]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
+// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32>
+// CHECK: %[[VAL_8:.*]] = vector.transfer_write %[[VAL_6]], %[[VAL_7]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
+// CHECK: return %[[VAL_8]] : tensor<32x4x1x16x2xf32>
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @vectorize_map(%arg0: memref<64xf32>,
+ %arg1: memref<64xf32>, %arg2: memref<64xf32>) {
+ linalg.map ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>)
+ outs(%arg2 : memref<64xf32>)
+ (%in: f32, %in_0: f32) {
+ %0 = arith.addf %in, %in_0 : f32
+ linalg.yield %0 : f32
+ }
+ return
+}
+// CHECK-LABEL: func @vectorize_map
+// CHECK: %[[LHS:.*]] = vector.transfer_read
+// CHECK-NEXT: %[[RHS:.*]] = vector.transfer_read
+// CHECK-NEXT: arith.addf %[[LHS]], %[[RHS]] : vector<64xf32>
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.map"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @vectorize_transpose(%arg0: memref<16x32x64xf32>,
+ %arg1: memref<32x64x16xf32>) {
+ linalg.transpose ins(%arg0 : memref<16x32x64xf32>)
+ outs(%arg1 : memref<32x64x16xf32>) permutation = [1, 2, 0]
+ return
+}
+// CHECK-LABEL: func @vectorize_transpose
+// CHECK: vector.transpose
+// CHECK-SAME: [1, 2, 0] : vector<16x32x64xf32> to vector<32x64x16xf32>
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.transpose"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @vectorize_reduce(%arg0: memref<16x32x64xf32>,
+ %arg1: memref<16x64xf32>) {
+ linalg.reduce ins(%arg0 : memref<16x32x64xf32>)
+ outs(%arg1 : memref<16x64xf32>) dimensions = [1]
+ (%in: f32, %init: f32) {
+ %0 = arith.addf %in, %init : f32
+ linalg.yield %0 : f32
+ }
+ return
+}
+// CHECK-LABEL: func @vectorize_reduce
+// CHECK: vector.multi_reduction <add>
+// CHECK-SAME: : vector<16x32x64xf32> to vector<16x64xf32>
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
#matmul_trait = {
indexing_maps = [
affine_map<(m, n, k) -> (m, k)>,
@@ -306,27 +642,6 @@ module attributes {transform.with_named_sequence} {
// -----
-// CHECK-LABEL: func @vectorization_test_2
-func.func @vectorization_test_2(%A: memref<8x16xf32>, %B: memref<16x32xf32>,
- %C: memref<8x32xf32>) {
- // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<8x32x16xf32>
- // CHECK: vector.multi_reduction <add>, %{{.*}}, {{.*}} [2] : vector<8x32x16xf32> to vector<8x32xf32>
- linalg.matmul
- ins(%A, %B: memref<8x16xf32>, memref<16x32xf32>)
- outs(%C: memref<8x32xf32>)
- return
-}
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 { disable_multi_reduction_to_contract_patterns } : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
// CHECK-LABEL: func @test_vectorize_scalar_input
func.func @test_vectorize_scalar_input(%A : memref<8x16xf32>, %arg0 : f32) {
@@ -427,104 +742,6 @@ module attributes {transform.with_named_sequence} {
// -----
-// CHECK-LABEL: func @test_vectorize_fill
-func.func @test_vectorize_fill(%A : memref<8x16xf32>, %arg0 : f32) {
- // CHECK: %[[V:.*]] = vector.broadcast {{.*}} : f32 to vector<8x16xf32>
- // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32>
- linalg.fill ins(%arg0 : f32) outs(%A : memref<8x16xf32>)
- return
-}
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-// CHECK-LABEL: func @test_vectorize_fill
-func.func @test_vectorize_fill_0d(%A : memref<f32>, %arg0 : f32) {
- // CHECK-SAME: (%[[M:.*]]: memref<f32>, %[[val:.*]]: f32)
- // CHECK: %[[VEC:.*]] = vector.broadcast %[[val]] : f32 to vector<f32>
- // CHECK: vector.transfer_write %[[VEC]], %[[M]][] : vector<f32>, memref<f32>
- linalg.fill ins(%arg0 : f32) outs(%A : memref<f32>)
- return
-}
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-// CHECK-LABEL: func @test_vectorize_copy
-func.func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) {
- // CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32>
- // CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32>
- memref.copy %A, %B : memref<8x16xf32> to memref<8x16xf32>
- return
-}
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-// CHECK-LABEL: func @test_vectorize_copy_0d
-func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) {
- // CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>)
- // CHECK: %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32>
- // CHECK: %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32>
- // CHECK: %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32>
- // CHECK: vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32>
- memref.copy %A, %B : memref<f32> to memref<f32>
- return
-}
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-// CHECK-LABEL: func @test_vectorize_copy_complex
-// CHECK-NOT: vector<
-func.func @test_vectorize_copy_complex(%A : memref<8x16xcomplex<f32>>, %B : memref<8x16xcomplex<f32>>) {
- memref.copy %A, %B : memref<8x16xcomplex<f32>> to memref<8x16xcomplex<f32>>
- return
-}
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
// CHECK-LABEL: func @test_vectorize_trailing_index
// CHECK-SAME: (%[[ARG0:.*]]: memref<1x2x4x8xindex>)
func.func @test_vectorize_trailing_index(%arg0: memref<1x2x4x8xindex>) {
@@ -855,40 +1072,6 @@ module attributes {transform.with_named_sequence} {
// -----
-// CHECK-LABEL: func @matmul_tensors
-// CHECK-SAME: (%[[ARG0:.*]]: tensor<8x4xf32>, %[[ARG1:.*]]: tensor<4x12xf32>,
-// CHECK-SAME: %[[ARG2:.*]]: tensor<8x12xf32>) -> tensor<8x12xf32>
-func.func @matmul_tensors(
- %arg0: tensor<8x4xf32>, %arg1: tensor<4x12xf32>, %arg2: tensor<8x12xf32>)
- -> tensor<8x12xf32> {
- // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
- // CHECK-DAG: %[[V0:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], {{.*}} : tensor<8x4xf32>, vector<8x12x4xf32>
- // CHECK-DAG: %[[V1:.*]] = vector.transfer_read %[[ARG1]][%[[C0]], %[[C0]]], {{.*}} : tensor<4x12xf32>, vector<8x12x4xf32>
- // CHECK-DAG: %[[V2:.*]] = vector.transfer_read %[[ARG2]][%[[C0]], %[[C0]]], {{.*}} : tensor<8x12xf32>, vector<8x12xf32>
- //
- // linalg matmul lowers gets expanded to a 3D reduction, canonicalization later
- // convert it to a 2D contract.
- // CHECK: %[[MUL:.*]] = arith.mulf %[[V0]], %[[V1]] : vector<8x12x4xf32>
- // CHECK: %[[R:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[V2]] [2] : vector<8x12x4xf32> to vector<8x12xf32>
- // CHECK: %[[W:.*]] = vector.transfer_write %[[R]], %[[ARG2]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x12xf32>, tensor<8x12xf32>
- %0 = linalg.matmul ins(%arg0, %arg1: tensor<8x4xf32>, tensor<4x12xf32>)
- outs(%arg2: tensor<8x12xf32>)
- -> tensor<8x12xf32>
- // CHECK: return %[[W]] : tensor<8x12xf32>
- return %0 : tensor<8x12xf32>
-}
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 { disable_multi_reduction_to_contract_patterns, disable_transfer_permutation_map_lowering_patterns } : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
// CHECK-LABEL: func @sum_exp
func.func @sum_exp(%input: tensor<4x16x8xf32>, %output: tensor<4x16xf32>)
-> tensor<4x16xf32>
@@ -914,7 +1097,6 @@ func.func @sum_exp(%input: tensor<4x16x8xf32>, %output: tensor<4x16xf32>)
return %0 : tensor<4x16xf32>
}
-
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%3 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
@@ -993,7 +1175,6 @@ func.func @red_maximumf_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
return %red : tensor<4xf32>
}
-
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%3 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
@@ -1428,78 +1609,6 @@ module attributes {transform.with_named_sequence} {
// -----
-func.func @vectorize_map(%arg0: memref<64xf32>,
- %arg1: memref<64xf32>, %arg2: memref<64xf32>) {
- linalg.map ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>)
- outs(%arg2 : memref<64xf32>)
- (%in: f32, %in_0: f32) {
- %0 = arith.addf %in, %in_0 : f32
- linalg.yield %0 : f32
- }
- return
-}
-// CHECK-LABEL: func @vectorize_map
-// CHECK: %[[LHS:.*]] = vector.transfer_read
-// CHECK-NEXT: %[[RHS:.*]] = vector.transfer_read
-// CHECK-NEXT: arith.addf %[[LHS]], %[[RHS]] : vector<64xf32>
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.map"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-func.func @vectorize_transpose(%arg0: memref<16x32x64xf32>,
- %arg1: memref<32x64x16xf32>) {
- linalg.transpose ins(%arg0 : memref<16x32x64xf32>)
- outs(%arg1 : memref<32x64x16xf32>) permutation = [1, 2, 0]
- return
-}
-// CHECK-LABEL: func @vectorize_transpose
-// CHECK: vector.transpose
-// CHECK-SAME: [1, 2, 0] : vector<16x32x64xf32> to vector<32x64x16xf32>
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.transpose"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-func.func @vectorize_reduce(%arg0: memref<16x32x64xf32>,
- %arg1: memref<16x64xf32>) {
- linalg.reduce ins(%arg0 : memref<16x32x64xf32>)
- outs(%arg1 : memref<16x64xf32>) dimensions = [1]
- (%in: f32, %init: f32) {
- %0 = arith.addf %in, %init : f32
- linalg.yield %0 : f32
- }
- return
-}
-// CHECK-LABEL: func @vectorize_reduce
-// CHECK: vector.multi_reduction <add>
-// CHECK-SAME: : vector<16x32x64xf32> to vector<16x64xf32>
-
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
// This is a regression test. This IR cannot be vectorized, but
// structured.vectorize_children_and_apply_patterns should nevertheless succeed.
@@ -1715,65 +1824,77 @@ module attributes {transform.with_named_sequence} {
// -----
-// Input identical as the test in vectorization.mlir. Output is different -
-// vector sizes are inferred (rather than user-specified) and hence _no_
-// masking was used.
-
-func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
- %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
- return %pack : tensor<4x1x32x16x2xf32>
+// CHECK-LABEL: func @float_mixed_precision_matmul_as_generic
+// CHECK-COUNT-3: vector.transfer_read
+// CHECK-NOT: arith.extf
+// CHECK: vector.contract {{.*}} : vector<8x16xbf16>, vector<16x32xbf16> into vector<8x32xf32>
+// CHECK: vector.transfer_write
+func.func @float_mixed_precision_matmul_as_generic(%A: memref<8x16xbf16>, %B: memref<16x32xbf16>,
+ %C: memref<8x32xf32>) {
+ linalg.generic {
+ indexing_maps = [
+ affine_map<(m, n, k) -> (m, k)>,
+ affine_map<(m, n, k) -> (k, n)>,
+ affine_map<(m, n, k) -> (m, n)>
+ ],
+ iterator_types = ["parallel", "parallel", "reduction"]
+ }
+ ins(%A, %B : memref<8x16xbf16>, memref<16x32xbf16>)
+ outs(%C : memref<8x32xf32>) {
+ ^bb(%in: bf16, %in_0: bf16, %c: f32) :
+ %a = arith.extf %in : bf16 to f32
+ %b = arith.extf %in_0 : bf16 to f32
+ %d = arith.mulf %a, %b: f32
+ %e = arith.addf %c, %d: f32
+ linalg.yield %e : f32
+ }
+ return
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
%1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 { fold_type_extensions_into_contract } : (!transform.any_op) -> !transform.any_op
transform.yield
}
}
-// CHECK-LABEL: func.func @test_vectorize_pack(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x8x16xf32>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
-// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_4:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_2]] {in_bounds = [true, true, true]} : tensor<32x8x16xf32>, vector<32x8x16xf32>
-// CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
-// CHECK: %[[VAL_6:.*]] = vector.transpose %[[VAL_5]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32>
-// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<4x1x32x16x2xf32>
-// CHECK: %[[VAL_8:.*]] = vector.transfer_write %[[VAL_6]], %[[VAL_7]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true, true, true, true]} : vector<4x1x32x16x2xf32>, tensor<4x1x32x16x2xf32>
-// CHECK: return %[[VAL_8]] : tensor<4x1x32x16x2xf32>
-
// -----
-// Input identical as the test in vectorization.mlir. Output is different -
-// vector sizes are inferred (rather than user-specified) and hence _no_
-// masking was used.
-
-func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
- %pad = arith.constant 0.000000e+00 : f32
- %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
- return %pack : tensor<32x4x1x16x2xf32>
+// CHECK-LABEL: func @integer_mixed_precision_matmul_as_generic
+// CHECK-COUNT-3: vector.transfer_read
+// CHECK-NOT: arith.extsi
+// CHECK: vector.contract {{.*}} : vector<8x16xi8>, vector<16x32xi8> into vector<8x32xi32>
+// CHECK: vector.transfer_write
+func.func @integer_mixed_precision_matmul_as_generic(%A: memref<8x16xi8>, %B: memref<16x32xi8>,
+ %C: memref<8x32xi32>) {
+ linalg.generic {
+ indexing_maps = [
+ affine_map<(m, n, k) -> (m, k)>,
+ affine_map<(m, n, k) -> (k, n)>,
+ affine_map<(m, n, k) -> (m, n)>
+ ],
+ iterator_types = ["parallel", "parallel", "reduction"]
+ }
+ ins(%A, %B : memref<8x16xi8>, memref<16x32xi8>)
+ outs(%C : memref<8x32xi32>) {
+ ^bb(%in: i8, %in_0: i8, %c: i32) :
+ %a = arith.extsi %in : i8 to i32
+ %b = arith.extsi %in_0 : i8 to i32
+ %d = arith.muli %a, %b: i32
+ %e = arith.addi %c, %d: i32
+ linalg.yield %e : i32
+ }
+ return
}
-// CHECK-LABEL: func.func @test_vectorize_padded_pack(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x7x15xf32>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
-// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_4:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_2]] {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
-// CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
-// CHECK: %[[VAL_6:.*]] = vector.transpose %[[VAL_5]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
-// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32>
-// CHECK: %[[VAL_8:.*]] = vector.transfer_write %[[VAL_6]], %[[VAL_7]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
-// CHECK: return %[[VAL_8]] : tensor<32x4x1x16x2xf32>
-
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
%1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
- %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 { fold_type_extensions_into_contract } : (!transform.any_op) -> !transform.any_op
transform.yield
}
}
+
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index d41d861..095810f 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -940,31 +940,100 @@ module attributes {transform.with_named_sequence} {
///----------------------------------------------------------------------------------------
// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack
-// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>,
-// CHECK-SAME: %[[ARG_1:.*]]: tensor<?x?x16x2xf32>
-func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
-// CHECK: %[[C0:.*]] = arith.constant 0
-// CHECK: %[[C01:.*]] = arith.constant 0
-// CHECK: %[[C02:.*]] = arith.constant 0
-// CHECK: %[[DIM_0:.*]] = tensor.dim %[[ARG_1]], %[[C02]] : tensor<?x?x16x2xf32>
-// CHECK: %[[C1:.*]] = arith.constant 1
-// CHECK: %[[DIM6:.*]] = tensor.dim %[[ARG_1]], %[[C1]] : tensor<?x?x16x2xf32>
-// CHECK: %[[CNST16:.*]] = arith.constant 16 : index
-// CHECK: %[[CNST2:.*]] = arith.constant 2 : index
-// CHECK: %[[readMsk0:.*]] = vector.create_mask %[[DIM_0]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
-// CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
-// CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
-// CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32>
-// CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
-// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[ARG_0]]
-// CHECK: return %[[write0]]
- %ret = linalg.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
- return %ret : tensor<?x?xf32>
+// CHECK-SAME: %[[DEST:.*]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[SRC:.*]]: tensor<?x?x16x2xf32>
+func.func @test_vectorize_dynamic_shapes_unpack(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
+ // CHECK: %[[DIM_0:.*]] = tensor.dim %[[SRC]], %[[C0_1]] : tensor<?x?x16x2xf32>
+ // CHECK: %[[C1:.*]] = arith.constant 1
+ // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[C1]] : tensor<?x?x16x2xf32>
+ // CHECK: %[[CNST16:.*]] = arith.constant 16 : index
+ // CHECK: %[[CNST2:.*]] = arith.constant 2 : index
+ // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM_0]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
+ // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
+ // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
+ // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x16xf32> to vector<4x16xf32>
+ // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
+ // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]]
+ // CHECK: return %[[WRITE]]
+ %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
+ return %ret : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [2, 1, 16, 2] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack_scalable_vec
+// CHECK-SAME: %[[DEST:.*]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[SRC:.*]]: tensor<?x?x16x2xf32>
+func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
+ // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
+ // CHECK: %[[C01:.*]] = arith.constant 0
+ // CHECK: %[[C02:.*]] = arith.constant 0
+ // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor<?x?x16x2xf32>
+ // CHECK: %[[CNST14:.*]] = arith.constant 1
+ // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[CNST14]] : tensor<?x?x16x2xf32>
+ // CHECK: %[[CNST16:.*]] = arith.constant 16 : index
+ // CHECK: %[[CNST2:.*]] = arith.constant 2 : index
+ // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x[16]x2xi1>
+ // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32>
+ // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x[16]x2xf32> to vector<2x2x1x[16]xf32>
+ // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x[16]xf32> to vector<4x[16]xf32>
+ // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x[16]xi1>
+ // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]]
+ // CHECK: return %[[WRITE]]
+ %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
+ return %ret : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size
+// CHECK-SAME: %[[DEST:.*]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[SRC:.*]]: tensor<?x?x?x2xf32>
+func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size(%dest: tensor<?x?xf32>, %src: tensor<?x?x?x2xf32>) -> tensor<?x?xf32> {
+ // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
+ // CHECK: %[[C01:.*]] = arith.constant 0
+ // CHECK: %[[C02:.*]] = arith.constant 0
+ // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor<?x?x?x2xf32>
+ // CHECK: %[[C1_2:.*]] = arith.constant 1
+ // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[C1_2]] : tensor<?x?x?x2xf32>
+ // CHECK: %[[C2:.*]] = arith.constant 2 : index
+ // CHECK: %[[DIM_2:.*]] = tensor.dim %[[SRC]], %[[C2]] : tensor<?x?x?x2xf32>
+ // CHECK: %[[C2_1:.*]] = arith.constant 2 : index
+ // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[DIM_2]], %[[C2_1]] : vector<2x1x[16]x2xi1>
+ // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x?x2xf32>, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32>
+ // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x[16]x2xf32> to vector<2x2x1x[16]xf32>
+ // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x[16]xf32> to vector<4x[16]xf32>
+ // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x[16]xi1>
+ // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]]
+ // CHECK: return %[[WRITE]]
+
+ %vs = vector.vscale
+ %c16 = arith.constant 16 : index
+ %tile_size = arith.muli %vs, %c16 : index
+
+ %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [%tile_size, 2] into %dest : tensor<?x?x?x2xf32> -> tensor<?x?xf32>
+ return %ret : tensor<?x?xf32>
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.structured.vectorize %0 vector_sizes [4, 16] : !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2] : !transform.any_op
transform.yield
}
}
@@ -997,7 +1066,7 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [16, 8, 32, 16] : !transform.any_op
transform.yield
}
}
@@ -1022,7 +1091,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest:
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16] : !transform.any_op
transform.yield
}
}
@@ -1047,7 +1116,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest:
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16] : !transform.any_op
transform.yield
}
}
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index f86fb38..4a7176e 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -1168,6 +1168,106 @@ func.func @canonicalize_broadcast_shapecast_both_possible(%arg0: vector<1xf32>)
// -----
+// CHECK-LABEL: func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim
+// CHECK-NOT: vector.shape_cast
+// CHECK: vector.broadcast {{.+}} : vector<2xf32> to vector<32x2xf32>
+func.func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim(%arg0 : vector<2xf32>) -> vector<32x2xf32> {
+ %0 = vector.shape_cast %arg0 : vector<2xf32> to vector<1x2xf32>
+ %1 = vector.broadcast %0 : vector<1x2xf32> to vector<32x2xf32>
+ return %1 : vector<32x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim2(
+// CHECK-SAME: %[[ARG0:.*]]: vector<2x1xf32>) -> vector<32x2x1xf32> {
+// CHECK: %[[VAL_0:.*]] = vector.broadcast %[[ARG0]] : vector<2x1xf32> to vector<32x2x1xf32>
+// CHECK: return %[[VAL_0]] : vector<32x2x1xf32>
+// CHECK: }
+func.func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim2(%arg0 : vector<2x1xf32>) -> vector<32x2x1xf32> {
+ %0 = vector.shape_cast %arg0 : vector<2x1xf32> to vector<1x2x1xf32>
+ %1 = vector.broadcast %0 : vector<1x2x1xf32> to vector<32x2x1xf32>
+ return %1 : vector<32x2x1xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim3(
+// CHECK-SAME: %[[ARG0:.*]]: vector<2x1xf32>) -> vector<32x2x4xf32> {
+// CHECK: %[[VAL_0:.*]] = vector.broadcast %[[ARG0]] : vector<2x1xf32> to vector<32x2x4xf32>
+// CHECK: return %[[VAL_0]] : vector<32x2x4xf32>
+// CHECK: }
+func.func @canonicalize_shapecast_broadcast_to_broadcast_prepend_dim3(%arg0 : vector<2x1xf32>) -> vector<32x2x4xf32> {
+ %0 = vector.shape_cast %arg0 : vector<2x1xf32> to vector<1x2x1xf32>
+ %1 = vector.broadcast %0 : vector<1x2x1xf32> to vector<32x2x4xf32>
+ return %1 : vector<32x2x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @canonicalize_shapecast_broadcast_to_broadcast_remove_leading_dim(
+// CHECK-SAME: %[[ARG0:.*]]: vector<1x2xf32>) -> vector<32x2xf32> {
+// CHECK: %[[VAL_0:.*]] = vector.broadcast %[[ARG0]] : vector<1x2xf32> to vector<32x2xf32>
+// CHECK: return %[[VAL_0]] : vector<32x2xf32>
+// CHECK: }
+func.func @canonicalize_shapecast_broadcast_to_broadcast_remove_leading_dim(%arg0 : vector<1x2xf32>) -> vector<32x2xf32> {
+ %0 = vector.shape_cast %arg0 : vector<1x2xf32> to vector<2xf32>
+ %1 = vector.broadcast %0 : vector<2xf32> to vector<32x2xf32>
+ return %1 : vector<32x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @negative_canonicalize_shapecast_broadcast_invalid_shape
+// CHECK: vector.shape_cast {{.+}} : vector<64xf32> to vector<4x16xf32>
+// CHECK: vector.broadcast {{.+}} : vector<4x16xf32> to vector<2x4x16xf32>
+func.func @negative_canonicalize_shapecast_broadcast_invalid_shape(%arg0 : vector<64xf32>) -> vector<2x4x16xf32> {
+ %0 = vector.shape_cast %arg0 : vector<64xf32> to vector<4x16xf32>
+ %1 = vector.broadcast %0 : vector<4x16xf32> to vector<2x4x16xf32>
+ return %1 : vector<2x4x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @negative_canonicalize_shapecast_broadcast_invalid_broadcasted_dims
+// CHECK: vector.shape_cast {{.+}} : vector<2x1xf32> to vector<1x2xf32>
+// CHECK: vector.broadcast {{.+}} : vector<1x2xf32> to vector<2x2xf32>
+func.func @negative_canonicalize_shapecast_broadcast_invalid_broadcasted_dims(%arg0 : vector<2x1xf32>) -> vector<2x2xf32> {
+ %0 = vector.shape_cast %arg0 : vector<2x1xf32> to vector<1x2xf32>
+ %1 = vector.broadcast %0 : vector<1x2xf32> to vector<2x2xf32>
+ return %1 : vector<2x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @negative_canonicalize_shapecast_broadcast_to_broadcast_append_dim(
+// CHECK-SAME: %[[ARG0:.*]]: vector<2xf32>) -> vector<2x4xf32> {
+// CHECK: %[[VAL_0:.*]] = vector.shape_cast %[[ARG0]] : vector<2xf32> to vector<2x1xf32>
+// CHECK: %[[VAL_1:.*]] = vector.broadcast %[[VAL_0]] : vector<2x1xf32> to vector<2x4xf32>
+// CHECK: return %[[VAL_1]] : vector<2x4xf32>
+// CHECK: }
+func.func @negative_canonicalize_shapecast_broadcast_to_broadcast_append_dim(%arg0 : vector<2xf32>) -> vector<2x4xf32> {
+ %0 = vector.shape_cast %arg0 : vector<2xf32> to vector<2x1xf32>
+ %1 = vector.broadcast %0 : vector<2x1xf32> to vector<2x4xf32>
+ return %1 : vector<2x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @negative_canonicalize_shapecast_broadcast_to_broadcast_remove_trailing_dim(
+// CHECK-SAME: %[[ARG0:.*]]: vector<2x1xf32>) -> vector<32x2xf32> {
+// CHECK: %[[VAL_0:.*]] = vector.shape_cast %[[ARG0]] : vector<2x1xf32> to vector<2xf32>
+// CHECK: %[[VAL_1:.*]] = vector.broadcast %[[VAL_0]] : vector<2xf32> to vector<32x2xf32>
+// CHECK: return %[[VAL_1]] : vector<32x2xf32>
+// CHECK: }
+func.func @negative_canonicalize_shapecast_broadcast_to_broadcast_remove_trailing_dim(%arg0 : vector<2x1xf32>) -> vector<32x2xf32> {
+ %0 = vector.shape_cast %arg0 : vector<2x1xf32> to vector<2xf32>
+ %1 = vector.broadcast %0 : vector<2xf32> to vector<32x2xf32>
+ return %1 : vector<32x2xf32>
+}
+
+// -----
+
// CHECK-LABEL: fold_vector_transfer_masks
func.func @fold_vector_transfer_masks(%A: memref<?x?xf32>) -> (vector<4x8xf32>, vector<4x[4]xf32>) {
// CHECK: %[[C0:.+]] = arith.constant 0 : index
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index c21de56..211e16d 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -1305,6 +1305,26 @@ func.func @store_memref_index_mismatch(%base : memref<?xf32>, %value : vector<16
// -----
+//===----------------------------------------------------------------------===//
+// vector.maskedload
+//===----------------------------------------------------------------------===//
+
+func.func @maskedload_negative_alignment(%base: memref<4xi32>, %mask: vector<32xi1>, %pass: vector<1xi32>, %index: index) {
+ // expected-error@below {{'vector.maskedload' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}}
+ %val = vector.maskedload %base[%index], %mask, %pass { alignment = -1 } : memref<4xi32>, vector<32xi1>, vector<1xi32> into vector<1xi32>
+ return
+}
+
+// -----
+
+func.func @maskedload_nonpoweroftwo_alignment(%base: memref<4xi32>, %mask: vector<32xi1>, %pass: vector<1xi32>, %index: index) {
+ // expected-error@below {{'vector.maskedload' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}}
+ %val = vector.maskedload %base[%index], %mask, %pass { alignment = 3 } : memref<4xi32>, vector<32xi1>, vector<1xi32> into vector<1xi32>
+ return
+}
+
+// -----
+
func.func @maskedload_base_type_mismatch(%base: memref<?xf64>, %mask: vector<16xi1>, %pass: vector<16xf32>) {
%c0 = arith.constant 0 : index
// expected-error@+1 {{'vector.maskedload' op base and result element type should match}}
@@ -1336,6 +1356,26 @@ func.func @maskedload_memref_mismatch(%base: memref<?xf32>, %mask: vector<16xi1>
// -----
+//===----------------------------------------------------------------------===//
+// vector.maskedstore
+//===----------------------------------------------------------------------===//
+
+func.func @maskedstore_negative_alignment(%base: memref<4xi32>, %mask: vector<32xi1>, %value: vector<1xi32>, %index: index) {
+ // expected-error@below {{'vector.maskedstore' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}}
+  vector.maskedstore %base[%index], %mask, %value { alignment = -1 } : memref<4xi32>, vector<32xi1>, vector<1xi32>
+ return
+}
+
+// -----
+
+func.func @maskedstore_nonpoweroftwo_alignment(%base: memref<4xi32>, %mask: vector<32xi1>, %value: vector<1xi32>, %index: index) {
+ // expected-error@below {{'vector.maskedstore' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}}
+  vector.maskedstore %base[%index], %mask, %value { alignment = 3 } : memref<4xi32>, vector<32xi1>, vector<1xi32>
+ return
+}
+
+// -----
+
func.func @maskedstore_base_type_mismatch(%base: memref<?xf64>, %mask: vector<16xi1>, %value: vector<16xf32>) {
%c0 = arith.constant 0 : index
// expected-error@+1 {{'vector.maskedstore' op base and valueToStore element type should match}}
@@ -1912,8 +1952,7 @@ func.func @vector_load(%src : memref<?xi8>) {
// -----
-func.func @invalid_load_alignment(%memref: memref<4xi32>) {
- %c0 = arith.constant 0 : index
+func.func @invalid_load_alignment(%memref: memref<4xi32>, %c0: index) {
// expected-error @below {{'vector.load' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}}
%val = vector.load %memref[%c0] { alignment = -1 } : memref<4xi32>, vector<4xi32>
return
@@ -1921,6 +1960,14 @@ func.func @invalid_load_alignment(%memref: memref<4xi32>) {
// -----
+func.func @invalid_load_alignment(%memref: memref<4xi32>, %c0: index) {
+ // expected-error @below {{'vector.load' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}}
+ %val = vector.load %memref[%c0] { alignment = 3 } : memref<4xi32>, vector<4xi32>
+ return
+}
+
+// -----
+
//===----------------------------------------------------------------------===//
// vector.store
//===----------------------------------------------------------------------===//
@@ -1934,8 +1981,15 @@ func.func @vector_store(%dest : memref<?xi8>, %vec : vector<16x16xi8>) {
// -----
-func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: vector<4xi32>) {
- %c0 = arith.constant 0 : index
+func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: vector<4xi32>, %c0: index) {
+ // expected-error @below {{'vector.store' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}}
+ vector.store %val, %memref[%c0] { alignment = -1 } : memref<4xi32>, vector<4xi32>
+ return
+}
+
+// -----
+
+func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: vector<4xi32>, %c0: index) {
// expected-error @below {{'vector.store' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}}
vector.store %val, %memref[%c0] { alignment = 3 } : memref<4xi32>, vector<4xi32>
return
diff --git a/mlir/test/Dialect/Vector/vector-sink.mlir b/mlir/test/Dialect/Vector/vector-sink.mlir
index ef881ba..577b06d 100644
--- a/mlir/test/Dialect/Vector/vector-sink.mlir
+++ b/mlir/test/Dialect/Vector/vector-sink.mlir
@@ -40,7 +40,7 @@ func.func @broadcast_scalar_with_bcast_scalable(%arg1: index, %arg2: index) -> v
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADD]] : index to vector<1x4xindex>
// CHECK: return %[[BCAST]] : vector<1x4xindex>
func.func @broadcast_scalar_with_bcast_and_splat(%arg1: index, %arg2: index) -> vector<1x4xindex> {
- %0 = vector.splat %arg1 : vector<1x4xindex>
+ %0 = vector.broadcast %arg1 : index to vector<1x4xindex>
%1 = vector.broadcast %arg2 : index to vector<1x4xindex>
%2 = arith.addi %0, %1 : vector<1x4xindex>
return %2 : vector<1x4xindex>
@@ -53,7 +53,7 @@ func.func @broadcast_scalar_with_bcast_and_splat(%arg1: index, %arg2: index) ->
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADD]] : index to vector<1x[4]xindex>
// CHECK: return %[[BCAST]] : vector<1x[4]xindex>
func.func @broadcast_scalar_with_bcast_and_splat_scalable(%arg1: index, %arg2: index) -> vector<1x[4]xindex> {
- %0 = vector.splat %arg1 : vector<1x[4]xindex>
+ %0 = vector.broadcast %arg1 : index to vector<1x[4]xindex>
%1 = vector.broadcast %arg2 : index to vector<1x[4]xindex>
%2 = arith.addi %0, %1 : vector<1x[4]xindex>
return %2 : vector<1x[4]xindex>
@@ -94,12 +94,12 @@ func.func @broadcast_vector_scalable(%arg1: vector<[4]xf32>, %arg2: vector<[4]xf
// CHECK-LABEL: func.func @broadcast_scalar_and_vec(
// CHECK-SAME: %[[ARG1:.*]]: index,
// CHECK-SAME: %[[ARG2:.*]]: vector<4xindex>) -> vector<1x4xindex> {
-// CHECK: %[[SPLAT:.*]] = vector.splat %[[ARG1]] : vector<1x4xindex>
+// CHECK: %[[SPLAT:.*]] = vector.broadcast %[[ARG1]] : index to vector<1x4xindex>
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG2]] : vector<4xindex> to vector<1x4xindex>
// CHECK: %[[ADD:.*]] = arith.addi %[[SPLAT]], %[[BCAST]] : vector<1x4xindex>
// CHECK: return %[[ADD]] : vector<1x4xindex>
func.func @broadcast_scalar_and_vec(%arg1: index, %arg2: vector<4xindex>) -> vector<1x4xindex> {
- %0 = vector.splat %arg1 : vector<1x4xindex>
+ %0 = vector.broadcast %arg1 : index to vector<1x4xindex>
%1 = vector.broadcast %arg2 : vector<4xindex> to vector<1x4xindex>
%2 = arith.addi %0, %1 : vector<1x4xindex>
return %2 : vector<1x4xindex>
@@ -108,12 +108,12 @@ func.func @broadcast_scalar_and_vec(%arg1: index, %arg2: vector<4xindex>) -> vec
// CHECK-LABEL: func.func @broadcast_scalar_and_vec_scalable(
// CHECK-SAME: %[[ARG1:.*]]: index,
// CHECK-SAME: %[[ARG2:.*]]: vector<[4]xindex>) -> vector<1x[4]xindex> {
-// CHECK: %[[SPLAT:.*]] = vector.splat %[[ARG1]] : vector<1x[4]xindex>
+// CHECK: %[[SPLAT:.*]] = vector.broadcast %[[ARG1]] : index to vector<1x[4]xindex>
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG2]] : vector<[4]xindex> to vector<1x[4]xindex>
// CHECK: %[[ADD:.*]] = arith.addi %[[SPLAT]], %[[BCAST]] : vector<1x[4]xindex>
// CHECK: return %[[ADD]] : vector<1x[4]xindex>
func.func @broadcast_scalar_and_vec_scalable(%arg1: index, %arg2: vector<[4]xindex>) -> vector<1x[4]xindex> {
- %0 = vector.splat %arg1 : vector<1x[4]xindex>
+ %0 = vector.broadcast %arg1 : index to vector<1x[4]xindex>
%1 = vector.broadcast %arg2 : vector<[4]xindex> to vector<1x[4]xindex>
%2 = arith.addi %0, %1 : vector<1x[4]xindex>
return %2 : vector<1x[4]xindex>
@@ -787,7 +787,7 @@ func.func @negative_extract_load_scalable(%arg0: memref<?xf32>, %arg1: index) ->
// CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32)
func.func @store_splat(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) {
// CHECK: memref.store %[[ARG2]], %[[ARG0]][%[[ARG1]]] : memref<?xf32>
- %0 = vector.splat %arg2 : vector<1xf32>
+ %0 = vector.broadcast %arg2 : f32 to vector<1xf32>
vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<1xf32>
return
}
@@ -813,9 +813,9 @@ func.func @store_broadcast_1d_to_2d(%arg0: memref<?x?xf32>, %arg1: index, %arg2:
// CHECK-LABEL: @negative_store_scalable
// CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32)
func.func @negative_store_scalable(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) {
-// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<[1]xf32>
+// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<[1]xf32>
// CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xf32>, vector<[1]xf32>
- %0 = vector.splat %arg2 : vector<[1]xf32>
+ %0 = vector.broadcast %arg2 : f32 to vector<[1]xf32>
vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<[1]xf32>
return
}
@@ -823,9 +823,9 @@ func.func @negative_store_scalable(%arg0: memref<?xf32>, %arg1: index, %arg2: f3
// CHECK-LABEL: @negative_store_memref_of_vec
// CHECK-SAME: (%[[ARG0:.*]]: memref<?xvector<1xf32>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32)
func.func @negative_store_memref_of_vec(%arg0: memref<?xvector<1xf32>>, %arg1: index, %arg2: f32) {
-// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<1xf32>
+// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<1xf32>
// CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xvector<1xf32>>, vector<1xf32>
- %0 = vector.splat %arg2 : vector<1xf32>
+ %0 = vector.broadcast %arg2 : f32 to vector<1xf32>
vector.store %0, %arg0[%arg1] : memref<?xvector<1xf32>>, vector<1xf32>
return
}
@@ -833,9 +833,9 @@ func.func @negative_store_memref_of_vec(%arg0: memref<?xvector<1xf32>>, %arg1: i
// CHECK-LABEL: @negative_store_more_than_one_element
// CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32)
func.func @negative_store_more_than_one_element(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) {
-// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<4xf32>
+// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<4xf32>
// CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xf32>, vector<4xf32>
- %0 = vector.splat %arg2 : vector<4xf32>
+ %0 = vector.broadcast %arg2 : f32 to vector<4xf32>
vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<4xf32>
return
}
@@ -843,10 +843,10 @@ func.func @negative_store_more_than_one_element(%arg0: memref<?xf32>, %arg1: ind
// CHECK-LABEL: @negative_store_no_single_use
// CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: f32)
func.func @negative_store_no_single_use(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) -> vector<1xf32> {
-// CHECK: %[[RES:.*]] = vector.splat %[[ARG2]] : vector<1xf32>
+// CHECK: %[[RES:.*]] = vector.broadcast %[[ARG2]] : f32 to vector<1xf32>
// CHECK: vector.store %[[RES]], %[[ARG0]][%[[ARG1]]] : memref<?xf32>, vector<1xf32>
// CHECK: return %[[RES:.*]] : vector<1xf32>
- %0 = vector.splat %arg2 : vector<1xf32>
+ %0 = vector.broadcast %arg2 : f32 to vector<1xf32>
vector.store %0, %arg0[%arg1] : memref<?xf32>, vector<1xf32>
return %0 : vector<1xf32>
}
diff --git a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir
index 1b54d54..45afbff 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir
@@ -285,19 +285,19 @@ func.func @transfer_read_permutations(%mem_0 : memref<?x?xf32>, %mem_1 : memref<
%c0 = arith.constant 0 : index
// CHECK: %[[MASK0:.*]] = vector.broadcast %{{.*}} : i1 to vector<14x7xi1>
- %mask0 = vector.splat %m : vector<14x7xi1>
+ %mask0 = vector.broadcast %m : i1 to vector<14x7xi1>
%0 = vector.transfer_read %mem_1[%c0, %c0, %c0, %c0], %cst, %mask0 {in_bounds = [true, false, true, true], permutation_map = #map0} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
// CHECK: vector.transfer_read {{.*}} %[[MASK0]] {in_bounds = [false, true, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<14x7x8x16xf32>
// CHECK: vector.transpose %{{.*}}, [1, 0, 2, 3] : vector<14x7x8x16xf32> to vector<7x14x8x16xf32>
// CHECK: %[[MASK1:.*]] = vector.broadcast %{{.*}} : i1 to vector<16x14xi1>
- %mask1 = vector.splat %m : vector<16x14xi1>
+ %mask1 = vector.broadcast %m : i1 to vector<16x14xi1>
%1 = vector.transfer_read %mem_1[%c0, %c0, %c0, %c0], %cst, %mask1 {in_bounds = [true, false, true, false], permutation_map = #map1} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
// CHECK: vector.transfer_read {{.*}} %[[MASK1]] {in_bounds = [false, false, true, true], permutation_map = #[[$MAP0]]} : memref<?x?x?x?xf32>, vector<16x14x7x8xf32>
// CHECK: vector.transpose %{{.*}}, [2, 1, 3, 0] : vector<16x14x7x8xf32> to vector<7x14x8x16xf32>
// CHECK: %[[MASK3:.*]] = vector.broadcast %{{.*}} : i1 to vector<14x7xi1>
- %mask2 = vector.splat %m : vector<14x7xi1>
+ %mask2 = vector.broadcast %m : i1 to vector<14x7xi1>
%2 = vector.transfer_read %mem_1[%c0, %c0, %c0, %c0], %cst, %mask2 {in_bounds = [true, false, true, true], permutation_map = #map2} : memref<?x?x?x?xf32>, vector<7x14x8x16xf32>
// CHECK: vector.transfer_read {{.*}} %[[MASK3]] {in_bounds = [false, true, true], permutation_map = #[[$MAP1]]} : memref<?x?x?x?xf32>, vector<14x16x7xf32>
// CHECK: vector.broadcast %{{.*}} : vector<14x16x7xf32> to vector<8x14x16x7xf32>
@@ -337,7 +337,7 @@ func.func @transfer_write_permutations_tensor_masked(
%c0 = arith.constant 0 : index
// CHECK: %[[MASK:.*]] = vector.broadcast %[[M]] : i1 to vector<16x14x7x8xi1>
- %mask0 = vector.splat %m : vector<16x14x7x8xi1>
+ %mask0 = vector.broadcast %m : i1 to vector<16x14x7x8xi1>
%res = vector.transfer_write %vec, %dst[%c0, %c0, %c0, %c0], %mask0 {in_bounds = [true, false, false, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d1, d3, d0)>} : vector<7x14x8x16xf32>, tensor<?x?x?x?xf32>
// CHECK: %[[NEW_VEC0:.*]] = vector.transpose %{{.*}} [3, 1, 0, 2] : vector<7x14x8x16xf32> to vector<16x14x7x8xf32>
// CHECK: %[[NEW_RES0:.*]] = vector.transfer_write %[[NEW_VEC0]], %[[DST]][%c0, %c0, %c0, %c0], %[[MASK]] {in_bounds = [true, false, true, false]} : vector<16x14x7x8xf32>, tensor<?x?x?x?xf32>
diff --git a/mlir/test/Dialect/WasmSSA/custom_parser/if.mlir b/mlir/test/Dialect/WasmSSA/custom_parser/if.mlir
new file mode 100644
index 0000000..01068cb
--- /dev/null
+++ b/mlir/test/Dialect/WasmSSA/custom_parser/if.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt %s | FileCheck %s
+
+// CHECK-LABEL: wasmssa.func nested @func_0(
+// CHECK-SAME: %[[ARG0:.*]]: !wasmssa<local ref to i32>) -> i32 {
+// CHECK: %[[VAL_0:.*]] = wasmssa.local_get %[[ARG0]] : ref to i32
+// CHECK: wasmssa.if %[[VAL_0]] : {
+// CHECK: %[[VAL_1:.*]] = wasmssa.const 5.000000e-01 : f32
+// CHECK: wasmssa.block_return %[[VAL_1]] : f32
+// CHECK: } "else "{
+// CHECK: %[[VAL_2:.*]] = wasmssa.const 2.500000e-01 : f32
+// CHECK: wasmssa.block_return %[[VAL_2]] : f32
+// CHECK: }> ^bb1
+// CHECK: ^bb1(%[[VAL_3:.*]]: f32):
+// CHECK: wasmssa.return %[[VAL_3]] : f32
+wasmssa.func nested @func_0(%arg0 : !wasmssa<local ref to i32>) -> i32 {
+ %cond = wasmssa.local_get %arg0 : ref to i32
+ wasmssa.if %cond : {
+ %c0 = wasmssa.const 0.5 : f32
+ wasmssa.block_return %c0 : f32
+ } else {
+ %c1 = wasmssa.const 0.25 : f32
+ wasmssa.block_return %c1 : f32
+ } >^bb1
+ ^bb1(%retVal: f32):
+ wasmssa.return %retVal : f32
+}
+
+// CHECK-LABEL: wasmssa.func nested @func_1(
+// CHECK-SAME: %[[ARG0:.*]]: !wasmssa<local ref to i32>) -> i32 {
+// CHECK: %[[VAL_0:.*]] = wasmssa.local_get %[[ARG0]] : ref to i32
+// CHECK: %[[VAL_1:.*]] = wasmssa.local of type i32
+// CHECK: %[[VAL_2:.*]] = wasmssa.const 0 : i64
+// CHECK: wasmssa.if %[[VAL_0]] : {
+// CHECK: %[[VAL_3:.*]] = wasmssa.const 1 : i32
+// CHECK: wasmssa.local_set %[[VAL_1]] : ref to i32 to %[[VAL_3]] : i32
+// CHECK: wasmssa.block_return
+// CHECK: } > ^bb1
+// CHECK: ^bb1:
+// CHECK: %[[VAL_4:.*]] = wasmssa.local_get %[[VAL_1]] : ref to i32
+// CHECK: wasmssa.return %[[VAL_4]] : i32
+wasmssa.func nested @func_1(%arg0 : !wasmssa<local ref to i32>) -> i32 {
+ %cond = wasmssa.local_get %arg0 : ref to i32
+ %var = wasmssa.local of type i32
+ %zero = wasmssa.const 0
+ wasmssa.if %cond : {
+ %c1 = wasmssa.const 1 : i32
+ wasmssa.local_set %var : ref to i32 to %c1 : i32
+ wasmssa.block_return
+ } >^bb1
+ ^bb1:
+ %res = wasmssa.local_get %var : ref to i32
+ wasmssa.return %res : i32
+}
diff --git a/mlir/test/Dialect/WasmSSA/custom_parser/memory.mlir b/mlir/test/Dialect/WasmSSA/custom_parser/memory.mlir
new file mode 100644
index 0000000..47551db
--- /dev/null
+++ b/mlir/test/Dialect/WasmSSA/custom_parser/memory.mlir
@@ -0,0 +1,7 @@
+// RUN: mlir-opt %s | FileCheck %s
+
+// CHECK: wasmssa.memory @mem0 public !wasmssa<limit[0: 65536]>
+wasmssa.memory @mem0 public !wasmssa<limit[0:65536]>
+
+// CHECK: wasmssa.memory @mem1 nested !wasmssa<limit[512:]>
+wasmssa.memory @mem1 !wasmssa<limit[512:]>
diff --git a/mlir/test/Dialect/WasmSSA/custom_parser/table.mlir b/mlir/test/Dialect/WasmSSA/custom_parser/table.mlir
new file mode 100644
index 0000000..5a874f4
--- /dev/null
+++ b/mlir/test/Dialect/WasmSSA/custom_parser/table.mlir
@@ -0,0 +1,7 @@
+// RUN: mlir-opt %s | FileCheck %s
+
+// CHECK: wasmssa.table @tab0 public !wasmssa<tabletype !wasmssa.externref [0: 65536]>
+wasmssa.table @tab0 public !wasmssa<tabletype !wasmssa.externref [0:65536]>
+
+// CHECK: wasmssa.table @tab1 nested !wasmssa<tabletype !wasmssa.funcref [348:]>
+wasmssa.table @tab1 !wasmssa<tabletype !wasmssa.funcref [348:]>
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index dff3ffa..44e15dd 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -52,14 +52,14 @@ func.func @create_nd_tdesc_7(%src: memref<128x128xf32>) {
// -----
func.func @create_nd_tdesc_8(%src: ui64) {
- // expected-error@+1 {{'xegpu.create_nd_tdesc' op Expecting strides and shape to be present for integer source}}
+ // expected-error@+1 {{'xegpu.create_nd_tdesc' op expecting strides and shape to be present for integer source}}
%1 = xegpu.create_nd_tdesc %src : ui64-> !xegpu.tensor_desc<128x128xf32>
return
}
// -----
func.func @create_nd_tdesc_9(%src: ui64) {
- // expected-error@+1 {{expected mixed offsets rank to match mixed sizes rank}}
+ // expected-error@+1 {{expecting strides and shape to be present for integer source}}
%1 = xegpu.create_nd_tdesc %src[0, 0] : ui64-> !xegpu.tensor_desc<128x128xf32>
return
}
@@ -149,7 +149,7 @@ func.func @subgroup_load_nd_offset_2(%src: memref<4x8x16xf16>, %x : index) {
}
// -----
-func.func @subgroup_load_nd_offset_3(%src: memref<4x8x16xf16>, %x : index) {
+func.func @subgroup_load_nd_offset_3(%src: memref<4x8x16xf16>, %x : index) {
%3 = xegpu.create_nd_tdesc %src: memref<4x8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
%5 = xegpu.load_nd %3[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
// expected-error@+1 {{Mismatched ranks between offsets and tensor descriptor}}
@@ -418,7 +418,7 @@ func.func @store_scatter_offset_wi_1(%src: memref<?xf16>) {
%offsets = arith.constant dense<[0]> : vector<1xindex>
%mask = arith.constant dense<1>: vector<1xi1>
// expected-error@+1 {{value elements must match chunk size}}
- xegpu.store %val, %src[%offsets], %mask
+ xegpu.store %val, %src[%offsets], %mask
: vector<4xf16>, memref<?xf16>, vector<1xindex>, vector<1xi1>
return
}
@@ -429,7 +429,7 @@ func.func @store_scatter_offset_wi_2(%src: memref<4x4xf16>) {
%offsets = arith.constant dense<[0]> : vector<1xindex>
%mask = arith.constant dense<1>: vector<1xi1>
// expected-error@+1 {{Expecting the dest is a 1D memref or pointer}}
- xegpu.store %val, %src[%offsets], %mask
+ xegpu.store %val, %src[%offsets], %mask
: vector<4xf16>, memref<4x4xf16>, vector<1xindex>, vector<1xi1>
return
}
@@ -743,3 +743,22 @@ func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
#xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2], order = [0, 1, 2]>>
return
}
+
+// -----
+#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>
+// expected-error@+1 {{repeated dim (2) in slice attribute}}
+#s = #xegpu.slice<#l, dims = [2, 2]>
+func.func @slice_attr_repeat_dim() {
+ %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex>
+ return
+}
+
+// -----
+#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>
+// expected-error@+1 {{invalid dim (3) in slice attribute}}
+#s = #xegpu.slice<#l, dims = [3]>
+func.func @slice_attr_invalid_dim() {
+ %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex>
+ return
+}
+
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index 017dacc..e4b4e22 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -50,4 +50,27 @@ gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
gpu.return
}
+gpu.func @slice_attr() {
+ //CHECK: arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+ gpu.return
+}
+
+gpu.func @nested_slice_attr() {
+ //CHECK: arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>, dims = [1]>} dense<8> : vector<16xindex>
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>, dims = [1]>} dense<8> : vector<16xindex>
+ gpu.return
+}
+
+gpu.func @softmax_dim_0(%arg0: vector<256x128xf32>) -> vector<256x128xf32> {
+ %cst = arith.constant dense<0.000000e+00> : vector<128xf32>
+ %0 = math.exp %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
+ //CHECK: vector.multi_reduction <add>, {{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+ %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+ //CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
+ %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
+ %3 = arith.divf %0, %2 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
+ gpu.return %3 : vector<256x128xf32>
+}
+
}
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 6be2371..67c00f5 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -62,28 +62,28 @@ gpu.func @create_nd_tdesc_7(%src: memref<8x24x32x48x64xf32>) {
}
-// CHECK: gpu.func @test_create_nd_tdesc_7(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index, %[[arg5:.*]]: memref<24x32xf32>)
+// CHECK: gpu.func @test_create_nd_tdesc_7(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index, %[[arg5:.*]]: memref<24x32xf32>)
gpu.func @test_create_nd_tdesc_7(%src: ui64, %w : index, %h : index, %x : index, %y : index, %src2: memref<24x32xf32>) {
//CHECK: %[[C:.*]] = arith.constant 1 : index
%c1 = arith.constant 1 : index
-
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg5]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg5]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
%3 = xegpu.create_nd_tdesc %src2 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
-
+
gpu.return
}
-// CHECK: gpu.func @test_create_nd_tdesc_8(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index)
+// CHECK: gpu.func @test_create_nd_tdesc_8(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index)
gpu.func @test_create_nd_tdesc_8(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
-
- %c1 = arith.constant 1 : index
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0], shape : [%arg2, %arg1], strides : [%arg1, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+
+ %c1 = arith.constant 1 : index
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0, shape : [%arg2, %arg1], strides : [%arg1, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
%2 = xegpu.create_nd_tdesc %src, shape : [%h, %w], strides : [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
-
+
gpu.return
}
-// CHECK-LABEL: func @test_create_nd_tdesc_9({{.*}})
+// CHECK-LABEL: func @test_create_nd_tdesc_9({{.*}})
gpu.func @test_create_nd_tdesc_9(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
@@ -94,10 +94,10 @@ gpu.func @test_create_nd_tdesc_9(%src: memref<?x?xf16>, %w : index, %h : index,
gpu.return
}
-// CHECK-LABEL: func @test_create_nd_tdesc_10({{.*}})
-gpu.func @test_create_nd_tdesc_10(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
+// CHECK-LABEL: func @test_create_nd_tdesc_10({{.*}})
+gpu.func @test_create_nd_tdesc_10(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0], shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0, shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16>
%2 = xegpu.create_nd_tdesc %src, shape:[%h, %w], strides:[%w, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16>
gpu.return
@@ -123,7 +123,7 @@ gpu.func @prefetch_nd_2(%src: memref<48x64xf16>) {
// CHECK: gpu.func @prefetch_nd_offset_1(%[[arg0:.*]]: memref<48x64xf16>, %arg1: index, %arg2: index) {
gpu.func @prefetch_nd_offset_1(%src: memref<48x64xf16>, %x : index, %y : index) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16>
%1 = xegpu.create_nd_tdesc %src : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16>
// CHECK: xegpu.prefetch_nd %[[R0]][%arg1, %arg2] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
xegpu.prefetch_nd %1[%x, %y] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16>
@@ -271,7 +271,7 @@ gpu.func @subgroup_load_nd_8(%src: memref<24x32xf32>) {
// CHECK: func @subgroup_load_nd_offset_1(%[[arg0:.*]]: memref<24x32xf32>, %arg1: index, %arg2: index) {
gpu.func @subgroup_load_nd_offset_1(%src: memref<24x32xf32>, %x : index, %y : index) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
%1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
// CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][%arg1, %arg2] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32>
%2 = xegpu.load_nd %1[%x, %y] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32>
@@ -290,7 +290,7 @@ gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) {
// CHECK: func @simt_load_nd_offset_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @simt_load_nd_offset_1(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
%1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
// CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
%2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
@@ -323,7 +323,7 @@ gpu.func @simt_store_nd(%src: memref<24x32xf16>) {
gpu.func @subgroup_store_nd_2(%dst: memref<24x32xf16>, %x : index) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16>
%1 = arith.constant dense<1.0>: vector<32xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
%2 = xegpu.create_nd_tdesc %dst : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
// CHECK: xegpu.store_nd %[[C]], %[[R0]][%arg1] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<32xf16>, !xegpu.tensor_desc<32xf16>
xegpu.store_nd %1, %2[%x] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<32xf16>, !xegpu.tensor_desc<32xf16>
@@ -356,7 +356,7 @@ gpu.func @simt_store_nd_2(%src: memref<24x32xf16>) {
gpu.func @simt_store_nd_offset_1(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16>
%1 = arith.constant dense<1.0>: vector<2xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
%2 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
// CHECK: xegpu.store_nd %[[C]], %[[R0]][0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<32xf16>
xegpu.store_nd %1, %2[0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2xf16>, !xegpu.tensor_desc<32xf16>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
new file mode 100644
index 0000000..547c735
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
@@ -0,0 +1,37 @@
+// RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s
+
+//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 8)>
+gpu.module @test {
+ gpu.func @slice_attr() -> vector<128xindex> {
+ //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
+ //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
+ //CHECK: [[c32:%.+]] = arith.constant 32 : index
+ //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
+ //CHECK: [[c0:%.+]] = arith.constant 0 : index
+ //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
+ //CHECK: [[c128:%.+]] = arith.constant 128 : index
+ //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+ //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
+ //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
+ //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
+ %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex>
+ gpu.return %step : vector<128xindex>
+ }
+
+ gpu.func @nested_slice_attr() -> vector<128xindex> {
+ //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
+ //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
+ //CHECK: [[c32:%.+]] = arith.constant 32 : index
+ //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
+ //CHECK: [[c0:%.+]] = arith.constant 0 : index
+ //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
+ //CHECK: [[c128:%.+]] = arith.constant 128 : index
+ //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+ //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
+ //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
+ //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
+ %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 1], sg_data = [32, 32, 1]>, dims = [2]>, dims = [1]>} : vector<128xindex>
+ gpu.return %0 : vector<128xindex>
+ }
+
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 628a485..e5cc65e 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -1,5 +1,8 @@
// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
+#map = affine_map<()[s0] -> (s0 floordiv 4)>
+#map1 = affine_map<()[s0] -> (s0 mod 4)>
+
gpu.module @test_round_robin_assignment {
// CHECK-LABEL: create_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
@@ -12,6 +15,30 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
+ // CHECK-LABEL: create_nd_tdesc_with_shared_data
+ // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32>
+ gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) {
+ //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
+ //CHECK: [[IdY:%.+]] = affine.apply #map()[[[sgId]]]
+ //CHECK: [[IdX:%.+]] = affine.apply #map1()[[[sgId]]]
+ //CHECK: [[C16:%.+]] = arith.constant 16 : index
+ //CHECK: [[LY:%.+]] = index.mul [[IdY]], [[C16]]
+ //CHECK: [[C64:%.+]] = arith.constant 64 : index
+ //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]]
+ //CHECK: [[C0:%.+]] = arith.constant 0 : index
+ //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
+ //CHECK: [[ADDY:%.+]] = arith.addi [[LY]], [[C0]] : index
+ //CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
+ //CHECK: [[C128:%.+]] = arith.constant 128 : index
+ //CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]]
+ //CHECK: [[C64_2:%.+]] = arith.constant 64 : index
+ //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C64_2]]
+ //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32>
+ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+ -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>
+ gpu.return
+ }
+
// CHECK-LABEL: load_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) {
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index d4b0037..180ba8a 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -4,34 +4,26 @@
//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)>
gpu.module @test_1_1_assignment {
// CHECK-LABEL: create_nd_tdesc
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
+ // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32>
gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) {
- // CHECK: %[[SGID:.*]] = gpu.subgroup_id
- // CHECK: %[[C8:.*]] = arith.constant 8 : index
- // CHECK: %[[C32:.*]] = arith.constant 32 : index
- // CHECK: %[[C4:.*]] = arith.constant 4 : index
- // CHECK: %[[C32_0:.*]] = arith.constant 32 : index
- // CHECK: %[[C4_1:.*]] = arith.constant 4 : index
- // CHECK: %[[DIV:.*]] = affine.apply #map()[%[[SGID]]]
- // CHECK: %[[REM:.*]] = affine.apply #map1()[%[[SGID]]]
- // CHECK: %[[MUL1:.*]] = index.mul %[[DIV]], %[[C32]]
- // CHECK: %[[MUL2:.*]] = index.mul %[[REM]], %[[C32_0]]
- // CHECK: %[[C0:.*]] = arith.constant 0 : index
- // CHECK: %[[C256:.*]] = arith.constant 256 : index
- // CHECK: %[[MOD:.*]] = index.remu %[[MUL1]], %[[C256]]
- // CHECK: %[[C0_2:.*]] = arith.constant 0 : index
- // CHECK: %[[ADD1:.*]] = index.add %[[MOD]], %[[C0_2]]
- // CHECK: %[[C0_3:.*]] = arith.constant 0 : index
- // CHECK: %[[C128:.*]] = arith.constant 128 : index
- // CHECK: %[[MOD1:.*]] = index.remu %[[MUL2]], %[[C128]]
- // CHECK: %[[C0_4:.*]] = arith.constant 0 : index
- // CHECK: %[[ADD2:.*]] = index.add %[[MOD1]], %[[C0_4]]
- // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[ADD1]], %[[ADD2]]] : memref<256x128xf32>
- // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: gpu.return
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
+ //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index
+ //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]]
+ //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]]
+ //CHECK: [[C32:%.+]] = arith.constant 32 : index
+ //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]]
+ //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
+ //CHECK: [[C0:%.+]] = arith.constant 0 : index
+ //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
+ //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
+ //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
+ //CHECK: [[C256:%.+]] = arith.constant 256 : index
+ //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C256]]
+ //CHECK: [[C128:%.+]] = arith.constant 128 : index
+ //CHECK: [[X:%.+]] = index.remu [[UX]], [[C128]]
+ //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+ -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
}
// CHECK-LABEL: load_nd_tdesc
@@ -347,7 +339,7 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
// CHECK-LABEL: @subgroup_id_range_nested_if
gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) {
%sg_id = gpu.subgroup_id : index
- %c1 = arith.constant 1 : i1
+ %c1 = arith.constant 1 : i1
%c3 = arith.constant 3 : index
%c32 = arith.constant 32 : index
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
diff --git a/mlir/test/Dialect/common_folders.mlir b/mlir/test/Dialect/common_folders.mlir
new file mode 100644
index 0000000..92598b4
--- /dev/null
+++ b/mlir/test/Dialect/common_folders.mlir
@@ -0,0 +1,22 @@
+// RUN: mlir-opt %s --test-fold-type-converting-op --split-input-file | FileCheck %s
+
+// CHECK-LABEL: @test_fold_unary_op_f32_to_si32(
+func.func @test_fold_unary_op_f32_to_si32() -> tensor<4x2xsi32> {
+ // CHECK-NEXT: %[[POSITIVE_ONE:.*]] = arith.constant dense<1> : tensor<4x2xsi32>
+ // CHECK-NEXT: return %[[POSITIVE_ONE]] : tensor<4x2xsi32>
+ %operand = arith.constant dense<5.1> : tensor<4x2xf32>
+ %sign = test.sign %operand : (tensor<4x2xf32>) -> tensor<4x2xsi32>
+ return %sign : tensor<4x2xsi32>
+}
+
+// -----
+
+// CHECK-LABEL: @test_fold_binary_op_f32_to_i1(
+func.func @test_fold_binary_op_f32_to_i1() -> tensor<8xi1> {
+ // CHECK-NEXT: %[[FALSE:.*]] = arith.constant dense<false> : tensor<8xi1>
+ // CHECK-NEXT: return %[[FALSE]] : tensor<8xi1>
+ %lhs = arith.constant dense<5.1> : tensor<8xf32>
+ %rhs = arith.constant dense<4.2> : tensor<8xf32>
+ %less_than = test.less_than %lhs, %rhs : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xi1>
+ return %less_than : tensor<8xi1>
+}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir
index 06a6e22..9d04357 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir
@@ -9,7 +9,12 @@
// RUN: FileCheck %s
func.func @matmul_transpose_a(%A : tensor<?x?xf32>, %B : tensor<?x?xf32>, %C : tensor<?x?xf32>) {
- %res = linalg.matmul_transpose_a ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>)
+ %res = linalg.matmul
+ indexing_maps = [
+ affine_map<(d0, d1, d2) -> (d2, d0)>,
+ affine_map<(d0, d1, d2) -> (d2, d1)>,
+ affine_map<(d0, d1, d2) -> (d0, d1)>]
+ ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>)
outs(%C: tensor<?x?xf32>) -> tensor<?x?xf32>
%xf = tensor.cast %res : tensor<?x?xf32> to tensor<*xf32>
call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
@@ -56,7 +61,7 @@ func.func @main() {
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%module : !transform.any_op {transform.readonly}) {
- %matmul_transpose_a = transform.structured.match ops{["linalg.matmul_transpose_a"]} in %module
+ %matmul_transpose_a = transform.structured.match ops{["linalg.matmul"]} in %module
: (!transform.any_op) -> !transform.any_op
// Step 1: Tile for size [4] x [4], which corresponds to SVLs x SVLs, where
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir
index 0ee0166..219367a 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir
@@ -46,7 +46,7 @@ func.func @test_outerproduct_with_accumulator_4x4xf32() {
%c0 = arith.constant 0 : index
%f10 = arith.constant 10.0 : f32
- %acc = vector.splat %f10 : vector<[4]x[4]xf32>
+ %acc = vector.broadcast %f10 : f32 to vector<[4]x[4]xf32>
%vector_i32 = llvm.intr.stepvector : vector<[4]xi32>
%vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32>
%tile = vector.outerproduct %vector, %vector, %acc : vector<[4]xf32>, vector<[4]xf32>
@@ -103,7 +103,7 @@ func.func @test_masked_outerproduct_with_accumulator_4x4xf32() {
%ones = arith.constant dense<1> : vector<[4]xi32>
%f10 = arith.constant 10.0 : f32
- %acc = vector.splat %f10 : vector<[4]x[4]xf32>
+ %acc = vector.broadcast %f10 : f32 to vector<[4]x[4]xf32>
%step_vector = llvm.intr.stepvector : vector<[4]xi32>
%vector_i32 = arith.addi %step_vector, %ones : vector<[4]xi32>
%vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32>
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir
index 8e81210..059f24a 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f64.mlir
@@ -52,7 +52,7 @@ func.func @test_outerproduct_with_accumulator_2x2xf64() {
%ones = arith.constant dense<1> : vector<[2]xi32>
%f10 = arith.constant 10.0 : f64
- %acc = vector.splat %f10 : vector<[2]x[2]xf64>
+ %acc = vector.broadcast %f10 : f64 to vector<[2]x[2]xf64>
%step_vector = llvm.intr.stepvector : vector<[2]xi32>
%vector_i32 = arith.addi %step_vector, %ones : vector<[2]xi32>
%vector = arith.sitofp %vector_i32 : vector<[2]xi32> to vector<[2]xf64>
@@ -108,7 +108,7 @@ func.func @test_masked_outerproduct_with_accumulator_2x2xf64() {
%ones = arith.constant dense<1> : vector<[2]xi32>
%f10 = arith.constant 10.0 : f64
- %acc = vector.splat %f10 : vector<[2]x[2]xf64>
+ %acc = vector.broadcast %f10 : f64 to vector<[2]x[2]xf64>
%step_vector = llvm.intr.stepvector : vector<[2]xi32>
%vector_i32 = arith.addi %step_vector, %ones : vector<[2]xi32>
%vector = arith.sitofp %vector_i32 : vector<[2]xi32> to vector<[2]xf64>
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir
index c3bf379..bf6900c 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir
@@ -10,7 +10,7 @@
// Vector store.
func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) {
%c0 = arith.constant 0.0 : f32
- %zero = vector.splat %c0 : vector<[4]x[4]xf32>
+ %zero = vector.broadcast %c0 : f32 to vector<[4]x[4]xf32>
vector.transfer_write %zero, %A[%base1, %base2] {in_bounds=[true, true]} :
vector<[4]x[4]xf32>, memref<?x?xf32>
return
@@ -22,7 +22,7 @@ func.func @transfer_write_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: i
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%mask = vector.create_mask %c2, %c3 : vector<[4]x[4]xi1>
- %zero = vector.splat %c0 : vector<[4]x[4]xf32>
+ %zero = vector.broadcast %c0 : f32 to vector<[4]x[4]xf32>
vector.transfer_write %zero, %A[%base1, %base2], %mask {in_bounds=[true, true]} :
vector<[4]x[4]xf32>, memref<?x?xf32>
return
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir
index c990432..192f291 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction.mlir
@@ -106,7 +106,7 @@ func.func @matvec_i32() {
// val = (123 * 314) * 4 * vscale
// so ...
%vscale = vector.vscale
- %vscale_v = vector.splat %vscale : vector<3xindex>
+ %vscale_v = vector.broadcast %vscale : index to vector<3xindex>
%vscale_i32 = arith.index_cast %vscale_v : vector<3xindex> to vector<3xi32>
%mv1_div = arith.divui %mv1, %vscale_i32 : vector<3xi32>
// ... val / vscale = 123 * 314 * 4 = 154488
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir
index d3b1fa4..2d8180a 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir
@@ -7,8 +7,8 @@
func.func @entry() {
%f1 = arith.constant 1.0 : f32
%f2 = arith.constant 2.0 : f32
- %v1 = vector.splat %f1 : vector<[4]xf32>
- %v2 = vector.splat %f2 : vector<[4]xf32>
+ %v1 = vector.broadcast %f1 : f32 to vector<[4]xf32>
+ %v2 = vector.broadcast %f2 : f32 to vector<[4]xf32>
vector.print %v1 : vector<[4]xf32>
vector.print %v2 : vector<[4]xf32>
//
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir
index f812c25..740c742 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/interleave.mlir
@@ -6,8 +6,8 @@
func.func @entry() {
%f1 = arith.constant 1.0 : f32
%f2 = arith.constant 2.0 : f32
- %v1 = vector.splat %f1 : vector<2x4xf32>
- %v2 = vector.splat %f2 : vector<2x4xf32>
+ %v1 = vector.broadcast %f1 : f32 to vector<2x4xf32>
+ %v2 = vector.broadcast %f2 : f32 to vector<2x4xf32>
vector.print %v1 : vector<2x4xf32>
vector.print %v2 : vector<2x4xf32>
//
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir
index f7e2229..e25795a 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-f32.mlir
@@ -14,9 +14,9 @@
!vector_type_R = vector<7xf32>
func.func @vector_outerproduct_splat_8x8(%fa: f32, %fb: f32, %fc: f32) -> !vector_type_C {
- %a = vector.splat %fa: !vector_type_A
- %b = vector.splat %fb: !vector_type_B
- %c = vector.splat %fc: !vector_type_C
+ %a = vector.broadcast %fa: f32 to !vector_type_A
+ %b = vector.broadcast %fb: f32 to !vector_type_B
+ %c = vector.broadcast %fc: f32 to !vector_type_C
%d = vector.outerproduct %a, %b, %c : !vector_type_A, !vector_type_B
return %d: !vector_type_C
}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir
index a19dfa1..0675102 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/outerproduct-i64.mlir
@@ -14,9 +14,9 @@
!vector_type_R = vector<7xi64>
func.func @vector_outerproduct_splat_8x8(%ia: i64, %ib: i64, %ic: i64) -> !vector_type_C {
- %a = vector.splat %ia: !vector_type_A
- %b = vector.splat %ib: !vector_type_B
- %c = vector.splat %ic: !vector_type_C
+ %a = vector.broadcast %ia: i64 to !vector_type_A
+ %b = vector.broadcast %ib: i64 to !vector_type_B
+ %c = vector.broadcast %ic: i64 to !vector_type_C
%d = vector.outerproduct %a, %b, %c : !vector_type_A, !vector_type_B
return %d: !vector_type_C
}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir
index 639eed4..895b881 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir
@@ -137,7 +137,7 @@ func.func @transfer_read_1d_mask_in_bounds(
// Non-contiguous, strided store.
func.func @transfer_write_1d(%A : memref<?x?xf32>, %base1 : index, %base2 : index) {
%fn1 = arith.constant -1.0 : f32
- %vf0 = vector.splat %fn1 : vector<7xf32>
+ %vf0 = vector.broadcast %fn1 : f32 to vector<7xf32>
vector.transfer_write %vf0, %A[%base1, %base2]
{permutation_map = affine_map<(d0, d1) -> (d0)>}
: vector<7xf32>, memref<?x?xf32>
@@ -147,7 +147,7 @@ func.func @transfer_write_1d(%A : memref<?x?xf32>, %base1 : index, %base2 : inde
// Non-contiguous, strided store.
func.func @transfer_write_1d_mask(%A : memref<?x?xf32>, %base1 : index, %base2 : index) {
%fn1 = arith.constant -2.0 : f32
- %vf0 = vector.splat %fn1 : vector<7xf32>
+ %vf0 = vector.broadcast %fn1 : f32 to vector<7xf32>
%mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1]> : vector<7xi1>
vector.transfer_write %vf0, %A[%base1, %base2], %mask
{permutation_map = affine_map<(d0, d1) -> (d0)>}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir
index 009c137..80dff9d 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir
@@ -100,7 +100,7 @@ func.func @transfer_read_2d_broadcast(
// Vector store.
func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) {
%fn1 = arith.constant -1.0 : f32
- %vf0 = vector.splat %fn1 : vector<1x4xf32>
+ %vf0 = vector.broadcast %fn1 : f32 to vector<1x4xf32>
vector.transfer_write %vf0, %A[%base1, %base2]
{permutation_map = affine_map<(d0, d1) -> (d0, d1)>} :
vector<1x4xf32>, memref<?x?xf32>
@@ -111,7 +111,7 @@ func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index)
func.func @transfer_write_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: index) {
%fn1 = arith.constant -2.0 : f32
%mask = arith.constant dense<[[1, 0, 1, 0]]> : vector<1x4xi1>
- %vf0 = vector.splat %fn1 : vector<1x4xf32>
+ %vf0 = vector.broadcast %fn1 : f32 to vector<1x4xf32>
vector.transfer_write %vf0, %A[%base1, %base2], %mask
{permutation_map = affine_map<(d0, d1) -> (d0, d1)>} :
vector<1x4xf32>, memref<?x?xf32>
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir
index d41d9c9..93e6a12 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir
@@ -62,7 +62,7 @@ func.func @transfer_read_3d_transposed(%A : memref<?x?x?x?xf32>,
func.func @transfer_write_3d(%A : memref<?x?x?x?xf32>,
%o: index, %a: index, %b: index, %c: index) {
%fn1 = arith.constant -1.0 : f32
- %vf0 = vector.splat %fn1 : vector<2x9x3xf32>
+ %vf0 = vector.broadcast %fn1 : f32 to vector<2x9x3xf32>
vector.transfer_write %vf0, %A[%o, %a, %b, %c]
: vector<2x9x3xf32>, memref<?x?x?x?xf32>
return
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir
index d1a2790..18084e3 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read.mlir
@@ -45,7 +45,7 @@ func.func @transfer_read_mask_inbounds_4(%A : memref<?xf32>, %base: index) {
func.func @transfer_write_1d(%A : memref<?xf32>, %base: index) {
%f0 = arith.constant 0.0 : f32
- %vf0 = vector.splat %f0 : vector<4xf32>
+ %vf0 = vector.broadcast %f0 : f32 to vector<4xf32>
vector.transfer_write %vf0, %A[%base]
{permutation_map = affine_map<(d0) -> (d0)>} :
vector<4xf32>, memref<?xf32>
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir
index def7081..2251738 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-write.mlir
@@ -5,7 +5,7 @@
func.func @transfer_write16_inbounds_1d(%A : memref<?xf32>, %base: index) {
%f = arith.constant 16.0 : f32
- %v = vector.splat %f : vector<16xf32>
+ %v = vector.broadcast %f : f32 to vector<16xf32>
vector.transfer_write %v, %A[%base]
{permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]}
: vector<16xf32>, memref<?xf32>
@@ -14,7 +14,7 @@ func.func @transfer_write16_inbounds_1d(%A : memref<?xf32>, %base: index) {
func.func @transfer_write13_1d(%A : memref<?xf32>, %base: index) {
%f = arith.constant 13.0 : f32
- %v = vector.splat %f : vector<13xf32>
+ %v = vector.broadcast %f : f32 to vector<13xf32>
vector.transfer_write %v, %A[%base]
{permutation_map = affine_map<(d0) -> (d0)>}
: vector<13xf32>, memref<?xf32>
@@ -23,7 +23,7 @@ func.func @transfer_write13_1d(%A : memref<?xf32>, %base: index) {
func.func @transfer_write17_1d(%A : memref<?xf32>, %base: index) {
%f = arith.constant 17.0 : f32
- %v = vector.splat %f : vector<17xf32>
+ %v = vector.broadcast %f : f32 to vector<17xf32>
vector.transfer_write %v, %A[%base]
{permutation_map = affine_map<(d0) -> (d0)>}
: vector<17xf32>, memref<?xf32>
@@ -42,7 +42,7 @@ func.func @transfer_read_1d(%A : memref<?xf32>) -> vector<32xf32> {
func.func @transfer_write_inbounds_3d(%A : memref<4x4x4xf32>) {
%c0 = arith.constant 0: index
%f = arith.constant 0.0 : f32
- %v0 = vector.splat %f : vector<2x3x4xf32>
+ %v0 = vector.broadcast %f : f32 to vector<2x3x4xf32>
%f1 = arith.constant 1.0 : f32
%f2 = arith.constant 2.0 : f32
%f3 = arith.constant 3.0 : f32
diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir
new file mode 100644
index 0000000..7e66dee
--- /dev/null
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir
@@ -0,0 +1,59 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_levelzero_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+ memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
+ memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
+ func.func @main() {
+ %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
+ %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
+ %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
+ %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
+ call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+ return
+ }
+ func.func private @printMemrefF32(memref<*xf32>)
+ func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
+ %c2 = arith.constant 2 : index
+ %c1 = arith.constant 1 : index
+ %mem = gpu.alloc host_shared () : memref<2x2x2xf32>
+ memref.copy %arg1, %mem : memref<2x2x2xf32> to memref<2x2x2xf32>
+ %memref_0 = gpu.alloc host_shared () : memref<2x2x2xf32>
+ memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
+ %memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
+ %2 = gpu.wait async
+ %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1)
+ args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
+ gpu.wait [%3]
+ %alloc = memref.alloc() : memref<2x2x2xf32>
+ memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
+ %4 = gpu.wait async
+ %5 = gpu.dealloc async [%4] %memref_2 : memref<2x2x2xf32>
+ %6 = gpu.dealloc async [%5] %memref_0 : memref<2x2x2xf32>
+ %7 = gpu.dealloc async [%6] %mem : memref<2x2x2xf32>
+ gpu.wait [%7]
+ return %alloc : memref<2x2x2xf32>
+ }
+ gpu.module @test_kernel
+ attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+ gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel
+ attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = gpu.block_id z
+ %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
+ %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
+ %5 = arith.addf %3, %4 : f32
+ memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
+ gpu.return
+ }
+ }
+ // CHECK: [2.3, 4.5]
+ // CHECK: [7.8, 10.2]
+ // CHECK: [12.7, 14.9]
+ // CHECK: [18.2, 20.6]
+}
diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir
new file mode 100644
index 0000000..df8fbe4
--- /dev/null
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir
@@ -0,0 +1,57 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_levelzero_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+ memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
+ memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
+ func.func @main() {
+ %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64>
+ %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64>
+ %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64>
+ %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64>
+ call @printMemrefI64(%cast) : (memref<*xi64>) -> ()
+ return
+ }
+ func.func private @printMemrefI64(memref<*xi64>)
+ func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> {
+ %c3 = arith.constant 3 : index
+ %c1 = arith.constant 1 : index
+ %mem = gpu.alloc host_shared () : memref<3x3xi64>
+ memref.copy %arg1, %mem : memref<3x3xi64> to memref<3x3xi64>
+ %memref_0 = gpu.alloc host_shared () : memref<3x3xi64>
+ memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
+ %memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
+ %2 = gpu.wait async
+ %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1)
+ args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
+ gpu.wait [%3]
+ %alloc = memref.alloc() : memref<3x3xi64>
+ memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
+ %4 = gpu.wait async
+ %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
+ %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
+ %7 = gpu.dealloc async [%6] %mem : memref<3x3xi64>
+ gpu.wait [%7]
+ return %alloc : memref<3x3xi64>
+ }
+ gpu.module @test_kernel
+ attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+ gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel
+ attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
+ %3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
+ %4 = arith.addi %2, %3 : i64
+ memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
+ gpu.return
+ }
+ }
+ // CHECK: [2, 4100, 6],
+ // CHECK: [16777224, 10, 4294971404],
+ // CHECK: [16777230, 1103806595088, 1099511627794]
+}
diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir
new file mode 100644
index 0000000..cd99f2c
--- /dev/null
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir
@@ -0,0 +1,56 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(gpu-async-region),spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_levelzero_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+ memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
+ memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
+ func.func @main() {
+ %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
+ %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
+ %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
+ %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
+ call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+ memref.dealloc %2 : memref<2x2x2xf32>
+ return
+ }
+ func.func private @printMemrefF32(memref<*xf32>)
+ func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
+ %c2 = arith.constant 2 : index
+ %c1 = arith.constant 1 : index
+ %memref = gpu.alloc () : memref<2x2x2xf32>
+ gpu.memcpy %memref, %arg0 : memref<2x2x2xf32>, memref<2x2x2xf32>
+ %memref_0 = gpu.alloc () : memref<2x2x2xf32>
+ gpu.memcpy %memref_0, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32>
+ %memref_1 = gpu.alloc () : memref<2x2x2xf32>
+ gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1)
+ args(%memref : memref<2x2x2xf32>, %memref_0 : memref<2x2x2xf32>, %memref_1 : memref<2x2x2xf32>)
+ %alloc = memref.alloc() : memref<2x2x2xf32>
+ gpu.memcpy %alloc, %memref_1 : memref<2x2x2xf32>, memref<2x2x2xf32>
+ gpu.dealloc %memref_1 : memref<2x2x2xf32>
+ gpu.dealloc %memref_0 : memref<2x2x2xf32>
+ gpu.dealloc %memref : memref<2x2x2xf32>
+ return %alloc : memref<2x2x2xf32>
+ }
+ gpu.module @test_kernel
+ attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+ gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel
+ attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = gpu.block_id z
+ %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
+ %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
+ %5 = arith.addf %3, %4 : f32
+ memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
+ gpu.return
+ }
+ }
+ // CHECK: [2.3, 4.5]
+ // CHECK: [7.8, 10.2]
+ // CHECK: [12.7, 14.9]
+ // CHECK: [18.2, 20.6]
+}
diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir
new file mode 100644
index 0000000..8d022ac
--- /dev/null
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir
@@ -0,0 +1,86 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_levelzero_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @relu attributes {gpu.container_module} {
+ memref.global "private" constant @__constant_4x5xf32 : memref<4x5xf32> = dense<[
+ [-1.000000e-01, -2.000000e-01, -3.000000e-01, 4.000000e-01, 5.000000e-01],
+ [1.000000e-01, -2.000000e-01, 3.000000e-01, -4.000000e-01, 5.000000e-01],
+ [1.000000e-01, 2.000000e-01, 3.000000e-01, -4.000000e-01, -5.000000e-01],
+ [1.000000e-01, 2.000000e-01, 3.000000e-01, 4.000000e-01, 5.000000e-01]
+ ]>
+
+ func.func @main() {
+ %c1 = arith.constant 1 : index
+ %c100 = arith.constant 100 : index
+ %c0 = arith.constant 0 : index
+ %0 = memref.get_global @__constant_4x5xf32 : memref<4x5xf32>
+
+ scf.for %arg0 = %c0 to %c100 step %c1 {
+ %1 = func.call @test(%0) : (memref<4x5xf32>) -> memref<4x5xf32>
+ %cast = memref.cast %1 : memref<4x5xf32> to memref<*xf32>
+ func.call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+ // CHECK: [0, 0, 0, 0.4, 0.5],
+ // CHECK: [0.1, 0, 0.3, 0, 0.5],
+ // CHECK: [0.1, 0.2, 0.3, 0, 0],
+ // CHECK: [0.1, 0.2, 0.3, 0.4, 0.5]
+ }
+ return
+ }
+
+ func.func private @printMemrefF32(memref<*xf32>)
+ func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> {
+ %c5 = arith.constant 5 : index
+ %c4 = arith.constant 4 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %c1 = arith.constant 1 : index
+ %memref = gpu.alloc host_shared () : memref<4x5xf32>
+ memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
+ %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
+ %2 = gpu.wait async
+ %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1)
+ args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
+ gpu.wait [%3]
+ %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
+ %4 = gpu.wait async
+ %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1)
+ args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32,
+ %memref_1 : memref<4x5xf32>)
+ gpu.wait [%5]
+ %alloc = memref.alloc() : memref<4x5xf32>
+ memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>
+ %6 = gpu.wait async
+ %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32>
+ %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1>
+ %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
+ return %alloc : memref<4x5xf32>
+ }
+ gpu.module @test_kernel
+ attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+ gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel
+ attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = memref.load %arg0[%0, %1] : memref<4x5xf32>
+ %3 = arith.cmpf olt, %2, %arg1 : f32
+ memref.store %3, %arg2[%0, %1] : memref<4x5xi1>
+ gpu.return
+ }
+ }
+ gpu.module @test_kernel_0
+ attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+ gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel
+ attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = memref.load %arg0[%0, %1] : memref<4x5xi1>
+ %3 = memref.load %arg1[%0, %1] : memref<4x5xf32>
+ %4 = arith.select %2, %arg2, %3 : f32
+ memref.store %4, %arg3[%0, %1] : memref<4x5xf32>
+ gpu.return
+ }
+ }
+}
diff --git a/mlir/test/Integration/GPU/LevelZero/lit.local.cfg b/mlir/test/Integration/GPU/LevelZero/lit.local.cfg
new file mode 100644
index 0000000..36c7ad5
--- /dev/null
+++ b/mlir/test/Integration/GPU/LevelZero/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.enable_levelzero_runner:
+ config.unsupported = True
diff --git a/mlir/test/Pass/pipeline-options-parsing.mlir b/mlir/test/Pass/pipeline-options-parsing.mlir
index 9385d35..03ac38e 100644
--- a/mlir/test/Pass/pipeline-options-parsing.mlir
+++ b/mlir/test/Pass/pipeline-options-parsing.mlir
@@ -13,6 +13,7 @@
// RUN: mlir-opt %s -verify-each=false -pass-pipeline='builtin.module(builtin.module(func.func(test-options-pass{list=3}), func.func(test-options-pass{enum=one list=1,2,3,4 string=foo"bar"baz})))' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_6 %s
// RUN: mlir-opt %s -verify-each=false '-test-options-super-pass-pipeline=super-list={{enum=zero list=1 string=foo},{enum=one list=2 string="bar"},{enum=two list=3 string={baz}}}' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_7 %s
// RUN: mlir-opt %s -verify-each=false -pass-pipeline='builtin.module(func.func(test-options-super-pass{list={{enum=zero list={1} string=foo },{enum=one list={2} string=bar },{enum=two list={3} string=baz }}}))' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_7 %s
+// RUN: mlir-opt %s -verify-each=false -test-options-super-set-ab-pipeline='foo=true bar=false' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_11 %s
// This test checks that lists-of-nested-options like 'option1={...},{....}' can be parsed
@@ -106,3 +107,12 @@
// CHECK_10-NEXT: test-options-pass{enum=zero string= string-list={,}}
// CHECK_10-NEXT: )
// CHECK_10-NEXT: )
+
+// CHECK_11: builtin.module(
+// CHECK_11-NEXT: func.func(
+// CHECK_11-NEXT: test-options-pass-a
+// CHECK_11-NEXT: )
+// CHECK_11-NEXT: func.func(
+// CHECK_11-NEXT: test-options-pass-b
+// CHECK_11-NEXT: )
+// CHECK_11-NEXT: )
diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic-prefer-unregistered.ll b/mlir/test/Target/LLVMIR/Import/intrinsic-prefer-unregistered.ll
index 797a75c..18c9319 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic-prefer-unregistered.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic-prefer-unregistered.ll
@@ -3,9 +3,9 @@
; CHECK-LABEL: llvm.func @lifetime
define void @lifetime() {
%a = alloca [16 x i8]
- ; CHECK: llvm.call_intrinsic "llvm.lifetime.start.p0"({{.*}}, %[[ptr:.*]]) : (i64, !llvm.ptr {llvm.nonnull}) -> ()
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %a)
- ; CHECK: llvm.call_intrinsic "llvm.lifetime.end.p0"({{.*}}, %[[ptr]]) : (i64, !llvm.ptr {llvm.nonnull}) -> ()
- call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %a)
+ ; CHECK: llvm.call_intrinsic "llvm.lifetime.start.p0"(%[[ptr:.*]]) : (!llvm.ptr {llvm.nonnull}) -> ()
+ call void @llvm.lifetime.start.p0(ptr nonnull %a)
+ ; CHECK: llvm.call_intrinsic "llvm.lifetime.end.p0"(%[[ptr]]) : (!llvm.ptr {llvm.nonnull}) -> ()
+ call void @llvm.lifetime.end.p0(ptr nonnull %a)
ret void
}
diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
index a419d75..9f882ad 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
@@ -876,10 +876,10 @@ define void @stack_restore(ptr %0, ptr addrspace(1) %1) {
; CHECK-LABEL: llvm.func @lifetime
define void @lifetime() {
%a = alloca [16 x i8]
- ; CHECK: llvm.intr.lifetime.start 16, %{{.*}} : !llvm.ptr
- call void @llvm.lifetime.start.p0(i64 16, ptr %a)
- ; CHECK: llvm.intr.lifetime.end 32, %{{.*}} : !llvm.ptr
- call void @llvm.lifetime.end.p0(i64 32, ptr %a)
+ ; CHECK: llvm.intr.lifetime.start %{{.*}} : !llvm.ptr
+ call void @llvm.lifetime.start.p0(ptr %a)
+ ; CHECK: llvm.intr.lifetime.end %{{.*}} : !llvm.ptr
+ call void @llvm.lifetime.end.p0(ptr %a)
ret void
}
@@ -1353,8 +1353,8 @@ declare <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double>, <8 x i1>, i32)
declare <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double>, <8 x i1>, i32)
declare <8 x i64> @llvm.vp.ptrtoint.v8i64.v8p0(<8 x ptr>, <8 x i1>, i32)
declare <8 x ptr> @llvm.vp.inttoptr.v8p0.v8i64(<8 x i64>, <8 x i1>, i32)
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
+declare void @llvm.lifetime.start.p0(ptr nocapture)
+declare void @llvm.lifetime.end.p0(ptr nocapture)
declare ptr @llvm.invariant.start.p0(i64 immarg, ptr nocapture)
declare void @llvm.invariant.end.p0(ptr, i64 immarg, ptr nocapture)
declare ptr @llvm.launder.invariant.group.p0(ptr nocapture)
diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
index eb3510c..2b420ed 100644
--- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
@@ -1104,9 +1104,9 @@ llvm.func @lifetime() {
%c = llvm.mlir.constant(16 : i64) : i64
%a = llvm.alloca %c x i8 : (i64) -> !llvm.ptr
// CHECK: call void @llvm.lifetime.start
- llvm.intr.lifetime.start 16, %a : !llvm.ptr
+ llvm.intr.lifetime.start %a : !llvm.ptr
// CHECK: call void @llvm.lifetime.end
- llvm.intr.lifetime.end 16, %a : !llvm.ptr
+ llvm.intr.lifetime.end %a : !llvm.ptr
llvm.return
}
@@ -1418,8 +1418,8 @@ llvm.func @experimental_constrained_fpext(%s: f32, %v: vector<4xf32>) {
// CHECK-DAG: declare <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32>, i64 immarg)
// CHECK-DAG: declare { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double>)
// CHECK-DAG: declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-// CHECK-DAG: declare void @llvm.lifetime.start.p0(i64 immarg, ptr captures(none))
-// CHECK-DAG: declare void @llvm.lifetime.end.p0(i64 immarg, ptr captures(none))
+// CHECK-DAG: declare void @llvm.lifetime.start.p0(ptr captures(none))
+// CHECK-DAG: declare void @llvm.lifetime.end.p0(ptr captures(none))
// CHECK-DAG: declare ptr @llvm.invariant.start.p0(i64 immarg, ptr captures(none))
// CHECK-DAG: declare void @llvm.invariant.end.p0(ptr, i64 immarg, ptr captures(none))
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index 85478cc..991222c 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -1,5 +1,24 @@
// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+llvm.func @pmevent_no_id() {
+ // expected-error @below {{either `id` or `mask` must be set}}
+ nvvm.pmevent
+}
+
+// -----
+
+llvm.func @pmevent_bigger15() {
+ // expected-error @below {{`id` must be between 0 and 15}}
+ nvvm.pmevent id = 141
+}
+
+// -----
+
+llvm.func @pmevent_many_ids() {
+ // expected-error @below {{`id` and `mask` cannot be set at the same time}}
+ nvvm.pmevent id = 1 mask = 1
+}
+
// -----
llvm.func @kernel_func(%numberOfThreads : i32) {
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 5c2cfa4..b1800e8 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -918,3 +918,14 @@ llvm.func @nvvm_dot_accumulate_2way(%a: vector<2xi16>, %b: vector<4xi8>, %c: i32
%7 = nvvm.dot.accumulate.2way %a <signed>, %b <signed>, %c {b_hi = true}: vector<2xi16>, vector<4xi8>
llvm.return
}
+
+// -----
+
+// CHECK-LABEL: @nvvm_pmevent
+llvm.func @nvvm_pmevent() {
+ // CHECK: call void @llvm.nvvm.pm.event.mask(i16 15000)
+ nvvm.pmevent mask = 15000
+ // CHECK: call void @llvm.nvvm.pm.event.mask(i16 4)
+ nvvm.pmevent mask = 4
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir b/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir
new file mode 100644
index 0000000..3553907
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir
@@ -0,0 +1,44 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// CHECK: atomicrmw add ptr %loadgep_, i32 1 monotonic, align 4, !amdgpu.no.remote.memory !{{.*}}
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<1> = dense<64> : vector<4xi64>, !llvm.ptr<2> = dense<32> : vector<4xi64>, !llvm.ptr<3> = dense<32> : vector<4xi64>, !llvm.ptr<4> = dense<64> : vector<4xi64>, !llvm.ptr<5> = dense<32> : vector<4xi64>, !llvm.ptr<6> = dense<32> : vector<4xi64>, !llvm.ptr<7> = dense<[160, 256, 256, 32]> : vector<4xi64>, !llvm.ptr<8> = dense<[128, 128, 128, 48]> : vector<4xi64>, !llvm.ptr<9> = dense<[192, 256, 256, 32]> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.legal_int_widths" = array<i32: 32, 64>, "dlti.stack_alignment" = 32 : i64, "dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, fir.atomic_fine_grained_memory, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "generic-hsa", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<openmp_device_version = 31>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target_triples = [], omp.version = #omp.version<version = 31>} {
+ llvm.func @_QQmain() attributes {fir.bindc_name = "TEST", omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "generic-hsa"} {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x i32 {bindc_name = "threads"} : (i64) -> !llvm.ptr<5>
+ %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+ %3 = llvm.mlir.constant(1 : i64) : i64
+ %4 = llvm.alloca %3 x i32 {bindc_name = "capture"} : (i64) -> !llvm.ptr<5>
+ %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+ %6 = llvm.mlir.constant(1 : i64) : i64
+ %7 = llvm.alloca %6 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr<5>
+ %8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr
+ %9 = llvm.mlir.constant(0 : i32) : i32
+ %10 = llvm.mlir.constant(128 : i32) : i32
+ %11 = llvm.mlir.constant(1 : i64) : i64
+ %12 = llvm.mlir.constant(1 : i64) : i64
+ %13 = llvm.mlir.constant(1 : i64) : i64
+ llvm.store %10, %2 : i32, !llvm.ptr
+ llvm.store %9, %8 : i32, !llvm.ptr
+ %14 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "threads"}
+ %15 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "capture"}
+ %16 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "a"}
+ omp.target map_entries(%14 -> %arg0, %15 -> %arg1, %16 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+ %17 = llvm.mlir.constant(1 : i32) : i32
+ %18 = llvm.load %arg0 : !llvm.ptr -> i32
+ omp.parallel num_threads(%18 : i32) {
+ omp.atomic.capture {
+ omp.atomic.read %arg1 = %arg2 : !llvm.ptr, !llvm.ptr, i32
+ omp.atomic.update %arg2 : !llvm.ptr {
+ ^bb0(%arg3: i32):
+ %19 = llvm.add %arg3, %17 : i32
+ omp.yield(%19 : i32)
+ } {atomic_control = #omp.atomic_control<fine_grained_memory = true>}
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+}
diff --git a/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir b/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir
new file mode 100644
index 0000000..3b0005b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir
@@ -0,0 +1,36 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// CHECK: atomicrmw add ptr %loadgep_, i32 1 monotonic, align 4, !amdgpu.ignore.denormal.mode !{{.*}}, !amdgpu.no.fine.grained.memory !{{.*}}, !amdgpu.no.remote.memory !{{.*}}
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<1> = dense<64> : vector<4xi64>, !llvm.ptr<2> = dense<32> : vector<4xi64>, !llvm.ptr<3> = dense<32> : vector<4xi64>, !llvm.ptr<4> = dense<64> : vector<4xi64>, !llvm.ptr<5> = dense<32> : vector<4xi64>, !llvm.ptr<6> = dense<32> : vector<4xi64>, !llvm.ptr<7> = dense<[160, 256, 256, 32]> : vector<4xi64>, !llvm.ptr<8> = dense<[128, 128, 128, 48]> : vector<4xi64>, !llvm.ptr<9> = dense<[192, 256, 256, 32]> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.legal_int_widths" = array<i32: 32, 64>, "dlti.stack_alignment" = 32 : i64, "dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, fir.atomic_ignore_denormal_mode, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "generic-hsa", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<openmp_device_version = 31>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target_triples = [], omp.version = #omp.version<version = 31>} {
+ llvm.func @_QQmain() attributes {fir.bindc_name = "TEST", omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "generic-hsa"} {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x i32 {bindc_name = "threads"} : (i64) -> !llvm.ptr<5>
+ %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+ %3 = llvm.mlir.constant(1 : i64) : i64
+ %4 = llvm.alloca %3 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr<5>
+ %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+ %6 = llvm.mlir.constant(0 : i32) : i32
+ %7 = llvm.mlir.constant(128 : i32) : i32
+ %8 = llvm.mlir.constant(1 : i64) : i64
+ %9 = llvm.mlir.constant(1 : i64) : i64
+ llvm.store %7, %2 : i32, !llvm.ptr
+ llvm.store %6, %5 : i32, !llvm.ptr
+ %10 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "threads"}
+ %11 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "a"}
+ omp.target map_entries(%10 -> %arg0, %11 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+ %12 = llvm.mlir.constant(1 : i32) : i32
+ %13 = llvm.load %arg0 : !llvm.ptr -> i32
+ omp.parallel num_threads(%13 : i32) {
+ omp.atomic.update %arg1 : !llvm.ptr {
+ ^bb0(%arg2: i32):
+ %14 = llvm.add %arg2, %12 : i32
+ omp.yield(%14 : i32)
+ } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>}
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 740990a..ce43941 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -125,6 +125,23 @@ llvm.func @rocdl.ballot64(%pred : i1) -> i64 {
llvm.return %0 : i64
}
+llvm.func @rocdl.readfirstlane(%src0 : f32, %src1: f64, %src2: i32, %src3: vector<2 x f32>) -> f32 {
+ // CHECK-LABEL: rocdl.readfirstlane
+ // CHECK: call float @llvm.amdgcn.readfirstlane.f32(float %{{.*}})
+ %0 = rocdl.readfirstlane %src0 : f32
+
+ // CHECK: call double @llvm.amdgcn.readfirstlane.f64(double %{{.*}})
+ %1 = rocdl.readfirstlane %src1 : f64
+
+ // CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %{{.*}})
+ %2 = rocdl.readfirstlane %src2 : i32
+
+ // CHECK: call <2 x float> @llvm.amdgcn.readfirstlane.v2f32(<2 x float> %{{.*}})
+ %3 = rocdl.readfirstlane %src3 : vector<2 x f32>
+
+ llvm.return %0 : f32
+}
+
llvm.func @rocdl.readlane(%src0 : f32, %src1: f64, %src2: i32, %src3: vector<2 x f32>) -> f32 {
%idx = llvm.mlir.constant(0 : i32) : i32
diff --git a/mlir/test/Target/SPIRV/arm-tensor-constant.mlir b/mlir/test/Target/SPIRV/arm-tensor-constant.mlir
index 275e586..7fb8af1 100644
--- a/mlir/test/Target/SPIRV/arm-tensor-constant.mlir
+++ b/mlir/test/Target/SPIRV/arm-tensor-constant.mlir
@@ -1,17 +1,36 @@
// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s
-// DISABLED: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv %s | spirv-val %}
-
-// FIXME(#152012): Fix arm tensor constant validation errors and reenable spirv-val tests.
+// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv %s | spirv-val %}
spirv.module Logical Vulkan requires #spirv.vce<v1.3,
[VulkanMemoryModel, Shader, TensorsARM, Linkage], [SPV_KHR_vulkan_memory_model, SPV_ARM_tensors]> {
- // CHECK-LABEL: @arm_tensor_of_i32
- spirv.func @arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3xi32>) "None" {
+ // CHECK-LABEL: @rank_1_arm_tensor_of_i32
+ spirv.func @rank_1_arm_tensor_of_i32() -> (!spirv.arm.tensor<3xi32>) "None" {
+ // CHECK: {{%.*}} = spirv.Constant dense<[1, 2, 3]> : !spirv.arm.tensor<3xi32>
+ %0 = spirv.Constant dense<[1, 2, 3]> : !spirv.arm.tensor<3xi32>
+ spirv.ReturnValue %0 : !spirv.arm.tensor<3xi32>
+ }
+
+ // CHECK-LABEL: @rank_2_arm_tensor_of_i32
+ spirv.func @rank_2_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3xi32>) "None" {
// CHECK: {{%.*}} = spirv.Constant dense<{{\[}}[1, 2, 3], [4, 5, 6]]> : !spirv.arm.tensor<2x3xi32>
%0 = spirv.Constant dense<[[1, 2, 3], [4, 5, 6]]> : !spirv.arm.tensor<2x3xi32>
spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xi32>
}
+ // CHECK-LABEL: @rank_3_arm_tensor_of_i32
+ spirv.func @rank_3_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x2x3xi32>) "None" {
+ // CHECK: {{%.*}} = spirv.Constant dense<{{\[}}{{\[}}[1, 2, 3], [4, 5, 6]], {{\[}}[7, 8, 9], [10, 11, 12]]]> : !spirv.arm.tensor<2x2x3xi32>
+ %0 = spirv.Constant dense<[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]> : !spirv.arm.tensor<2x2x3xi32>
+ spirv.ReturnValue %0 : !spirv.arm.tensor<2x2x3xi32>
+ }
+
+ // CHECK-LABEL: @rank_4_arm_tensor_of_i32
+ spirv.func @rank_4_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3x4x5xi32>) "None" {
+ // CHECK: {{%.*}} = spirv.Constant dense<5> : !spirv.arm.tensor<2x3x4x5xi32>
+ %0 = spirv.Constant dense<5> : !spirv.arm.tensor<2x3x4x5xi32>
+ spirv.ReturnValue %0 : !spirv.arm.tensor<2x3x4x5xi32>
+ }
+
// CHECK-LABEL: @splat_arm_tensor_of_i32
spirv.func @splat_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3xi32>) "None" {
// CHECK: {{%.*}} = spirv.Constant dense<2> : !spirv.arm.tensor<2x3xi32>
@@ -19,13 +38,34 @@ spirv.module Logical Vulkan requires #spirv.vce<v1.3,
spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xi32>
}
- // CHECK-LABEL: @arm_tensor_of_f32
- spirv.func @arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3xf32>) "None" {
+ // CHECK-LABEL: @rank_1_arm_tensor_of_f32
+ spirv.func @rank_1_arm_tensor_of_f32() -> (!spirv.arm.tensor<3xf32>) "None" {
+ // CHECK: {{%.*}} = spirv.Constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00]> : !spirv.arm.tensor<3xf32>
+ %0 = spirv.Constant dense<[1.0, 2.0, 3.0]> : !spirv.arm.tensor<3xf32>
+ spirv.ReturnValue %0 : !spirv.arm.tensor<3xf32>
+ }
+
+ // CHECK-LABEL: @rank_2_arm_tensor_of_f32
+ spirv.func @rank_2_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3xf32>) "None" {
// CHECK: {{%.*}} = spirv.Constant dense<{{\[}}[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : !spirv.arm.tensor<2x3xf32>
- %0 = spirv.Constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]>: !spirv.arm.tensor<2x3xf32>
+ %0 = spirv.Constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : !spirv.arm.tensor<2x3xf32>
spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xf32>
}
+ // CHECK-LABEL: @rank_3_arm_tensor_of_f32
+ spirv.func @rank_3_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x2x3xf32>) "None" {
+ // CHECK: {{%.*}} = spirv.Constant dense<{{\[}}{{\[}}[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]], {{\[}}[7.000000e+00, 8.000000e+00, 9.000000e+00], [1.000000e+01, 1.100000e+01, 1.200000e+01]]]> : !spirv.arm.tensor<2x2x3xf32>
+ %0 = spirv.Constant dense<[[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]]> : !spirv.arm.tensor<2x2x3xf32>
+ spirv.ReturnValue %0 : !spirv.arm.tensor<2x2x3xf32>
+ }
+
+ // CHECK-LABEL: @rank_4_arm_tensor_of_f32
+ spirv.func @rank_4_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3x4x5xf32>) "None" {
+ // CHECK: {{%.*}} = spirv.Constant dense<5.000000e+00> : !spirv.arm.tensor<2x3x4x5xf32>
+ %0 = spirv.Constant dense<5.0> : !spirv.arm.tensor<2x3x4x5xf32>
+ spirv.ReturnValue %0 : !spirv.arm.tensor<2x3x4x5xf32>
+ }
+
// CHECK-LABEL: @splat_arm_tensor_of_f32
spirv.func @splat_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3xf32>) "None" {
// CHECK: {{%.*}} = spirv.Constant dense<2.000000e+00> : !spirv.arm.tensor<2x3xf32>
diff --git a/mlir/test/Target/SPIRV/decorations.mlir b/mlir/test/Target/SPIRV/decorations.mlir
index ee7ad81..90ba690e 100644
--- a/mlir/test/Target/SPIRV/decorations.mlir
+++ b/mlir/test/Target/SPIRV/decorations.mlir
@@ -58,6 +58,20 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
// -----
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Tessellation, Linkage], []> {
+ // CHECK: patch
+ spirv.GlobalVariable @var {patch} : !spirv.ptr<vector<4xf32>, Input>
+}
+
+// -----
+
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> {
+ // CHECK: invariant
+ spirv.GlobalVariable @var {invariant} : !spirv.ptr<vector<2xf32>, Output>
+}
+
+// -----
+
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> {
// CHECK: linkage_attributes = #spirv.linkage_attributes<linkage_name = "outSideGlobalVar1", linkage_type = <Import>>
spirv.GlobalVariable @var1 {
diff --git a/mlir/test/Transforms/test-legalizer-fold-after.mlir b/mlir/test/Transforms/test-legalizer-fold-after.mlir
new file mode 100644
index 0000000..7f80252
--- /dev/null
+++ b/mlir/test/Transforms/test-legalizer-fold-after.mlir
@@ -0,0 +1,9 @@
+// RUN: mlir-opt %s -test-legalize-patterns="test-legalize-folding-mode=after-patterns" | FileCheck %s
+
+// CHECK-LABEL: @fold_legalization
+func.func @fold_legalization() -> i32 {
+ // CHECK-NOT: op_in_place_self_fold
+ // CHECK: 97
+ %1 = "test.op_in_place_self_fold"() : () -> (i32)
+ "test.return"(%1) : (i32) -> ()
+}
diff --git a/mlir/test/Transforms/test-legalizer-fold-before.mlir b/mlir/test/Transforms/test-legalizer-fold-before.mlir
new file mode 100644
index 0000000..fe6e293
--- /dev/null
+++ b/mlir/test/Transforms/test-legalizer-fold-before.mlir
@@ -0,0 +1,9 @@
+// RUN: mlir-opt %s -test-legalize-patterns="test-legalize-folding-mode=before-patterns" | FileCheck %s
+
+// CHECK-LABEL: @fold_legalization
+func.func @fold_legalization() -> i32 {
+ // CHECK: op_in_place_self_fold
+ // CHECK-SAME: folded
+ %1 = "test.op_in_place_self_fold"() : () -> (i32)
+ "test.return"(%1) : (i32) -> ()
+}
diff --git a/mlir/test/Transforms/test-legalizer-no-fold.mlir b/mlir/test/Transforms/test-legalizer-no-fold.mlir
new file mode 100644
index 0000000..720d17f
--- /dev/null
+++ b/mlir/test/Transforms/test-legalizer-no-fold.mlir
@@ -0,0 +1,12 @@
+// RUN: mlir-opt %s -allow-unregistered-dialect -test-legalize-patterns="test-legalize-folding-mode=never" | FileCheck %s
+
+// CHECK-LABEL: @remove_foldable_op(
+func.func @remove_foldable_op(%arg0 : i32) -> (i32) {
+ // Check that op was not folded.
+ // CHECK: "test.op_with_region_fold"
+ %0 = "test.op_with_region_fold"(%arg0) ({
+ "foo.op_with_region_terminator"() : () -> ()
+ }) : (i32) -> (i32)
+ "test.return"(%0) : (i32) -> ()
+}
+
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index e4406e6..5630d15 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -415,3 +415,20 @@ func.func @test_multiple_1_to_n_replacement() {
%0 = "test.multiple_1_to_n_replacement"() : () -> (f16)
"test.invalid"(%0) : (f16) -> ()
}
+
+// -----
+
+// CHECK-LABEL: func @test_lookup_without_converter
+// CHECK: %[[producer:.*]] = "test.valid_producer"() : () -> i16
+// CHECK: %[[cast:.*]] = "test.cast"(%[[producer]]) : (i16) -> f64
+// CHECK: "test.valid_consumer"(%[[cast]]) : (f64) -> ()
+// CHECK: "test.valid_consumer"(%[[producer]]) : (i16) -> ()
+func.func @test_lookup_without_converter() {
+ %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64)
+ "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> ()
+ // Make sure that the second "replace_with_valid_consumer" lowering does not
+ // look up the materialization that was created for the op above.
+ "test.replace_with_valid_consumer"(%0) : (i64) -> ()
+ // expected-remark@+1 {{op 'func.return' is not legalizable}}
+ return
+}
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 2eaad55..231400e 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -1169,6 +1169,26 @@ def OpP : TEST_Op<"op_p"> {
let results = (outs I32);
}
+// Test constant-folding a pattern that maps `(F32) -> SI32`.
+def SignOp : TEST_Op<"sign", [SameOperandsAndResultShape]> {
+ let arguments = (ins RankedTensorOf<[F32]>:$operand);
+ let results = (outs RankedTensorOf<[SI32]>:$result);
+
+ let assemblyFormat = [{
+ $operand attr-dict `:` functional-type(operands, results)
+ }];
+}
+
+// Test constant-folding a pattern that maps `(F32, F32) -> I1`.
+def LessThanOp : TEST_Op<"less_than", [SameOperandsAndResultShape]> {
+ let arguments = (ins RankedTensorOf<[F32]>:$lhs, RankedTensorOf<[F32]>:$rhs);
+ let results = (outs RankedTensorOf<[I1]>:$result);
+
+ let assemblyFormat = [{
+ $lhs `,` $rhs attr-dict `:` functional-type(operands, results)
+ }];
+}
+
// Test same operand name enforces equality condition check.
def TestEqualArgsPattern : Pat<(OpN $a, $a), (OpO $a)>;
@@ -1478,6 +1498,8 @@ def TestOpInPlaceSelfFold : TEST_Op<"op_in_place_self_fold"> {
let results = (outs I32);
let hasFolder = 1;
}
+def : Pat<(TestOpInPlaceSelfFold:$op $_),
+ (TestOpConstant ConstantAttr<I32Attr, "97">)>;
// Test op that simply returns success.
def TestOpInPlaceFoldSuccess : TEST_Op<"op_in_place_fold_success"> {
@@ -2104,6 +2126,10 @@ def TestInvalidOp : TEST_Op<"invalid", [Terminator]>,
Arguments<(ins Variadic<AnyType>)>;
def TestTypeProducerOp : TEST_Op<"type_producer">,
Results<(outs AnyType)>;
+def TestValidProducerOp : TEST_Op<"valid_producer">,
+ Results<(outs AnyType)>;
+def TestValidConsumerOp : TEST_Op<"valid_consumer">,
+ Arguments<(ins AnyType)>;
def TestAnotherTypeProducerOp : TEST_Op<"another_type_producer">,
Results<(outs AnyType)>;
def TestTypeConsumerOp : TEST_Op<"type_consumer">,
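
With the assembly format declared above, the new test.sign and test.less_than ops round-trip roughly as follows (an illustrative sketch, not part of the patch; shapes and value names are arbitrary):

  %sign = test.sign %x : (tensor<4xf32>) -> tensor<4xsi32>
  %cmp = test.less_than %a, %b : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1>

Both ops exist only so that the constant-folding helpers exercised in TestPatterns.cpp below have ops whose result element type differs from the operand element type.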
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index eda618f..ff958d9 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -10,6 +10,7 @@
#include "TestOps.h"
#include "TestTypes.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/CommonFolders.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Func/Transforms/FuncConversions.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -202,6 +203,66 @@ struct HoistEligibleOps : public OpRewritePattern<test::OneRegionOp> {
}
};
+struct FoldSignOpF32ToSI32 : public OpRewritePattern<test::SignOp> {
+ using OpRewritePattern<test::SignOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(test::SignOp op,
+ PatternRewriter &rewriter) const override {
+ if (op->getNumOperands() != 1 || op->getNumResults() != 1)
+ return failure();
+
+ TypedAttr operandAttr;
+ matchPattern(op->getOperand(0), m_Constant(&operandAttr));
+ if (!operandAttr)
+ return failure();
+
+ TypedAttr res = cast_or_null<TypedAttr>(
+ constFoldUnaryOp<FloatAttr, FloatAttr::ValueType, void, IntegerAttr>(
+ operandAttr, op.getType(), [](APFloat operand) -> APSInt {
+ static const APFloat zero(0.0f);
+ int operandSign = 0;
+ if (operand != zero)
+ operandSign = (operand < zero) ? -1 : +1;
+ return APSInt(APInt(32, operandSign), false);
+ }));
+ if (!res)
+ return failure();
+
+ rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, res);
+ return success();
+ }
+};
+
+struct FoldLessThanOpF32ToI1 : public OpRewritePattern<test::LessThanOp> {
+ using OpRewritePattern<test::LessThanOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(test::LessThanOp op,
+ PatternRewriter &rewriter) const override {
+ if (op->getNumOperands() != 2 || op->getNumResults() != 1)
+ return failure();
+
+ TypedAttr lhsAttr;
+ TypedAttr rhsAttr;
+ matchPattern(op->getOperand(0), m_Constant(&lhsAttr));
+ matchPattern(op->getOperand(1), m_Constant(&rhsAttr));
+
+ if (!lhsAttr || !rhsAttr)
+ return failure();
+
+ Attribute operandAttrs[2] = {lhsAttr, rhsAttr};
+ TypedAttr res = cast_or_null<TypedAttr>(
+ constFoldBinaryOp<FloatAttr, FloatAttr::ValueType, void, IntegerAttr>(
+ operandAttrs, op.getType(), [](APFloat lhs, APFloat rhs) -> APInt {
+ return APInt(1, lhs < rhs);
+ }));
+ if (!res)
+ return failure();
+
+ rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, res);
+ return success();
+ }
+};
+
/// This pattern moves "test.move_before_parent_op" before the parent op.
struct MoveBeforeParentOp : public RewritePattern {
MoveBeforeParentOp(MLIRContext *context)
@@ -1198,6 +1259,47 @@ public:
}
};
+/// Pattern that replaces test.replace_with_valid_producer with
+/// test.valid_producer, using the result type from the op's "type" attribute.
+class TestReplaceWithValidProducer : public ConversionPattern {
+public:
+ TestReplaceWithValidProducer(MLIRContext *ctx)
+ : ConversionPattern("test.replace_with_valid_producer", 1, ctx) {}
+ LogicalResult
+ matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const final {
+ auto attr = op->getAttrOfType<TypeAttr>("type");
+ if (!attr)
+ return failure();
+ rewriter.replaceOpWithNewOp<TestValidProducerOp>(op, attr.getValue());
+ return success();
+ }
+};
+
+/// Pattern that replaces test.replace_with_valid_consumer with
+/// test.valid_consumer. Can be used with and without a type converter.
+class TestReplaceWithValidConsumer : public ConversionPattern {
+public:
+ TestReplaceWithValidConsumer(MLIRContext *ctx, const TypeConverter &converter)
+ : ConversionPattern(converter, "test.replace_with_valid_consumer", 1,
+ ctx) {}
+ TestReplaceWithValidConsumer(MLIRContext *ctx)
+ : ConversionPattern("test.replace_with_valid_consumer", 1, ctx) {}
+
+ LogicalResult
+ matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const final {
+ // with_converter present: pattern must have been initialized with a type
+ // converter.
+ // with_converter absent: pattern must have been initialized without a type
+ // converter.
+ if (op->hasAttr("with_converter") != static_cast<bool>(getTypeConverter()))
+ return failure();
+ rewriter.replaceOpWithNewOp<TestValidConsumerOp>(op, operands[0]);
+ return success();
+ }
+};
+
/// This pattern matches a test.convert_block_args op. It either:
/// a) Duplicates all block arguments,
/// b) or: drops all block arguments and replaces each with 2x the first
@@ -1314,6 +1416,7 @@ struct TestTypeConverter : public TypeConverter {
TestTypeConverter() {
addConversion(convertType);
addSourceMaterialization(materializeCast);
+ addTargetMaterialization(materializeCast);
}
static LogicalResult convertType(Type t, SmallVectorImpl<Type> &results) {
@@ -1389,10 +1492,12 @@ struct TestLegalizePatternDriver
TestBoundedRecursiveRewrite, TestNestedOpCreationUndoRewrite,
TestReplaceEraseOp, TestCreateUnregisteredOp, TestUndoMoveOpBefore,
TestUndoPropertiesModification, TestEraseOp,
+ TestReplaceWithValidProducer, TestReplaceWithValidConsumer,
TestRepetitive1ToNConsumer>(&getContext());
patterns.add<TestDropOpSignatureConversion, TestDropAndReplaceInvalidOp,
TestPassthroughInvalidOp, TestMultiple1ToNReplacement,
- TestBlockArgReplace>(&getContext(), converter);
+ TestBlockArgReplace, TestReplaceWithValidConsumer>(
+ &getContext(), converter);
patterns.add<TestConvertBlockArgs>(converter, &getContext());
mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns,
converter);
@@ -1402,7 +1507,8 @@ struct TestLegalizePatternDriver
ConversionTarget target(getContext());
target.addLegalOp<ModuleOp>();
target.addLegalOp<LegalOpA, LegalOpB, LegalOpC, TestCastOp, TestValidOp,
- TerminatorOp, OneRegionOp>();
+ TerminatorOp, TestOpConstant, OneRegionOp,
+ TestValidProducerOp, TestValidConsumerOp>();
target.addLegalOp(OperationName("test.legal_op", &getContext()));
target
.addIllegalOp<ILLegalOpF, TestRegionBuilderOp, TestOpWithRegionFold>();
@@ -1457,6 +1563,7 @@ struct TestLegalizePatternDriver
DumpNotifications dumpNotifications;
config.listener = &dumpNotifications;
config.unlegalizedOps = &unlegalizedOps;
+ config.foldingMode = foldingMode;
if (failed(applyPartialConversion(getOperation(), target,
std::move(patterns), config))) {
getOperation()->emitRemark() << "applyPartialConversion failed";
@@ -1476,6 +1583,7 @@ struct TestLegalizePatternDriver
ConversionConfig config;
DumpNotifications dumpNotifications;
+ config.foldingMode = foldingMode;
config.listener = &dumpNotifications;
if (failed(applyFullConversion(getOperation(), target,
std::move(patterns), config))) {
@@ -1490,6 +1598,7 @@ struct TestLegalizePatternDriver
// Analyze the convertible operations.
DenseSet<Operation *> legalizedOps;
ConversionConfig config;
+ config.foldingMode = foldingMode;
config.legalizableOps = &legalizedOps;
if (failed(applyAnalysisConversion(getOperation(), target,
std::move(patterns), config)))
@@ -1510,6 +1619,21 @@ struct TestLegalizePatternDriver
clEnumValN(ConversionMode::Full, "full", "Perform a full conversion"),
clEnumValN(ConversionMode::Partial, "partial",
"Perform a partial conversion"))};
+
+ Option<DialectConversionFoldingMode> foldingMode{
+ *this, "test-legalize-folding-mode",
+ llvm::cl::desc("The folding mode to use with the test driver"),
+ llvm::cl::init(DialectConversionFoldingMode::BeforePatterns),
+ llvm::cl::values(clEnumValN(DialectConversionFoldingMode::Never, "never",
+ "Never attempt to fold"),
+ clEnumValN(DialectConversionFoldingMode::BeforePatterns,
+ "before-patterns",
+ "Only attempt to fold not legal operations "
+ "before applying patterns"),
+ clEnumValN(DialectConversionFoldingMode::AfterPatterns,
+ "after-patterns",
+ "Only attempt to fold not legal operations "
+ "after applying patterns"))};
};
} // namespace
@@ -2181,6 +2305,24 @@ struct TestSelectiveReplacementPatternDriver
(void)applyPatternsGreedily(getOperation(), std::move(patterns));
}
};
+
+struct TestFoldTypeConvertingOp
+ : public PassWrapper<TestFoldTypeConvertingOp, OperationPass<>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestFoldTypeConvertingOp)
+
+ StringRef getArgument() const final { return "test-fold-type-converting-op"; }
+ StringRef getDescription() const final {
+ return "Test helper functions for folding ops whose input and output types "
+ "differ, e.g. float comparisons of the form `(f32, f32) -> i1`.";
+ }
+ void runOnOperation() override {
+ MLIRContext *context = &getContext();
+ mlir::RewritePatternSet patterns(context);
+ patterns.add<FoldSignOpF32ToSI32, FoldLessThanOpF32ToI1>(context);
+ if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
+ signalPassFailure();
+ }
+};
} // namespace
//===----------------------------------------------------------------------===//
@@ -2211,6 +2353,8 @@ void registerPatternsTestPass() {
PassRegistration<TestMergeBlocksPatternDriver>();
PassRegistration<TestSelectiveReplacementPatternDriver>();
+
+ PassRegistration<TestFoldTypeConvertingOp>();
}
} // namespace test
} // namespace mlir
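
A minimal sketch of IR that the new -test-fold-type-converting-op pass is intended to fold (illustrative only; the folded form mentioned in the comment is an assumption, not copied from the patch):

  func.func @fold_less_than() -> tensor<2xi1> {
    %lhs = arith.constant dense<[1.0, 4.0]> : tensor<2xf32>
    %rhs = arith.constant dense<[2.0, 3.0]> : tensor<2xf32>
    // FoldLessThanOpF32ToI1 should rewrite this to something like
    // arith.constant dense<[true, false]> : tensor<2xi1>.
    %0 = test.less_than %lhs, %rhs : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xi1>
    return %0 : tensor<2xi1>
  }

FoldSignOpF32ToSI32 handles test.sign the same way, going through constFoldUnaryOp instead of constFoldBinaryOp.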
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index c6245b6..3bea8ef 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -7,11 +7,14 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
using namespace mlir;
@@ -147,12 +150,118 @@ struct TestXeGPUUnrollingPatterns
}
};
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "test-xegpu-layout-interface"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
+// Test pattern for distributing vector::StepOp from workgroup to subgroup.
+// Exercises the LayoutTrait interface that abstracts offset computation over
+// both LayoutAttr and SliceAttr.
+class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
+ using OpConversionPattern<vector::StepOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+
+ auto layoutName = xegpu::getLayoutName(op->getResult(0));
+ auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
+ if (!sliceAttr || sliceAttr.getRank() != 1)
+ return failure();
+
+ std::optional<SmallVector<int64_t>> sgShape = sliceAttr.getSgDataAsInt();
+ if (!sgShape)
+ return failure();
+
+ Location loc = op.getLoc();
+ VectorType type = op.getResult().getType();
+ auto wgShape = type.getShape();
+
+ Value sgId =
+ gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
+ auto maybeOffsets = sliceAttr.getOffsets(rewriter, loc, sgId, wgShape);
+ if (failed(maybeOffsets))
+ return failure();
+
+ VectorType newTy = type.cloneWith(*sgShape, type.getElementType());
+ Value base = vector::StepOp::create(rewriter, loc, newTy);
+ SmallVector<Value> newOps;
+ for (auto offsets : *maybeOffsets) {
+ Value bcast =
+ vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]);
+ Value add = arith::AddIOp::create(rewriter, loc, base, bcast);
+ newOps.push_back(add);
+ }
+ rewriter.replaceOpWithMultiple(op, {newOps});
+ return success();
+ }
+};
+
+struct TestXeGPULayoutInterface
+ : public PassWrapper<TestXeGPULayoutInterface,
+ OperationPass<gpu::GPUModuleOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPULayoutInterface)
+
+ StringRef getArgument() const final { return "test-xegpu-layout-interface"; }
+
+ StringRef getDescription() const final {
+ return "Test the implementation of XeGPU Layout interfaces";
+ }
+
+ void getDependentDialects(::mlir::DialectRegistry &registry) const override {
+ registry.insert<arith::ArithDialect>();
+ registry.insert<memref::MemRefDialect>();
+ registry.insert<xegpu::XeGPUDialect>();
+ registry.insert<vector::VectorDialect>();
+ registry.insert<index::IndexDialect>();
+ }
+
+ TestXeGPULayoutInterface() = default;
+ TestXeGPULayoutInterface(const TestXeGPULayoutInterface &pass)
+ : PassWrapper(pass) {}
+
+ void runOnOperation() override {
+ MLIRContext *ctx = &getContext();
+
+ TypeConverter typeConverter;
+ auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+ mlir::ValueRange inputs,
+ mlir::Location loc) -> mlir::Value {
+ return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
+ .getResult(0);
+ };
+ typeConverter.addSourceMaterialization(materializeCast);
+ typeConverter.addTargetMaterialization(materializeCast);
+
+ RewritePatternSet patterns(ctx);
+ patterns.add<TestStepOpPattern>(typeConverter, ctx);
+
+ ConversionTarget target(*ctx);
+ auto isLegal = [&](xegpu::SliceAttr layout) -> bool {
+ return !layout || !layout.isWgLayout();
+ };
+
+ target.addDynamicallyLegalOp<vector::StepOp>(
+ [&](vector::StepOp op) -> bool {
+ auto layoutName = xegpu::getLayoutName(op->getResult(0));
+ auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
+ return isLegal(sliceAttr);
+ });
+
+ target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
+
+ (void)applyPartialConversion(getOperation(), target, std::move(patterns));
+ }
+};
+
} // namespace
namespace mlir {
namespace test {
void registerTestXeGPULowerings() {
PassRegistration<TestXeGPUUnrollingPatterns>();
+ PassRegistration<TestXeGPULayoutInterface>();
}
} // namespace test
} // namespace mlir
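
For orientation, the per-subgroup IR that TestStepOpPattern emits has roughly this shape (a hand-written sketch; the xegpu slice-layout attribute and the real offset computation are elided, and the vector size 16 is an assumption):

  func.func @distributed_step_sketch() -> vector<16xindex> {
    %sg_id = gpu.subgroup_id : index
    // Stand-in for the per-subgroup offset that sliceAttr.getOffsets(...)
    // derives from %sg_id and the slice layout.
    %off = arith.constant 16 : index
    %base = vector.step : vector<16xindex>
    %bcast = vector.broadcast %off : index to vector<16xindex>
    %dist = arith.addi %base, %bcast : vector<16xindex>
    return %dist : vector<16xindex>
  }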
diff --git a/mlir/test/lib/Pass/TestPassManager.cpp b/mlir/test/lib/Pass/TestPassManager.cpp
index 25c8e53..df2736b 100644
--- a/mlir/test/lib/Pass/TestPassManager.cpp
+++ b/mlir/test/lib/Pass/TestPassManager.cpp
@@ -133,6 +133,51 @@ struct TestOptionsSuperPass
llvm::cl::desc("Example list of PassPipelineOptions option")};
};
+struct TestOptionsPassA
+ : public PassWrapper<TestOptionsPassA, OperationPass<func::FuncOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOptionsPassA)
+
+ struct Options : public PassPipelineOptions<Options> {
+ Option<bool> foo{*this, "foo", llvm::cl::desc("Example boolean option")};
+ };
+
+ TestOptionsPassA() = default;
+ TestOptionsPassA(const TestOptionsPassA &) : PassWrapper() {}
+ TestOptionsPassA(const Options &options) { this->options.foo = options.foo; }
+
+ void runOnOperation() final {}
+ StringRef getArgument() const final { return "test-options-pass-a"; }
+ StringRef getDescription() const final {
+ return "Test superset options parsing capabilities - subset A";
+ }
+
+ Options options;
+};
+
+struct TestOptionsPassB
+ : public PassWrapper<TestOptionsPassB, OperationPass<func::FuncOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOptionsPassB)
+
+ struct Options : public PassPipelineOptions<Options> {
+ Option<bool> bar{*this, "bar", llvm::cl::desc("Example boolean option")};
+ };
+
+ TestOptionsPassB() = default;
+ TestOptionsPassB(const TestOptionsPassB &) : PassWrapper() {}
+ TestOptionsPassB(const Options &options) { this->options.bar = options.bar; }
+
+ void runOnOperation() final {}
+ StringRef getArgument() const final { return "test-options-pass-b"; }
+ StringRef getDescription() const final {
+ return "Test superset options parsing capabilities - subset B";
+ }
+
+ Options options;
+};
+
+struct TestPipelineOptionsSuperSetAB : TestOptionsPassA::Options,
+ TestOptionsPassB::Options {};
+
/// A test pass that always aborts to enable testing the crash recovery
/// mechanism of the pass manager.
struct TestCrashRecoveryPass
@@ -270,6 +315,9 @@ void registerPassManagerTestPass() {
PassRegistration<TestOptionsPass>();
PassRegistration<TestOptionsSuperPass>();
+ PassRegistration<TestOptionsPassA>();
+ PassRegistration<TestOptionsPassB>();
+
PassRegistration<TestModulePass>();
PassRegistration<TestFunctionPass>();
@@ -306,5 +354,16 @@ void registerPassManagerTestPass() {
[](OpPassManager &pm, const TestOptionsSuperPass::Options &options) {
pm.addPass(std::make_unique<TestOptionsSuperPass>(options));
});
+
+ PassPipelineRegistration<TestPipelineOptionsSuperSetAB>
+ registerPipelineOptionsSuperSetABPipeline(
+ "test-options-super-set-ab-pipeline",
+ "Parses options of PassPipelineOptions using pass pipeline "
+ "registration",
+ [](OpPassManager &pm, const TestPipelineOptionsSuperSetAB &options) {
+ // Pass superset AB options to subset options A and B
+ pm.addPass(std::make_unique<TestOptionsPassA>(options));
+ pm.addPass(std::make_unique<TestOptionsPassB>(options));
+ });
}
} // namespace mlir
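
An assumed invocation of the new superset pipeline, written as a lit RUN line in the style of the existing test-options tests (the exact spelling is not taken from this patch):

  // RUN: mlir-opt %s -test-options-super-set-ab-pipeline='foo=true bar=true'

The pipeline forwards the merged options to the two passes, so TestOptionsPassA only sees foo and TestOptionsPassB only sees bar.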
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index feaf5fb..f392bda 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -224,6 +224,9 @@ if config.enable_cuda_runner:
if config.enable_sycl_runner:
tools.extend([add_runtime("mlir_sycl_runtime")])
+if config.enable_levelzero_runner:
+ tools.extend([add_runtime("mlir_levelzero_runtime")])
+
if config.enable_spirv_cpu_runner:
tools.extend([add_runtime("mlir_spirv_cpu_runtime")])
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index b1185e1..d904780 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -34,6 +34,7 @@ config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
+config.enable_levelzero_runner = @MLIR_ENABLE_LEVELZERO_RUNNER@
config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@
diff --git a/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py b/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py
index ee76b6d..bc273bf 100644
--- a/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py
+++ b/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py
@@ -1,7 +1,7 @@
# RUN: %PYTHON -m mlir.dialects.linalg.opdsl.dump_oplib .ops.core_named_ops | FileCheck %s
# Just verify that at least one known op is generated.
-# CHECK: name: matmul
+# CHECK: name: copy
# verify some special cases: negf->NegFOp, powf->PowFOp
# CHECK cpp_class_name: NegFOp