Diffstat (limited to 'mlir')
32 files changed, 920 insertions, 350 deletions
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 4f48385..ba5e48e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -579,7 +579,8 @@ def NVVM_PMEventOp : NVVM_PTXBuilder_Op<"pmevent">, /// mbarrier.init instruction with generic pointer type def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">, - Arguments<(ins LLVM_AnyPointer:$addr, I32:$count, PtxPredicate:$predicate)> { + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I32:$count, PtxPredicate:$predicate)> { let summary = "MBarrier Initialization Op"; let description = [{ The `nvvm.mbarrier.init` operation initializes an *mbarrier object* at the specified @@ -592,48 +593,35 @@ def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">, - Transaction count (tx-count): 0 The operation takes the following operands: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. - `count`: Integer specifying the number of threads that will participate in barrier synchronization. Must be in the range [1, 2²⁰ - 1]. - `predicate`: Optional predicate for conditional execution. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init) }]; - string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init, {$addr, $count}); - }]; let assemblyFormat = "$addr `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)"; + let extraClassDeclaration = [{ bool hasIntrinsic() { if(getPredicate()) return false; return true; } - }]; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { return std::string("mbarrier.init.b64 [%0], %1;"); } - }]; -} - -/// mbarrier.init instruction with shared pointer type -def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared", [NVVMRequiresSM<80>, DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>, - Arguments<(ins LLVM_PointerShared:$addr, I32:$count, PtxPredicate:$predicate)> { - let summary = "Shared MBarrier Initialization Op"; - let description = [{ - This Op is the same as `nvvm.mbarrier.init` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init) + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init_shared, {$addr, $count}); - }]; - let assemblyFormat = "$addr `,` $count (`,` `predicate` `=` $predicate^)? 
attr-dict `:` type(operands)"; - let extraClassDeclaration = "bool hasIntrinsic() { return !getPredicate(); }"; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { return std::string("mbarrier.init.shared.b64 [%0], %1;"); } + auto [id, args] = NVVM::MBarrierInitOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; } def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, - Arguments<(ins LLVM_AnyPointer:$addr)> { + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> { let summary = "MBarrier Invalidation Operation"; let description = [{ The `nvvm.mbarrier.inval` operation invalidates an *mbarrier object* at the @@ -644,30 +632,27 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, It is undefined behavior if the *mbarrier object* is already invalid. The operation takes the following operand: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval) }]; - string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval, {$addr}); - }]; - let assemblyFormat = "$addr attr-dict `:` type(operands)"; -} -def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">, - Arguments<(ins LLVM_PointerShared:$addr)> { - let summary = "Shared MBarrier Invalidation Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.inval` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. + let assemblyFormat = "$addr attr-dict `:` type(operands)"; - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval_shared, {$addr}); + auto [id, args] = NVVM::MBarrierInvalOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr attr-dict `:` type(operands)"; } def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, @@ -2014,6 +1999,9 @@ class MMA_LDST_OPS<list<GEOM> Geom, list<string> Frags, list<string> Types> { // llvm supports and can be extended as needed. 
class NVVM_MMA_OPS { // "wmma" operations + list<list<WMMA_REGS>> fp64_wmma_ops = MMA_OPS< + [GEOM<8, 8, 4>], + ["f64"], [], ["f64"], []>.ret; list<list<WMMA_REGS>> tf32_wmma_ops = MMA_OPS< [GEOM<16, 16, 8>], ["tf32"], [], ["f32"], []>.ret; @@ -2024,6 +2012,7 @@ class NVVM_MMA_OPS { [GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>], ["s8","u8"], [], ["s32"], []>.ret; list<list<WMMA_REGS>> all_wmma_ops = !listconcat( + fp64_wmma_ops, tf32_wmma_ops, fp_wmma_ops, i8_wmma_ops); @@ -2040,9 +2029,17 @@ class NVVM_MMA_OPS { list<WMMA_REGS> ldst_tf32_cd_ops = MMA_LDST_OPS< [GEOM<16, 16, 8>], ["c", "d"], ["f32"]>.ret; + list<WMMA_REGS> ldst_f64_ab_ops = MMA_LDST_OPS< + [GEOM<8, 8, 4>], + ["a", "b"], ["f64"]>.ret; + list<WMMA_REGS> ldst_f64_cd_ops = MMA_LDST_OPS< + [GEOM<8, 8, 4>], + ["c", "d"], ["f64"]>.ret; list<WMMA_REGS> all_ldst_ops = !listconcat(ldst_ab_ops, ldst_cd_ops, ldst_tf32_ab_ops, - ldst_tf32_cd_ops); + ldst_tf32_cd_ops, + ldst_f64_ab_ops, + ldst_f64_cd_ops); // Separate A/B/C fragments (loads) from D (stores). list<WMMA_REGS> all_ld_ops = !filter(op, all_ldst_ops, !ne(op.frag, "d")); list<WMMA_REGS> all_st_ops = !filter(op, all_ldst_ops, !eq(op.frag, "d")); @@ -2349,7 +2346,7 @@ def MMAFragAttr : EnumAttr<NVVM_Dialect, MMAFrag, "mma_frag"> { } def NVVM_WMMALoadOp: NVVM_Op<"wmma.load">, - Results<(outs LLVM_AnyStruct:$res)>, + Results<(outs AnyTypeOf<[LLVM_AnyStruct, F64]>:$res)>, Arguments<(ins LLVM_AnyPointer: $ptr, I32: $stride, I32Attr:$m, I32Attr:$n, I32Attr:$k, MMALayoutAttr:$layout, MMATypesAttr:$eltype, MMAFragAttr:$frag)> { diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index b39207f..e00f3c1 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -323,8 +323,8 @@ def MemRef_ReallocOp : MemRef_Op<"realloc"> { ```mlir %new = memref.realloc %old : memref<64xf32> to memref<124xf32> - %4 = memref.load %new[%index] // ok - %5 = memref.load %old[%index] // undefined behavior + %4 = memref.load %new[%index] : memref<124xf32> // ok + %5 = memref.load %old[%index] : memref<64xf32> // undefined behavior ``` }]; @@ -445,9 +445,10 @@ def MemRef_AllocaScopeOp : MemRef_Op<"alloca_scope", operation: ```mlir - %result = memref.alloca_scope { + %result = memref.alloca_scope -> f32 { + %value = arith.constant 1.0 : f32 ... - memref.alloca_scope.return %value + memref.alloca_scope.return %value : f32 } ``` @@ -478,7 +479,7 @@ def MemRef_AllocaScopeReturnOp : MemRef_Op<"alloca_scope.return", to indicate which values are going to be returned. For example: ```mlir - memref.alloca_scope.return %value + memref.alloca_scope.return %value : f32 ``` }]; @@ -543,11 +544,11 @@ def MemRef_CastOp : MemRef_Op<"cast", [ Example: ```mlir - Cast to concrete shape. - %4 = memref.cast %1 : memref<*xf32> to memref<4x?xf32> + // Cast to concrete shape. + %4 = memref.cast %1 : memref<*xf32> to memref<4x?xf32> - Erase rank information. - %5 = memref.cast %1 : memref<4x?xf32> to memref<*xf32> + // Erase rank information. 
+ %5 = memref.cast %1 : memref<4x?xf32> to memref<*xf32> ``` }]; @@ -613,8 +614,8 @@ def MemRef_DeallocOp : MemRef_Op<"dealloc", [MemRefsNormalizable]> { Example: ```mlir - %0 = memref.alloc() : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1), 1>> - memref.dealloc %0 : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1), 1>> + %0 = memref.alloc() : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> + memref.dealloc %0 : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> ``` }]; @@ -728,13 +729,13 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> { space 1 at indices [%k, %l], would be specified as follows: ```mlir - %num_elements = arith.constant 256 + %num_elements = arith.constant 256 : index %idx = arith.constant 0 : index - %tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 4> - dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] : - memref<40 x 128 x f32>, affine_map<(d0) -> (d0)>, 0>, - memref<2 x 1024 x f32>, affine_map<(d0) -> (d0)>, 1>, - memref<1 x i32>, affine_map<(d0) -> (d0)>, 2> + %tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 2> + memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] : + memref<40 x 128 x f32, affine_map<(d0, d1) -> (d0, d1)>, 0>, + memref<2 x 1024 x f32, affine_map<(d0, d1) -> (d0, d1)>, 1>, + memref<1 x i32, affine_map<(d0) -> (d0)>, 2> ``` If %stride and %num_elt_per_stride are specified, the DMA is expected to @@ -742,8 +743,8 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> { memory space 0 until %num_elements are transferred. ```mlir - dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride, - %num_elt_per_stride : + memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride, + %num_elt_per_stride : ``` * TODO: add additional operands to allow source and destination striding, and @@ -891,10 +892,10 @@ def MemRef_DmaWaitOp : MemRef_Op<"dma_wait"> { Example: ```mlir - dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] : - memref<2048 x f32>, affine_map<(d0) -> (d0)>, 0>, - memref<256 x f32>, affine_map<(d0) -> (d0)>, 1> - memref<1 x i32>, affine_map<(d0) -> (d0)>, 2> + memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] : + memref<2048 x f32, affine_map<(d0) -> (d0)>, 0>, + memref<256 x f32, affine_map<(d0) -> (d0)>, 1>, + memref<1 x i32, affine_map<(d0) -> (d0)>, 2> ... ... dma_wait %tag[%index], %num_elements : memref<1 x i32, affine_map<(d0) -> (d0)>, 2> @@ -1004,8 +1005,8 @@ def MemRef_ExtractStridedMetadataOp : MemRef_Op<"extract_strided_metadata", [ ```mlir %base, %offset, %sizes:2, %strides:2 = - memref.extract_strided_metadata %memref : - memref<10x?xf32>, index, index, index, index, index + memref.extract_strided_metadata %memref : memref<10x?xf32> + -> memref<f32>, index, index, index, index, index // After folding, the type of %m2 can be memref<10x?xf32> and further // folded to %memref. @@ -1013,7 +1014,7 @@ def MemRef_ExtractStridedMetadataOp : MemRef_Op<"extract_strided_metadata", [ offset: [%offset], sizes: [%sizes#0, %sizes#1], strides: [%strides#0, %strides#1] - : memref<f32> to memref<?x?xf32, offset: ?, strides: [?, ?]> + : memref<f32> to memref<?x?xf32, strided<[?, ?], offset:?>> ``` }]; @@ -1182,10 +1183,10 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> { ```mlir // Private variable with an initial value. 
- memref.global "private" @x : memref<2xf32> = dense<0.0,2.0> + memref.global "private" @x : memref<2xf32> = dense<[0.0, 2.0]> // Private variable with an initial value and an alignment (power of 2). - memref.global "private" @x : memref<2xf32> = dense<0.0,2.0> {alignment = 64} + memref.global "private" @x : memref<2xf32> = dense<[0.0, 2.0]> {alignment = 64} // Declaration of an external variable. memref.global "private" @y : memref<4xi32> @@ -1194,7 +1195,7 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> { memref.global @z : memref<3xf16> = uninitialized // Externally visible constant variable. - memref.global constant @c : memref<2xi32> = dense<1, 4> + memref.global constant @c : memref<2xi32> = dense<[1, 4]> ``` }]; @@ -1555,7 +1556,8 @@ def MemRef_ReinterpretCastOp %dst = memref.reinterpret_cast %src to offset: [%offset], sizes: [%sizes], - strides: [%strides] + strides: [%strides] : + memref<*xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>> ``` means that `%dst`'s descriptor will be: ```mlir @@ -1695,12 +1697,12 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [ ```mlir // Reshape statically-shaped memref. %dst = memref.reshape %src(%shape) - : (memref<4x1xf32>, memref<1xi32>) to memref<4xf32> + : (memref<4x1xf32>, memref<1xi32>) -> memref<4xf32> %dst0 = memref.reshape %src(%shape0) - : (memref<4x1xf32>, memref<2xi32>) to memref<2x2xf32> + : (memref<4x1xf32>, memref<2xi32>) -> memref<2x2xf32> // Flatten unranked memref. %dst = memref.reshape %src(%shape) - : (memref<*xf32>, memref<1xi32>) to memref<?xf32> + : (memref<*xf32>, memref<1xi32>) -> memref<?xf32> ``` b. Source type is ranked or unranked. Shape argument has dynamic size. @@ -1709,10 +1711,10 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [ ```mlir // Reshape dynamically-shaped 1D memref. %dst = memref.reshape %src(%shape) - : (memref<?xf32>, memref<?xi32>) to memref<*xf32> + : (memref<?xf32>, memref<?xi32>) -> memref<*xf32> // Reshape unranked memref. %dst = memref.reshape %src(%shape) - : (memref<*xf32>, memref<?xi32>) to memref<*xf32> + : (memref<*xf32>, memref<?xi32>) -> memref<*xf32> ``` }]; diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td index 6fb9a95..054c13a 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td @@ -26,4 +26,22 @@ def ComputeRegionOpInterface : OpInterface<"ComputeRegionOpInterface"> { ]; } +def PartialEntityAccessOpInterface : OpInterface<"PartialEntityAccessOpInterface"> { + let cppNamespace = "::mlir::acc"; + + let description = [{ + An interface for operations that access a partial entity such as + field or array element access. + }]; + + let methods = [ + InterfaceMethod<"Get the base entity being accessed", "::mlir::Value", + "getBaseEntity", (ins)>, + InterfaceMethod<"Check if this is a complete view of the entity", "bool", + "isCompleteView", (ins), [{ + return false; + }]>, + ]; +} + #endif // OPENACC_OPS_INTERFACES diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h index 563c1e0..9647357 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h @@ -47,6 +47,11 @@ std::string getVariableName(mlir::Value v); /// Returns an empty string if not possible to generate a recipe name. std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type); +// Get the base entity from partial entity access. 
This is used for getting +// the base `struct` from an operation that only accesses a field or the +// base `array` from an operation that only accesses a subarray. +mlir::Value getBaseEntity(mlir::Value val); + } // namespace acc } // namespace mlir diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 19a5231..40352b4 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -379,28 +379,28 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { ); let builders = [ - AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $lane_layout, + AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $inst_data, + "llvm::ArrayRef<int32_t>": $lane_layout, "llvm::ArrayRef<int32_t>": $lane_data), [{ auto sg_layout = DenseI32ArrayAttr(); auto sg_data = DenseI32ArrayAttr(); - auto inst_data = DenseI32ArrayAttr(); auto order = DenseI32ArrayAttr(); - return $_get($_ctxt, sg_layout, sg_data, inst_data, + return $_get($_ctxt, sg_layout, sg_data, + DenseI32ArrayAttr::get($_ctxt, inst_data), DenseI32ArrayAttr::get($_ctxt, lane_layout), DenseI32ArrayAttr::get($_ctxt, lane_data), order); }]>, AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $lane_layout, - "llvm::ArrayRef<int32_t>": $lane_data, - "llvm::ArrayRef<int32_t>": $order), + "llvm::ArrayRef<int32_t>": $lane_data), [{ - return $_get($_ctxt, - /*sg_layout =*/ nullptr, - /*sg_data =*/ nullptr, - /*inst_data =*/ nullptr, + auto sg_layout = DenseI32ArrayAttr(); + auto sg_data = DenseI32ArrayAttr(); + auto inst_data = DenseI32ArrayAttr(); + auto order = DenseI32ArrayAttr(); + return $_get($_ctxt, sg_layout, sg_data, inst_data, DenseI32ArrayAttr::get($_ctxt, lane_layout), - DenseI32ArrayAttr::get($_ctxt, lane_data), - DenseI32ArrayAttr::get($_ctxt, order)); + DenseI32ArrayAttr::get($_ctxt, lane_data), order); }]>, AttrBuilder<(ins "DenseI32ArrayAttr": $lane_layout, "DenseI32ArrayAttr": $lane_data, diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h deleted file mode 100644 index 8aa9536..0000000 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h +++ /dev/null @@ -1,30 +0,0 @@ -//===- XeGPUTargetInfo.h - Target constants ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_ -#define MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_ - -namespace mlir { -namespace xegpu { -/// HW dependent constants. -/// TODO: These constants should be queried from the target information. -namespace targetinfo { -constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup. -/// If DPAS A or B operands have low precision element types they must be packed -/// according to the following sizes. -constexpr unsigned packedSizeInBitsForDefault = - 16; // Minimum packing size per register for DPAS A. -constexpr unsigned packedSizeInBitsForDpasB = - 32; // Minimum packing size per register for DPAS B. -constexpr unsigned packedSizeInBitsForGatherScatter = - 32; // Minimum packing size per register for Gather and Scatter ops. 
-} // namespace targetinfo -} // namespace xegpu -} // namespace mlir - -#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_ diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 564d9c4..b7af541 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -43,7 +43,12 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> { let options = [Option< "printOnly", "print-analysis-only", "bool", /*default=*/"false", - "Print the result of layout propagation analysis and exit.">]; + "Print the result of layout propagation analysis and exit.">, + Option< + "layoutKind", "layout-kind", "std::string", + /*default=*/"\"lane\"", + "Propagate a `sg` / `inst` / `lane` level of xegpu layouts."> + ]; } def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> { diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h index dcb2ad5..b3231a1 100644 --- a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h +++ b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h @@ -270,6 +270,8 @@ inline const uArch *getUArch(llvm::StringRef archName) { return PVCuArch::getInstance(); else if (archName.equals_insensitive("bmg")) return BMGuArch::getInstance(); + else + llvm_unreachable("No matching uArch found"); return nullptr; } diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h index ea33e88..8f23b89 100644 --- a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h +++ b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h @@ -29,6 +29,8 @@ namespace mlir { namespace xegpu { namespace uArch { +constexpr unsigned generalPackedFormatBitSize{32}; + // An enum class to represent the scope of an instruction enum class InstructionScope { Lane, Subgroup, Workgroup, Cluster }; enum class InstructionKind { diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index a9efada..ec182f1 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -846,13 +846,8 @@ struct NVGPUMBarrierInitLowering Value barrier = getMbarrierPtr(b, mbarrierType, adaptor.getBarriers(), adaptor.getMbarId(), rewriter); Value count = truncToI32(b, adaptor.getCount()); - if (isMbarrierShared(mbarrierType)) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>( - op, barrier, count, adaptor.getPredicate()); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count, - adaptor.getPredicate()); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count, + adaptor.getPredicate()); return success(); } }; diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 41d8d53..69a317ec 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -716,7 +716,7 @@ lowerReductionWithStartValue(ConversionPatternRewriter &rewriter, Location loc, accumulator = getOrCreateAccumulator<ReductionNeutral>(rewriter, loc, llvmType, accumulator); return LLVMRedIntrinOp::create(rewriter, loc, llvmType, - /*startValue=*/accumulator, vectorOperand, + /*start_value=*/accumulator, vectorOperand, fmf); } @@ -743,7 +743,7 @@ static Value lowerPredicatedReductionWithStartValue( Value vectorLength = 
createVectorLengthValue(rewriter, loc, vectorOperand.getType()); return LLVMVPRedIntrinOp::create(rewriter, loc, llvmType, - /*startValue=*/accumulator, vectorOperand, + /*start_value=*/accumulator, vectorOperand, mask, vectorLength); } diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp index e08cc6f..d428fbf 100644 --- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -1106,10 +1106,7 @@ static bool isUniformDefinition(Value value, return false; } - if (!value.getType().isIntOrIndexOrFloat()) - return false; - - return true; + return value.getType().isIntOrIndexOrFloat(); } /// Generates a broadcast op for the provided uniform value using the diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 81c3069..ec1571a 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -416,13 +416,39 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op, if (ci.clusterSize >= 32) { if (chipset.majorVersion <= 9) { // Broadcast last value from each row to next row. - // Use row mask to avoid polluting rows 1 and 3. + // Use row mask to avoid polluting row 0 (and row 2 if wave-64). dpp = amdgpu::DPPOp::create(rewriter, loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15, rewriter.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false); res = vector::makeArithReduction( rewriter, loc, gpu::convertReductionKind(mode), res, dpp); + + // For subgroupSize = 64, at this point lanes [16, 32) contain the full + // reduction over lanes [0, 32), but lanes [0, 16) do not. Similarly, + // lanes [48, 64) contain the full reduction over lanes [32, 64), but + // lanes [32, 48) do not. + // + // If subgroup size is 64 and cluster size is 64, we don't need lanes [0, + // 16) and [32, 48) to have the correct cluster-32 reduction values at + // this point, because only lane 63's value will ultimately be read in + // this full-cluster case. + // + // If subgroup size is 64 and cluster size is 32, we need to ensure that + // lanes [0, 16) and [32, 48) have the correct final cluster-32 reduction + // values (subgroup_reduce guarantees that all lanes within each cluster + // contain the final reduction value). We do this by broadcasting lane + // 31's value to lanes [0, 16) and lane 63's value to lanes [32, 48). + // + // See https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations + // for an illustration of how this within-cluster broadcast works with a + // swizzle. + if (ci.subgroupSize == 64 && ci.clusterSize == 32) { + res = + amdgpu::SwizzleBitModeOp::create(rewriter, loc, res, /*and_mask=*/0, + /*or_mask=*/31, + /*xor_mask=*/0); + } } else if (chipset.majorVersion <= 12) { // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
Value uint32Max = arith::ConstantOp::create( diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index f0de4db..a5ffb9e 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -896,6 +896,12 @@ std::pair<mlir::Type, unsigned> NVVM::inferMMAType(NVVM::MMATypes type, } else if (type == NVVM::MMATypes::f32) { elementType = builder.getF32Type(); numberElements = 8; + } else if (type == NVVM::MMATypes::f64) { + elementType = builder.getF64Type(); + if (frag == NVVM::MMAFrag::a || frag == NVVM::MMAFrag::b) + numberElements = 1; + else + numberElements = 2; } else if (type == NVVM::MMATypes::tf32) { elementType = builder.getI32Type(); numberElements = 4; @@ -954,6 +960,14 @@ LogicalResult NVVM::WMMALoadOp::verify() { return emitOpError() << "invalid attribute combination"; std::pair<Type, unsigned> typeInfo = inferMMATypeFromMNK( getEltype(), getFrag(), getM(), getN(), getK(), getContext()); + // Special case for f64 fragments + Type f64Ty = Float64Type::get(getContext()); + if (typeInfo.first == f64Ty && typeInfo.second == 1) { + if (getType() != f64Ty) + return emitOpError("expected destination type to be f64"); + return success(); + } + // Everything else is a struct Type dstType = LLVM::LLVMStructType::getLiteral( getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first)); if (getType() != dstType) @@ -1608,9 +1622,52 @@ void Tcgen05MmaSmemDescOp::createSmemDescriptor(Operation &op, } //===----------------------------------------------------------------------===// +// getPtx methods +//===----------------------------------------------------------------------===// + +std::string NVVM::MBarrierInitOp::getPtx() { + unsigned addressSpace = + llvm::cast<LLVM::LLVMPointerType>(getAddr().getType()).getAddressSpace(); + return (addressSpace == NVVMMemorySpace::Shared) + ? std::string("mbarrier.init.shared.b64 [%0], %1;") + : std::string("mbarrier.init.b64 [%0], %1;"); +} + +//===----------------------------------------------------------------------===// // getIntrinsicID/getIntrinsicIDAndArgs methods //===----------------------------------------------------------------------===// +mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierInitOp>(op); + unsigned addressSpace = + llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType()) + .getAddressSpace(); + llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) + ? llvm::Intrinsic::nvvm_mbarrier_init_shared + : llvm::Intrinsic::nvvm_mbarrier_init; + + // Fill the Intrinsic Args + llvm::SmallVector<llvm::Value *> args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getCount())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierInvalOp>(op); + unsigned addressSpace = + llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType()) + .getAddressSpace(); + llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) + ? 
llvm::Intrinsic::nvvm_mbarrier_inval_shared + : llvm::Intrinsic::nvvm_mbarrier_inval; + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + #define CP_ASYNC_ID_IMPL(mod, size, suffix) \ llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index b09112b..3a43382 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -1958,7 +1958,7 @@ enum class OuterOrInnerPerm { Outer = 0, Inner = 1 }; /// Return true if either `op` or `permutation` are empty to allow a simpler /// polymorphic implementation. template <typename RelayoutOpTy> -bool isValidPackingPermutation( +static bool isValidPackingPermutation( RelayoutOpTy op, ArrayRef<int64_t> permutation, OuterOrInnerPerm outerOrInnerPerm = OuterOrInnerPerm::Outer) { static_assert( @@ -4322,9 +4322,10 @@ DiagnosedSilenceableFailure transform::TransposeMatmulOp::applyToOne( // InsertSliceToCopyOp //===----------------------------------------------------------------------===// template <typename OpTy> -DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target, - transform::ApplyToEachResultList &results, - transform::TransformState &state) { +static DiagnosedSilenceableFailure +doit(RewriterBase &rewriter, OpTy target, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { static_assert(llvm::is_one_of<OpTy, tensor::InsertSliceOp, tensor::ParallelInsertSliceOp>() && "wrong op type"); diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp index 660c313..fbac28e 100644 --- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp +++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp @@ -145,3 +145,13 @@ std::string mlir::acc::getRecipeName(mlir::acc::RecipeKind kind, return recipeName; } + +mlir::Value mlir::acc::getBaseEntity(mlir::Value val) { + if (auto partialEntityAccessOp = + dyn_cast<PartialEntityAccessOpInterface>(val.getDefiningOp())) { + if (!partialEntityAccessOp.isCompleteView()) + return partialEntityAccessOp.getBaseEntity(); + } + + return val; +} diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index ad8255a..ae3423c 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -4336,7 +4336,7 @@ OpFoldResult ExtractStridedSliceOp::fold(FoldAdaptor adaptor) { // ExtractStridedSliceOp(splat ConstantOp) -> ConstantOp. if (auto splat = llvm::dyn_cast_if_present<SplatElementsAttr>(adaptor.getSource())) - DenseElementsAttr::get(getType(), splat.getSplatValue<Attribute>()); + return DenseElementsAttr::get(getType(), splat.getSplatValue<Attribute>()); // ExtractStridedSliceOp(non-splat ConstantOp) -> ConstantOp. 
return foldExtractStridedSliceNonSplatConstant(*this, adaptor.getSource()); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index f9aa28d5..83406c8 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -11,7 +11,6 @@ #include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" @@ -229,8 +228,10 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError, } if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) { - return emitError() - << "expected inst_data and lane_layout to have the same rank"; + return emitError() << "expected inst_data and lane_layout to have the same " + "rank, got inst_data " + << inst_data.size() << ", lane_layout " + << lane_layout.size(); } // sg_data is optional for Workgroup layout, but its presence requires @@ -569,8 +570,8 @@ TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError, // for gather and scatter ops, Low-precision types are packed in 32-bit units. unsigned bitWidth = elementType.getIntOrFloatBitWidth(); int chunkAlignmentFactor = - bitWidth < targetinfo::packedSizeInBitsForGatherScatter - ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth + bitWidth < xegpu::uArch::generalPackedFormatBitSize + ? xegpu::uArch::generalPackedFormatBitSize / bitWidth : 1; auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding); if (scatterAttr) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 8fab255..90eae87 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -14,7 +14,6 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/IR/Attributes.h" @@ -37,6 +36,8 @@ #include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" + namespace mlir { namespace xegpu { #define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT @@ -104,6 +105,8 @@ public: SmallVector<int> getLaneData() const; + SmallVector<int> getInstData() const; + bool isSliceLayout() const { if (!isAssigned()) return false; @@ -137,6 +140,13 @@ SmallVector<int> LayoutInfo::getLaneData() const { [](int64_t val) { return static_cast<int>(val); }); } +SmallVector<int> LayoutInfo::getInstData() const { + if (!isAssigned()) + return {}; + return llvm::map_to_vector(storage.getEffectiveInstDataAsInt(), + [](int64_t val) { return static_cast<int>(val); }); +} + void LayoutInfo::print(raw_ostream &os) const { if (isAssigned()) { os << storage; @@ -174,12 +184,14 @@ LayoutInfo LayoutInfo::transpose(ArrayRef<int64_t> permutation) const { SmallVector<int32_t> laneLayout; SmallVector<int32_t> laneData; + SmallVector<int32_t> instData; for (int64_t idx : permutation) { laneLayout.push_back(static_cast<int32_t>(getLaneLayout()[idx])); laneData.push_back(static_cast<int32_t>(getLaneData()[idx])); + 
instData.push_back(static_cast<int32_t>(getInstData()[idx])); } - return LayoutInfo( - xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData)); + return LayoutInfo(xegpu::LayoutAttr::get(storage.getContext(), instData, + laneLayout, laneData)); } //===----------------------------------------------------------------------===// @@ -192,6 +204,28 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> { using Lattice::Lattice; }; +/// Helper Function to find a proper instruction multiple for the user-supplied +/// sg-level data shape. `candidates` are uArch allowed shapes. +/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count). +template <typename T> +int getLargestDivisor(T dim, ArrayRef<T> candidates, + ArrayRef<T> candidateMultiples = {}) { + static_assert(std::is_integral<T>::value, "T must be an integer type"); + int largest = -1; + SmallVector<T> multiples = {1}; + if (!candidateMultiples.empty()) + multiples = + SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end()); + for (T candidate : candidates) { + for (T multiple : multiples) { + int value = static_cast<int>(candidate * multiple); + if (value != 0 && dim % value == 0 && value > largest) + largest = value; + } + } + return largest; +} + /// Helper Functions to get default layouts. A `default layout` is a layout that /// is assigned to a value when the layout is not fixed by some anchor operation /// (like DPAS). @@ -200,18 +234,32 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> { /// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1]. /// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1]. static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, - unsigned rank) { + unsigned rank, + const xegpu::uArch::uArch *uArch, + ArrayRef<int> instData) { assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); if (rank == 1) { return LayoutInfo( - xegpu::LayoutAttr::get(ctx, {xegpu::targetinfo::subgroupSize}, {1})); + xegpu::LayoutAttr::get(ctx, instData, {uArch->getSubgroupSize()}, {1})); } return LayoutInfo(xegpu::LayoutAttr::get( - ctx, {1, xegpu::targetinfo::subgroupSize}, {1, 1})); + ctx, instData, {1, uArch->getSubgroupSize()}, {1, 1})); +} + +static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, + unsigned rank, int subgroupSize) { + assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); + if (rank == 1) { + return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1})); + } + return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1})); } /// Helper to get the default layout for a vector type. static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, + const xegpu::uArch::uArch *uArch, + ArrayRef<int> instData, + unsigned packingSize, bool isScattered = false) { // Expecting a 1D or 2D vector. assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) && @@ -221,28 +269,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (vectorTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1); + return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch, instData); // Packing factor is determined by the element type bitwidth. - int packingFactor = 1; unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); + int packingFactor = bitwidth < packingSize ? 
packingSize / bitwidth : 1; if (isScattered) { - packingFactor = - bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter - ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth - : 1; - return LayoutInfo(xegpu::LayoutAttr::get( - vectorTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, - {1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData, + {uArch->getSubgroupSize(), 1}, + {1, packingFactor})); } - if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) - packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth; - return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), - {1, xegpu::targetinfo::subgroupSize}, + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData, + {1, uArch->getSubgroupSize()}, {1, packingFactor})); } /// Helper to get the default layout for a vector type. static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, + const xegpu::uArch::uArch *uArch, + ArrayRef<int> instData, + unsigned packingSize, bool isScattered = false) { // Expecting a 1D or 2D vector. assert((tdescTy.getRank() == 1 || tdescTy.getRank() == 2) && @@ -252,27 +297,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (tdescTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1); + return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch, instData); // Packing factor is determined by the element type bitwidth. unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth(); - + int subgroupSize = uArch->getSubgroupSize(); + int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1; if (isScattered) { - int packingFactor = - bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter - ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth - : 1; return LayoutInfo(xegpu::LayoutAttr::get( - tdescTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, - {1, packingFactor})); + tdescTy.getContext(), instData, {subgroupSize, 1}, {1, packingFactor})); } - int packingFactor = - (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) - ? xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth - : 1; - return LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), - {1, xegpu::targetinfo::subgroupSize}, - {1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get( + tdescTy.getContext(), instData, {1, subgroupSize}, {1, packingFactor})); } /// Helper Function to get the expected layouts for DPAS operands. `lane_data` @@ -281,25 +317,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, /// `packedSizeInBitsForDefault` /// * For B operand, the data must be packed in minimum /// `packedSizeInBitsForDpasB` -static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, - unsigned operandNum) { +static LayoutInfo +getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum, + const xegpu::uArch::uArch *uArch, + ArrayRef<int> instData, unsigned packingSize) { Type elementTy = vectorTy.getElementType(); assert(elementTy.isIntOrFloat() && "Expected int or float type in DPAS operands"); - SmallVector<int32_t, 2> layout({1, xegpu::targetinfo::subgroupSize}); + SmallVector<int32_t, 2> layout({1, uArch->getSubgroupSize()}); // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and // must have the VNNI format. 
- if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < - xegpu::targetinfo::packedSizeInBitsForDpasB) { + if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < packingSize) { SmallVector<int32_t, 2> data( - {static_cast<int32_t>(xegpu::targetinfo::packedSizeInBitsForDpasB / - elementTy.getIntOrFloatBitWidth()), + {static_cast<int32_t>(packingSize / elementTy.getIntOrFloatBitWidth()), 1}); return LayoutInfo( - xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data)); + xegpu::LayoutAttr::get(vectorTy.getContext(), instData, layout, data)); } // Otherwise, return the default layout for the vector type. - return getDefaultSIMTLayoutInfo(vectorTy); + return getDefaultSIMTLayoutInfo(vectorTy, uArch, instData, packingSize); } //===----------------------------------------------------------------------===// @@ -456,7 +492,37 @@ void LayoutInfoPropagation::visitPrefetchNdOp( // Here we assign the default layout to the tensor descriptor operand of // prefetch. auto tdescTy = prefetch.getTensorDescType(); - auto prefetchLayout = getDefaultSIMTLayoutInfo(tdescTy); + + auto uArch = getUArch(getChipStr(prefetch).value_or("")); + const auto *uArchInstruction = + dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>( + uArch->getInstruction( + xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch)); + + auto blockWHC = + uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType()); + if (!blockWHC) + prefetch.emitWarning("No known block params found for the element type."); + auto [bWidth, bHeight, bCount] = blockWHC.value(); + SmallVector<int> instData; + int instWidth = getLargestDivisor( + static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, + bCount); + if (instWidth == -1) + prefetch.emitWarning( + "No suitable instruction multiple found for the given shape."); + if (tdescTy.getRank() == 1) + instData = {instWidth}; + else { + int instHeight = getLargestDivisor( + static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); + if (instHeight == -1) + prefetch.emitWarning( + "No suitable instruction multiple found for the given shape."); + instData = {instHeight, instWidth}; + } + auto prefetchLayout = getDefaultSIMTLayoutInfo( + tdescTy, uArch, instData, uArchInstruction->getPackedFormatBitSize()); // Propagate the layout to the source tensor descriptor. propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout)); } @@ -475,10 +541,11 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( reduction.emitWarning("Expecting output type to be 1D vector."); return; } + auto uArch = getUArch(xegpu::getChipStr(reduction).value_or("")); // Given that the result is 1D, the layout of the operand should be 2D with // default layout. - LayoutInfo operandLayout = - getDefaultSIMTLayoutInfo(reduction->getContext(), 2); + LayoutInfo operandLayout = getDefaultSIMTLayoutInfo( + reduction->getContext(), 2, uArch->getSubgroupSize()); propagateIfChanged(operands[0], operands[0]->meet(operandLayout)); // Accumulator should have the same layout as the result. 
propagateIfChanged(operands[1], operands[1]->meet(resultLayout)); @@ -557,15 +624,53 @@ void LayoutInfoPropagation::visitDpasOp( ArrayRef<const LayoutInfoLattice *> results) { VectorType aTy = dpas.getLhsType(); VectorType bTy = dpas.getRhsType(); - propagateIfChanged( - operands[0], operands[0]->meet(getSIMTLayoutInfoForDPASOperand(aTy, 0))); - propagateIfChanged( - operands[1], operands[1]->meet(getSIMTLayoutInfoForDPASOperand(bTy, 1))); + + auto uArch = getUArch(getChipStr(dpas).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + const auto *uArchInstruction = + dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction( + xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc)); + + const unsigned dataALen = aTy.getShape().front(); + auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); + const int maxALen = + getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); + if (maxALen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + + const unsigned dataBLen = bTy.getShape().back(); + auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType()); + const int maxBLen = + getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); + if (maxBLen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + SmallVector<int> instDataA = {maxALen, subgroupSize}; + SmallVector<int> instDataB = {subgroupSize, maxBLen}; + + propagateIfChanged(operands[0], + operands[0]->meet(getSIMTLayoutInfoForDPASOperand( + aTy, 0, uArch, instDataA, + uArchInstruction->getPackedFormatBitSizeA()))); + propagateIfChanged(operands[1], + operands[1]->meet(getSIMTLayoutInfoForDPASOperand( + bTy, 1, uArch, instDataB, + uArchInstruction->getPackedFormatBitSizeB()))); if (operands.size() > 2) { VectorType cTy = dpas.getAccType(); - propagateIfChanged( - operands[2], - operands[2]->meet(getSIMTLayoutInfoForDPASOperand(cTy, 2))); + const unsigned dataCLen = bTy.getShape().back(); + auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType()); + const int maxCLen = + getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); + if (maxCLen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + SmallVector<int> instDataC = {maxALen, maxCLen}; + propagateIfChanged(operands[2], + operands[2]->meet(getSIMTLayoutInfoForDPASOperand( + cTy, 2, uArch, instDataC, + uArchInstruction->getPackedFormatBitSizeB()))); } } @@ -573,7 +678,38 @@ void LayoutInfoPropagation::visitDpasOp( void LayoutInfoPropagation::visitStoreNdOp( xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands, ArrayRef<const LayoutInfoLattice *> results) { - LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType()); + + auto uArch = getUArch(getChipStr(store).value_or("")); + const auto *uArchInstruction = + dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>( + uArch->getInstruction( + xegpu::uArch::InstructionKind::Subgroup2DBlockStore)); + VectorType dataTy = store.getValueType(); + auto blockWHC = uArchInstruction->getBlockWidthHeightCount( + store.getValueType().getElementType()); + if (!blockWHC) + store.emitWarning("No known block params found for the element type."); + auto [bWidth, bHeight, bCount] = blockWHC.value(); + SmallVector<int> instData; + int instWidth = getLargestDivisor( + static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, + bCount); + if (instWidth == -1) + store.emitWarning( + "No suitable 
instruction multiple found for the given shape."); + if (dataTy.getRank() == 1) + instData = {instWidth}; + else { + int instHeight = getLargestDivisor( + static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); + if (instHeight == -1) + store.emitWarning( + "No suitable instruction multiple found for the given shape."); + instData = {instHeight, instWidth}; + } + LayoutInfo storeLayout = + getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData, + uArchInstruction->getPackedFormatBitSize()); // Both operands should have the same layout for (LayoutInfoLattice *operand : operands) propagateIfChanged(operand, operand->meet(storeLayout)); @@ -694,10 +830,23 @@ void LayoutInfoPropagation::visitLoadGatherOp( load.emitWarning("Not propagating, non-vector payload supplied."); return; } - LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true); + auto uArch = getUArch(getChipStr(load).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + SmallVector<int> instData{subgroupSize}; + if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1) + instData.push_back(chunkSize); + else if (auto srcTdescTy = + dyn_cast<xegpu::TensorDescType>(load.getSourceType())) { + if (srcTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } + LayoutInfo layout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), + /*scattered*/ true); // Mask operand should have 1D default layout. - LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1); + LayoutInfo maskLayout = + getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize); // Propagate the new layout to the tensor descriptor operand. if (isa<xegpu::TensorDescType>(load.getSourceType())) @@ -717,8 +866,10 @@ void LayoutInfoPropagation::visitCreateDescOp( // Need the layout of the descriptor to propagate to the operands. if (!descLayout.isAssigned()) return; + auto uArch = getUArch(getChipStr(createDesc).value_or("")); // For offset operand propagate 1D default layout. 
- LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1); + LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1, + uArch->getSubgroupSize()); propagateIfChanged(operands[1], operands[1]->meet(layout)); } @@ -735,18 +886,30 @@ void LayoutInfoPropagation::visitStoreScatterOp( storeScatter.emitWarning("Not propagating, non-vector payload supplied."); return; } + auto uArch = getUArch(getChipStr(storeScatter).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + auto payloadShape = payloadTy.getShape(); if (payloadShape.size() > 1) assert( - payloadShape[0] == xegpu::targetinfo::subgroupSize && + payloadShape[0] == subgroupSize && "Expected the first dimension of 2D tensor descriptor to be equal to " "subgroup size."); - LayoutInfo payloadLayout = - getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true); + SmallVector<int> instData{subgroupSize}; + if (auto chunkSize = storeScatter.getChunkSize().value_or(0); chunkSize > 1) + instData.push_back(chunkSize); + else if (auto dstTdescTy = + dyn_cast<xegpu::TensorDescType>(storeScatter.getDestType())) { + if (dstTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } + LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), + /*scattered=*/true); LayoutInfo maskLayout = - getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1); + getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); // Propagate the payload operand layout propagateIfChanged(operands[0], operands[0]->meet(payloadLayout)); // Propagate the destination (if tdesc) operand layout @@ -1023,9 +1186,13 @@ void XeGPUPropagateLayoutPass::runOnOperation() { LayoutInfo layout = analysis.getLayoutInfo(val); if (!layout.isAssigned()) return {}; + xegpu::DistributeLayoutAttr layoutAttr = + cast<xegpu::DistributeLayoutAttr>(layout.get()); + if (this->layoutKind == "lane") + layoutAttr = layoutAttr.dropInstData(); if (layout.isSliceLayout()) - return cast<xegpu::SliceAttr>(layout.get()); - return cast<xegpu::LayoutAttr>(layout.get()); + return cast<xegpu::SliceAttr>(layoutAttr); + return cast<xegpu::LayoutAttr>(layoutAttr); }; mlir::OpBuilder builder(&getContext()); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index d09dc19..5a3b27e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -11,10 +11,10 @@ #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -159,17 +159,18 @@ static bool requirePacked(const xegpu::LayoutAttr layout) { /// Helper function to check if the layout requires a transpose effect. static bool requireTranspose(const xegpu::LayoutAttr layout, - const std::string &chipStr) { + const xegpu::uArch::uArch *uArch) { // Return false for unsupported targets. // TODO: Add more support or move to target info. 
- if (chipStr != "pvc" && chipStr != "bmg") + if (uArch->getName().equals_insensitive("pvc") && + uArch->getName().equals_insensitive("bmg")) return false; if (!layout) return false; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); if (laneLayout.size() != 2) return false; - return laneLayout[0] == xegpu::targetinfo::subgroupSize && laneLayout[1] == 1; + return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1; } /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body @@ -199,6 +200,11 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> { using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern; LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, PatternRewriter &rewriter) const override { + auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or("")); + if (!uArch) + return rewriter.notifyMatchFailure( + gpuFuncOp, "Subgroup distribution requires target attribute attached " + "to set the warp size"); // If the function only contains a single void return, skip. if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) { return isa<gpu::ReturnOp>(op) && !op.getNumOperands(); @@ -230,7 +236,7 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> { ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults(); auto warpOp = gpu::WarpExecuteOnLane0Op::create( rewriter, laneId.getLoc(), gpuFuncResultType, laneId, - xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(), + uArch->getSubgroupSize(), newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes()); Block &warpBodyBlock = warpOp.getBodyRegion().front(); // Replace the ReturnOp of the original gpu function with a YieldOp. @@ -495,14 +501,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { warpOp, "warp result is not a xegpu::LoadNd op"); auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>(); + auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or("")); + if (!uArch) + return rewriter.notifyMatchFailure( + loadOp, "xegpu::LoadNdOp require target attribute attached to " + "determine transpose " + "requirement"); // Chip information is required to decide if the layout requires transpose // effect. - auto chipStr = xegpu::getChipStr(loadOp); - if (!chipStr) - return rewriter.notifyMatchFailure( - loadOp, - "xegpu::LoadNdOp require chip information to determine transpose " - "requirement"); // Expecting offsets to be present. SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets(); if (offsets.empty()) @@ -556,7 +562,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { // Set the packed attribute if the layout requires it. newLoadOp.setPacked(requirePacked(layout)); // Set the transpose attribute if the layout requires it. - if (requireTranspose(layout, chipStr.value())) + if (requireTranspose(layout, uArch)) newLoadOp.setTranspose( DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0})); Value distributedVal = newWarpOp.getResult(operandIdx); diff --git a/mlir/lib/Transforms/ViewOpGraph.cpp b/mlir/lib/Transforms/ViewOpGraph.cpp index 08cac1f..5790a77 100644 --- a/mlir/lib/Transforms/ViewOpGraph.cpp +++ b/mlir/lib/Transforms/ViewOpGraph.cpp @@ -158,7 +158,8 @@ private: /// Emit a cluster (subgraph). The specified builder generates the body of the /// cluster. Return the anchor node of the cluster. 
- Node emitClusterStmt(function_ref<void()> builder, std::string label = "") { + Node emitClusterStmt(function_ref<void()> builder, + const std::string &label = "") { int clusterId = ++counter; os << "subgraph cluster_" << clusterId << " {\n"; os.indent(); @@ -269,7 +270,7 @@ private: } /// Emit a node statement. - Node emitNodeStmt(std::string label, StringRef shape = kShapeNode, + Node emitNodeStmt(const std::string &label, StringRef shape = kShapeNode, StringRef background = "") { int nodeId = ++counter; AttributeMap attrs; diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 5755ca9..8cce630 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -486,7 +486,7 @@ func.func @mbarrier() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + // CHECK: nvvm.mbarrier.init %[[barPtr]] nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> @@ -516,7 +516,7 @@ func.func @mbarrier_nocomplete() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + // CHECK: nvvm.mbarrier.init %[[barPtr]] nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> @@ -592,7 +592,7 @@ func.func @mbarrier_txcount() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + // CHECK: nvvm.mbarrier.init %[[barPtr]] nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType %tidxreg = nvvm.read.ptx.sreg.tid.x : i32 @@ -643,7 +643,7 @@ func.func @mbarrier_txcount_pred() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]], {{.*}}, predicate = %[[P]] + // CHECK: nvvm.mbarrier.init %[[barPtr]], {{.*}}, predicate = %[[P]] 
nvgpu.mbarrier.init %barrier[%c0], %mine, predicate = %pred : !barrierType %txcount = arith.constant 256 : index diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 6960e83..fbc4c0a 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -8,7 +8,7 @@ // CHECK-LABEL: @init_mbarrier llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %count : i32, %pred : i1) { //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.shared.b64 [$0], $1;", "r,r,b" - nvvm.mbarrier.init.shared %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 + nvvm.mbarrier.init %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" nvvm.mbarrier.init %barrier_gen, %count, predicate = %pred : !llvm.ptr, i32, i1 llvm.return diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir index 87a31ca..1adc418 100644 --- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir @@ -8,11 +8,11 @@ // RUN: mlir-opt --allow-unregistered-dialect \ // RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \ -// RUN: | FileCheck %s --check-prefix=CHECK-GFX9 +// RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX9 // RUN: mlir-opt --allow-unregistered-dialect \ // RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx1030" %s \ -// RUN: | FileCheck %s --check-prefix=CHECK-GFX10 +// RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX10 // CHECK-SUB: gpu.module @kernels { // CHECK-SHFL: gpu.module @kernels { @@ -24,8 +24,7 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>) // // CHECK-SHFL-LABEL: gpu.func @kernel0( - // CHECK-GFX9-LABEL: gpu.func @kernel0( - // CHECK-GFX10-LABEL: gpu.func @kernel0( + // CHECK-GFX-LABEL: gpu.func @kernel0( gpu.func @kernel0(%arg0: vector<5xf16>) kernel { // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16> // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16> @@ -56,8 +55,7 @@ gpu.module @kernels { // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4) // CHECK-SUB: "test.consume" - // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} - // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} + // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} %sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum2) : (vector<5xf16>) -> () @@ -74,8 +72,7 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>) // // CHECK-SHFL-LABEL: gpu.func @kernel1( - // CHECK-GFX9-LABEL: gpu.func @kernel1( - // CHECK-GFX10-LABEL: gpu.func @kernel1( + // CHECK-GFX-LABEL: gpu.func @kernel1( gpu.func @kernel1(%arg0: vector<1xf32>) kernel { // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32> // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32 @@ -100,17 +97,14 @@ gpu.module @kernels { // Note stride is dropped because it is == 1. 
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32 // CHECK-SUB: "test.consume" - // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} quad_perm - // CHECK-GFX9: amdgpu.dpp {{.+}} row_half_mirror - // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} quad_perm - // CHECK-GFX10: amdgpu.dpp {{.+}} row_half_mirror + // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} quad_perm + // CHECK-GFX: amdgpu.dpp {{.+}} row_half_mirror %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum2) : (vector<1xf32>) -> () // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32 // CHECK-SUB: "test.consume" - // CHECK-GFX9-NOT: amdgpu.dpp - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-NOT: amdgpu.dpp // CHECK-GFX10-NOT: rocdl.permlanex16 %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum3) : (vector<1xf32>) -> () @@ -126,11 +120,8 @@ gpu.module @kernels { // // CHECK-SHFL-LABEL: gpu.func @kernel2( // - // CHECK-GFX9-LABEL: gpu.func @kernel2( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel2( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel2( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel { // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8> // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> () @@ -148,8 +139,7 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) - // CHECK-GFX9-LABEL: gpu.func @kernel3( - // CHECK-GFX10-LABEL: gpu.func @kernel3( + // CHECK-GFX-LABEL: gpu.func @kernel3( gpu.func @kernel3(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -169,9 +159,9 @@ gpu.module @kernels { // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32 // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32 // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> () - + // CHECK-GFX9-COUNT-6: amdgpu.dpp - + // CHECK-GFX10-COUNT-4: amdgpu.dpp // CHECK-GFX10: rocdl.permlanex16 // CHECK-GFX10-COUNT-2: rocdl.readlane @@ -185,11 +175,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) // - // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered( - // CHECK-GFX9-SAME: %[[ARG0:.+]]: i32) - // - // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered( - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i32) + // CHECK-GFX-LABEL: gpu.func @kernel3_clustered( + // CHECK-GFX-SAME: %[[ARG0:.+]]: i32) gpu.func @kernel3_clustered(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -204,19 +191,13 @@ gpu.module @kernels { // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> () - // CHECK-GFX9: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX9: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 - // CHECK-GFX9: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX9: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 - // CHECK-GFX9: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : 
i32 - // CHECK-GFX9: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 - - // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 - // CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 - // CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 - // CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 + // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 + // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 + // CHECK-GFX: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-GFX10: "test.consume"(%[[A2]]) : (i32) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32 "test.consume"(%sum0) : (i32) -> () @@ -228,11 +209,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) // - // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered_strided( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered_strided( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel3_clustered_strided( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel3_clustered_strided(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32 @@ -256,11 +234,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) // - // CHECK-GFX9-LABEL: gpu.func @kernel4( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel4( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel4( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel4(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -298,11 +273,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) // - // CHECK-GFX9-LABEL: gpu.func @kernel4_clustered( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel4_clustered( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel4_clustered( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -319,10 +291,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) // - // CHECK-GFX9-LABEL: gpu.func @kernel5( - // - // CHECK-GFX10-LABEL: gpu.func @kernel5( - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16) + // CHECK-GFX-LABEL: gpu.func @kernel5( + // CHECK-GFX-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} 
: i32 @@ -334,7 +304,7 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - + // CHECK-GFX9-COUNT-6: amdgpu.dpp // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 @@ -361,11 +331,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) // - // CHECK-GFX9-LABEL: gpu.func @kernel5_clustered - // CHECK-GFX9-SAME: %[[ARG0:.+]]: i16) - // - // CHECK-GFX10-LABEL: gpu.func @kernel5_clustered - // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16) + // CHECK-GFX-LABEL: gpu.func @kernel5_clustered + // CHECK-GFX-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5_clustered(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -378,25 +345,15 @@ gpu.module @kernels { // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - // CHECK-GFX9: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 - // CHECK-GFX9: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 - // CHECK-GFX9: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 - // CHECK-GFX9: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX9: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 - // CHECK-GFX9: "test.consume"(%[[VAR7]]) : (i16) -> () - - // CHECK-GFX10: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 - // CHECK-GFX10: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 - // CHECK-GFX10: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 - // CHECK-GFX10: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-GFX10: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 - // CHECK-GFX10: "test.consume"(%[[VAR7]]) : (i16) -> () + // CHECK-GFX: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 + // CHECK-GFX: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 + // CHECK-GFX: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 + // CHECK-GFX: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-GFX: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 
+ // CHECK-GFX: "test.consume"(%[[VAR7]]) : (i16) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () @@ -407,11 +364,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel6( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) // - // CHECK-GFX9-LABEL: gpu.func @kernel6( - // CHECK-GFX9-NOT: amdgpu.dpp - // - // CHECK-GFX10-LABEL: gpu.func @kernel6( - // CHECK-GFX10-NOT: amdgpu.dpp + // CHECK-GFX-LABEL: gpu.func @kernel6( + // CHECK-GFX-NOT: amdgpu.dpp gpu.func @kernel6(%arg0: vector<3xi8>) kernel { // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8> // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8> @@ -433,6 +387,44 @@ gpu.module @kernels { gpu.return } + // CHECK-GFX-LABEL: gpu.func @kernel7( + // CHECK-GFX-SAME: %[[ARG0:.+]]: f32) + // + // Checks, common to gfx942 and gfx1030, of + // (1) quad_perm, followed by reduction resulting in reduction over 2 consecutive lanes, + // (2) quad_perm, followed by reduction resulting in reduction over 4 consecutive lanes, + // (3) row_half_mirror, followed by reduction resulting in reduction over 8 consecutive lanes, and + // (4) row_mirror, followed by reduction resulting in reduction over 16 consecutive lanes. + // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A0:.+]] = arith.addf %[[ARG0]], %[[D0]] : f32 + // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A1:.+]] = arith.addf %[[A0]], %[[D1]] : f32 + // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A2:.+]] = arith.addf %[[A1]], %[[D2]] : f32 + // CHECK-GFX: %[[D3:.+]] = amdgpu.dpp %[[A2]] %[[A2]] row_mirror(unit) {bound_ctrl = true} : f32 + // CHECK-GFX: %[[A3:.+]] = arith.addf %[[A2]], %[[D3]] : f32 + // + // Now, on gfx942: + // (1) Lane 15 gets broadcast to lanes [16, 32) and lane 31 gets broadcast to lanes [48, 64], after which + // the reduction in lanes [16, 32) is over the full cluster of the first 32 lanes, and the reduction in lanes + // [48, 64) is over the full cluster of the last 32 lanes. + // (2) Update the reduction value in lanes [0, 16) and [32, 48) with the final reduction result from + // lanes [16, 32) and [48, 64), respectively. 
+ // CHECK-GFX9: %[[BCAST15:.+]] = amdgpu.dpp %[[A3]] %[[A3]] row_bcast_15(unit) {row_mask = 10 : i32} : f32 + // CHECK-GFX9: %[[SUM:.+]] = arith.addf %[[A3]], %[[BCAST15]] : f32 + // CHECK-GFX9: %[[SWIZ:.+]] = amdgpu.swizzle_bitmode %[[SUM]] 0 31 0 : f32 + // CHECK-GFX9: "test.consume"(%[[SWIZ]]) : (f32) -> () + // + // On gfx1030, the final step is to permute the lanes and perform final reduction: + // CHECK-GFX10: rocdl.permlanex16 + // CHECK-GFX10: arith.addf + // CHECK-GFX10: "test.consume" + gpu.func @kernel7(%arg0: f32) kernel { + %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (f32) -> (f32) + "test.consume"(%sum0) : (f32) -> () + gpu.return + } + // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) // diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir index 0243f5e..2505e56 100644 --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -419,8 +419,8 @@ llvm.func private @mbarrier_init_generic(%barrier: !llvm.ptr) { llvm.func private @mbarrier_init_shared(%barrier: !llvm.ptr<3>) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.init.shared %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32 - nvvm.mbarrier.init.shared %barrier, %count : !llvm.ptr<3>, i32 + // CHECK: nvvm.mbarrier.init %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32 + nvvm.mbarrier.init %barrier, %count : !llvm.ptr<3>, i32 llvm.return } @@ -433,8 +433,8 @@ llvm.func private @mbarrier_inval_generic(%barrier: !llvm.ptr) { llvm.func private @mbarrier_inval_shared(%barrier: !llvm.ptr<3>) { - // CHECK: nvvm.mbarrier.inval.shared %{{.*}} : !llvm.ptr<3> - nvvm.mbarrier.inval.shared %barrier : !llvm.ptr<3> + // CHECK: nvvm.mbarrier.inval %{{.*}} : !llvm.ptr<3> + nvvm.mbarrier.inval %barrier : !llvm.ptr<3> llvm.return } diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir index d289d73..2780212 100644 --- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir +++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s gpu.module @test { gpu.func @empty() { diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir new file mode 100644 index 0000000..58461b8 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -0,0 +1,128 @@ +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s + +// CHECK-LABEL: func.func @dpas_f16( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = 
[1, 16], lane_data = [2, 1]>> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> +gpu.module @test { + +func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + return +} +} + +// ----- +gpu.module @test_kernel { + gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %m = arith.muli %block_id_x, %c32 : index + + %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + + %out:3 = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) + -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { + //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x32xf16> + %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> + %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> + + //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], 
lane_data = [1, 1]>} : vector<16x32xf16> + %c = arith.addf %a, %b : vector<16x32xf16> + + //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>> + xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16> + + //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> + %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> + scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc + : !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16> + } + gpu.return + } +} + +// ----- +gpu.module @test_kernel { + gpu.func @elementwise_with_inst_data_12(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %m = arith.muli %block_id_x, %c32 : index + + %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + + %out:3 = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) + -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { + //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : + //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<12x32xf16> + %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> + %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> + + //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<12x32xf16> + %c = arith.addf %a, %b : vector<12x32xf16> + + //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>>> + xegpu.store_nd %c, %arg2: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16> + + //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>> + %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> + scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc + : !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16> + } + gpu.return + } +} + +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_chunksize( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %{{.*}} = arith.constant {layout_result_0 = 
#xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> +// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8], lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_chunksize(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> + : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> + xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> + : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 30f785d..543e119 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -1,5 +1,6 @@ -// RUN: mlir-opt -xegpu-propagate-layout -split-input-file %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout -split-input-file %s | FileCheck %s +gpu.module @test { // CHECK-LABEL: func.func @dpas_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32> @@ -25,8 +26,10 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { // CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], @@ -37,8 +40,10 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre xegpu.store_nd %0, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : @@ -55,8 +60,10 @@ func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_transpose( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16> to vector<16x16xf16> @@ -73,8 +80,10 @@ func.func @vector_transpose(%arg0: 
memref<8x16xf16>, %arg1: memref<16x16xf16>, % xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @extf_truncf( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, %[[ARG1:[0-9a-zA-Z]+]]: // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>) -> vector<8x16xf32> { @@ -88,8 +97,10 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> return %4 : vector<8x16xf32> } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_gather_with_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} @@ -113,8 +124,10 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256 xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_gather_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} @@ -132,8 +145,9 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf xegpu.store_nd %1, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @store_scatter_with_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> -> @@ -148,8 +162,9 @@ func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) { xegpu.store %cst, %0, %cst_0 : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @store_scatter_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, @@ -161,8 +176,9 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { xegpu.store %arg0, %0, %cst_0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scatter_ops_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> @@ -179,8 +195,9 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) { : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scatter_ops( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> @@ -195,8 +212,9 @@ func.func @scatter_ops(%src: memref<256xf16>) { xegpu.store %3, %src[%offset], 
%1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( // CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16> @@ -219,8 +237,9 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> @@ -239,8 +258,9 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} // CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16> @@ -255,8 +275,9 @@ func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16 xegpu.store_nd %3, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> // CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} @@ -270,9 +291,10 @@ func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %a xegpu.store_nd %3, %1 : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @binary_op_one_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, @@ -291,8 +313,9 @@ func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu. xegpu.store_nd %4, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @binary_op_multiple_uses( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, @@ -312,8 +335,9 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: ! 
xegpu.store_nd %2, %arg3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @for_op( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> @@ -353,8 +377,9 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me xegpu.store_nd %2#2, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @if_single_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, @@ -381,8 +406,9 @@ func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tens xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @if_multiple_uses( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, @@ -411,8 +437,9 @@ func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t xegpu.store_nd %1, %arg4 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_outer_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) { // CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [0] : vector<16x16xf32> to vector<16xf32> @@ -422,8 +449,9 @@ func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_inner_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) { // CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [1] : vector<16x16xf32> to vector<16xf32> @@ -433,8 +461,9 @@ func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @update_nd_offset_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> @@ -448,8 +477,9 @@ func.func @update_nd_offset_1d(%arg0: memref<256xf32>){ xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @update_nd_offset_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) { // CHECK: %[[T0:.*]] = 
xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> @@ -463,8 +493,9 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @prefetch_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> @@ -475,8 +506,9 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @prefetch_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> @@ -487,8 +519,9 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scf_while_and_condition( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) @@ -520,8 +553,9 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32 } return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { @@ -541,8 +575,9 @@ func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { @@ -563,3 +598,4 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } +} diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir index 09b8f59..42aa221 100644 --- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir @@ -621,3 +621,14 @@ func.func @invalid_range_equal_bounds() { %0 = nvvm.read.ptx.sreg.warpsize range <i32, 32, 32> : i32 return } + +// ----- + +// Test for correct return type check for wmma.load fragment a for f64 +llvm.func @nvvm_wmma_load_a_f64(%arg0: !llvm.ptr, %arg1 : i32) { + // expected-error @below {{'nvvm.wmma.load' op expected destination type to be f64}} + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = 
#nvvm.mma_type<f64>, frag = #nvvm.mma_frag<a>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32} + : (!llvm.ptr) -> !llvm.struct<(f64)> + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 594ae48..9115de6 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -463,6 +463,43 @@ llvm.func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 : llvm.return } +// CHECK-LABEL: @nvvm_wmma_load_a_f64 +llvm.func @nvvm_wmma_load_a_f64(%arg0: !llvm.ptr, %arg1 : i32) { + // CHECK: call double @llvm.nvvm.wmma.m8n8k4.load.a.row.stride.f64.p0(ptr %{{.*}}, i32 %{{.*}}) + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<a>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32} + : (!llvm.ptr) -> f64 + llvm.return +} + +// CHECK-LABEL: @nvvm_wmma_load_c_f64 +llvm.func @nvvm_wmma_load_c_f64(%arg0: !llvm.ptr, %arg1 : i32) { + // CHECK: call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.row.stride.f64.p0(ptr %{{.*}}, i32 %{{.*}}) + %0 = nvvm.wmma.load %arg0, %arg1 + {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<c>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32} + : (!llvm.ptr) -> !llvm.struct<(f64, f64)> + llvm.return +} + +// CHECK-LABEL: @nvvm_wmma_mma_f64 +llvm.func @nvvm_wmma_mma_f64(%0 : f64, %1 : f64, %2 : f64, %3 : f64) { + // CHECK: { double, double } @llvm.nvvm.wmma.m8n8k4.mma.row.col.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}) + %r = nvvm.wmma.mma %0, %1, %2, %3 + {eltypeA = #nvvm.mma_type<f64>, eltypeB = #nvvm.mma_type<f64>, k = 4 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, m = 8 : i32, n = 8 : i32} + : (f64, f64, f64, f64) + -> !llvm.struct<(f64, f64)> + llvm.return +} + +// CHECK-LABEL: @nvvm_wmma_store_d_f64 +llvm.func @nvvm_wmma_store_d_f64(%arg0: !llvm.ptr, %arg1 : i32, %arg2 : f64, %arg3 : f64) { + // CHECK: call void @llvm.nvvm.wmma.m8n8k4.store.d.row.stride.f64.p0(ptr %{{.*}}, double %{{.*}}, double %{{.*}}, i32 %{{.*}}) + nvvm.wmma.store %arg0, %arg1, %arg2, %arg3 + {eltype = #nvvm.mma_type<f64>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32} + : !llvm.ptr, f64, f64 + llvm.return +} + // CHECK-LABEL: @cp_async llvm.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) { // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}}) diff --git a/mlir/test/mlir-runner/memref-reshape.mlir b/mlir/test/mlir-runner/memref-reshape.mlir index 8c17f1fd..b264e02 100644 --- a/mlir/test/mlir-runner/memref-reshape.mlir +++ b/mlir/test/mlir-runner/memref-reshape.mlir @@ -65,8 +65,8 @@ func.func @reshape_ranked_memref_to_ranked(%input : memref<2x3xf32>, func.func @reshape_unranked_memref_to_ranked(%input : memref<2x3xf32>, %shape : memref<2xindex>) { %unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32> - %output = memref.reshape %input(%shape) - : (memref<2x3xf32>, memref<2xindex>) -> memref<?x?xf32> + %output = memref.reshape %unranked_input(%shape) + : (memref<*xf32>, memref<2xindex>) -> memref<?x?xf32> %unranked_output = memref.cast %output : memref<?x?xf32> to memref<*xf32> call @printMemrefF32(%unranked_output) : (memref<*xf32>) -> () @@ -95,8 +95,8 @@ func.func @reshape_unranked_memref_to_unranked(%input : memref<2x3xf32>, %shape : memref<2xindex>) { %unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32> %dyn_size_shape 
= memref.cast %shape : memref<2xindex> to memref<?xindex> - %output = memref.reshape %input(%dyn_size_shape) - : (memref<2x3xf32>, memref<?xindex>) -> memref<*xf32> + %output = memref.reshape %unranked_input(%dyn_size_shape) + : (memref<*xf32>, memref<?xindex>) -> memref<*xf32> call @printMemrefF32(%output) : (memref<*xf32>) -> () // CHECK: rank = 2 offset = 0 sizes = [3, 2] strides = [2, 1] data = diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp index f1fe53c..6f4e305 100644 --- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp +++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp @@ -570,3 +570,107 @@ TEST_F(OpenACCUtilsTest, getRecipeNamePrivateUnrankedMemref) { getRecipeName(RecipeKind::private_recipe, unrankedMemrefTy); EXPECT_EQ(recipeName, "privatization_memref_Zxi32_"); } + +//===----------------------------------------------------------------------===// +// getBaseEntity Tests +//===----------------------------------------------------------------------===// + +// Local implementation of PartialEntityAccessOpInterface for memref.subview. +// This is implemented locally in the test rather than officially because memref +// operations already have ViewLikeOpInterface, which serves a similar purpose +// for walking through views to the base entity. This test demonstrates how +// getBaseEntity() would work if the interface were attached to memref.subview. +namespace { +struct SubViewOpPartialEntityAccessOpInterface + : public acc::PartialEntityAccessOpInterface::ExternalModel< + SubViewOpPartialEntityAccessOpInterface, memref::SubViewOp> { + Value getBaseEntity(Operation *op) const { + auto subviewOp = cast<memref::SubViewOp>(op); + return subviewOp.getSource(); + } + + bool isCompleteView(Operation *op) const { + // For testing purposes, we'll consider it a partial view (return false). + // The real implementation would need to look at the offsets. 
+ return false; + } +}; +} // namespace + +TEST_F(OpenACCUtilsTest, getBaseEntityFromSubview) { + // Register the local interface implementation for memref.subview + memref::SubViewOp::attachInterface<SubViewOpPartialEntityAccessOpInterface>( + context); + + // Create a base memref + auto memrefTy = MemRefType::get({10, 20}, b.getF32Type()); + OwningOpRef<memref::AllocaOp> allocOp = + memref::AllocaOp::create(b, loc, memrefTy); + Value baseMemref = allocOp->getResult(); + + // Create a subview of the base memref with non-zero offsets + // This creates a 5x10 view starting at [2, 3] in the original 10x20 memref + SmallVector<OpFoldResult> offsets = {b.getIndexAttr(2), b.getIndexAttr(3)}; + SmallVector<OpFoldResult> sizes = {b.getIndexAttr(5), b.getIndexAttr(10)}; + SmallVector<OpFoldResult> strides = {b.getIndexAttr(1), b.getIndexAttr(1)}; + + OwningOpRef<memref::SubViewOp> subviewOp = + memref::SubViewOp::create(b, loc, baseMemref, offsets, sizes, strides); + Value subview = subviewOp->getResult(); + + // Test that getBaseEntity returns the base memref, not the subview + Value baseEntity = getBaseEntity(subview); + EXPECT_EQ(baseEntity, baseMemref); +} + +TEST_F(OpenACCUtilsTest, getBaseEntityNoInterface) { + // Create a memref without the interface + auto memrefTy = MemRefType::get({10}, b.getI32Type()); + OwningOpRef<memref::AllocaOp> allocOp = + memref::AllocaOp::create(b, loc, memrefTy); + Value varPtr = allocOp->getResult(); + + // Test that getBaseEntity returns the value itself when there's no interface + Value baseEntity = getBaseEntity(varPtr); + EXPECT_EQ(baseEntity, varPtr); +} + +TEST_F(OpenACCUtilsTest, getBaseEntityChainedSubviews) { + // Register the local interface implementation for memref.subview + memref::SubViewOp::attachInterface<SubViewOpPartialEntityAccessOpInterface>( + context); + + // Create a base memref + auto memrefTy = MemRefType::get({100, 200}, b.getI64Type()); + OwningOpRef<memref::AllocaOp> allocOp = + memref::AllocaOp::create(b, loc, memrefTy); + Value baseMemref = allocOp->getResult(); + + // Create first subview + SmallVector<OpFoldResult> offsets1 = {b.getIndexAttr(10), b.getIndexAttr(20)}; + SmallVector<OpFoldResult> sizes1 = {b.getIndexAttr(50), b.getIndexAttr(80)}; + SmallVector<OpFoldResult> strides1 = {b.getIndexAttr(1), b.getIndexAttr(1)}; + + OwningOpRef<memref::SubViewOp> subview1Op = + memref::SubViewOp::create(b, loc, baseMemref, offsets1, sizes1, strides1); + Value subview1 = subview1Op->getResult(); + + // Create second subview (subview of subview) + SmallVector<OpFoldResult> offsets2 = {b.getIndexAttr(5), b.getIndexAttr(10)}; + SmallVector<OpFoldResult> sizes2 = {b.getIndexAttr(20), b.getIndexAttr(30)}; + SmallVector<OpFoldResult> strides2 = {b.getIndexAttr(1), b.getIndexAttr(1)}; + + OwningOpRef<memref::SubViewOp> subview2Op = + memref::SubViewOp::create(b, loc, subview1, offsets2, sizes2, strides2); + Value subview2 = subview2Op->getResult(); + + // Test that getBaseEntity on the nested subview returns the first subview + // (since our implementation returns the immediate source, not the ultimate + // base) + Value baseEntity = getBaseEntity(subview2); + EXPECT_EQ(baseEntity, subview1); + + // Test that calling getBaseEntity again returns the original base + Value ultimateBase = getBaseEntity(baseEntity); + EXPECT_EQ(ultimateBase, baseMemref); +} |
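
The chained-subview test above spells out the contract it relies on: getBaseEntity peels only one level of PartialEntityAccessOpInterface, so reaching the ultimate base takes repeated calls. Below is a minimal sketch of that fixed-point walk, written as if it sat next to these tests (same includes and using-declarations as OpenACCUtilsTest.cpp); the helper name walkToUltimateBase is an assumption of this sketch, not part of the patch.

// Sketch only (not part of the patch): repeatedly peel partial views such as
// memref.subview results until getBaseEntity stops making progress. Assumes
// the same context as OpenACCUtilsTest.cpp, where getBaseEntity(Value) is
// visible unqualified.
static Value walkToUltimateBase(Value v) {
  // getBaseEntity returns its argument unchanged when the defining op does not
  // implement PartialEntityAccessOpInterface, so this loop terminates.
  Value base = getBaseEntity(v);
  while (base != v) {
    v = base;
    base = getBaseEntity(v);
  }
  return v;
}

// Usage, mirroring the chained test: walkToUltimateBase(subview2) would return
// baseMemref directly instead of stopping at subview1.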
