Diffstat (limited to 'mlir')
-rw-r--r-- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 87
-rw-r--r-- mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td | 72
-rw-r--r-- mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td | 18
-rw-r--r-- mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h | 5
-rw-r--r-- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 22
-rw-r--r-- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h | 30
-rw-r--r-- mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td | 7
-rw-r--r-- mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h | 2
-rw-r--r-- mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h | 2
-rw-r--r-- mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 9
-rw-r--r-- mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp | 4
-rw-r--r-- mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp | 5
-rw-r--r-- mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 28
-rw-r--r-- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 57
-rw-r--r-- mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp | 9
-rw-r--r-- mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp | 10
-rw-r--r-- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 2
-rw-r--r-- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 11
-rw-r--r-- mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 293
-rw-r--r-- mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 30
-rw-r--r-- mlir/lib/Transforms/ViewOpGraph.cpp | 5
-rw-r--r-- mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 8
-rw-r--r-- mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 2
-rw-r--r-- mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir | 172
-rw-r--r-- mlir/test/Dialect/LLVMIR/nvvm.mlir | 8
-rw-r--r-- mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir | 2
-rw-r--r-- mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir | 128
-rw-r--r-- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 82
-rw-r--r-- mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 11
-rw-r--r-- mlir/test/Target/LLVMIR/nvvmir.mlir | 37
-rw-r--r-- mlir/test/mlir-runner/memref-reshape.mlir | 8
-rw-r--r-- mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp | 104
32 files changed, 920 insertions, 350 deletions
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 4f48385..ba5e48e 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -579,7 +579,8 @@ def NVVM_PMEventOp : NVVM_PTXBuilder_Op<"pmevent">,
/// mbarrier.init instruction with generic pointer type
def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">,
- Arguments<(ins LLVM_AnyPointer:$addr, I32:$count, PtxPredicate:$predicate)> {
+ Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
+ I32:$count, PtxPredicate:$predicate)> {
let summary = "MBarrier Initialization Op";
let description = [{
The `nvvm.mbarrier.init` operation initializes an *mbarrier object* at the specified
@@ -592,48 +593,35 @@ def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">,
- Transaction count (tx-count): 0
The operation takes the following operands:
- - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
- addressing, but the address must still be in the shared memory space.
+ - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
+ must be a pointer to generic or shared::cta memory. When it is generic, the
+ underlying address must be within the shared::cta memory space; otherwise
+ the behavior is undefined.
- `count`: Integer specifying the number of threads that will participate in barrier
synchronization. Must be in the range [1, 2²⁰ - 1].
- `predicate`: Optional predicate for conditional execution.
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init)
}];
- string llvmBuilder = [{
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init, {$addr, $count});
- }];
let assemblyFormat = "$addr `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
+
let extraClassDeclaration = [{
bool hasIntrinsic() { if(getPredicate()) return false; return true; }
- }];
- let extraClassDefinition = [{
- std::string $cppClass::getPtx() { return std::string("mbarrier.init.b64 [%0], %1;"); }
- }];
-}
-
-/// mbarrier.init instruction with shared pointer type
-def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared", [NVVMRequiresSM<80>, DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>,
- Arguments<(ins LLVM_PointerShared:$addr, I32:$count, PtxPredicate:$predicate)> {
- let summary = "Shared MBarrier Initialization Op";
- let description = [{
- This Op is the same as `nvvm.mbarrier.init` except that the *mbarrier object*
- should be accessed using a shared-memory pointer instead of a generic-memory pointer.
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init)
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase& builder);
}];
+
string llvmBuilder = [{
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init_shared, {$addr, $count});
- }];
- let assemblyFormat = "$addr `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
- let extraClassDeclaration = "bool hasIntrinsic() { return !getPredicate(); }";
- let extraClassDefinition = [{
- std::string $cppClass::getPtx() { return std::string("mbarrier.init.shared.b64 [%0], %1;"); }
+ auto [id, args] = NVVM::MBarrierInitOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ createIntrinsicCall(builder, id, args);
}];
}
def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
- Arguments<(ins LLVM_AnyPointer:$addr)> {
+ Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> {
let summary = "MBarrier Invalidation Operation";
let description = [{
The `nvvm.mbarrier.inval` operation invalidates an *mbarrier object* at the
@@ -644,30 +632,27 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">,
It is undefined behavior if the *mbarrier object* is already invalid.
The operation takes the following operand:
- - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
- addressing, but the address must still be in the shared memory space.
+ - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr`
+ must be a pointer to generic or shared::cta memory. When it is generic, the
+ underlying address must be within the shared::cta memory space; otherwise
+ the behavior is undefined.
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval)
}];
- string llvmBuilder = [{
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval, {$addr});
- }];
- let assemblyFormat = "$addr attr-dict `:` type(operands)";
-}
-def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">,
- Arguments<(ins LLVM_PointerShared:$addr)> {
- let summary = "Shared MBarrier Invalidation Operation";
- let description = [{
- This Op is the same as `nvvm.mbarrier.inval` except that the *mbarrier object*
- should be accessed using a shared-memory pointer instead of a generic-memory pointer.
+ let assemblyFormat = "$addr attr-dict `:` type(operands)";
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval)
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase& builder);
}];
+
string llvmBuilder = [{
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval_shared, {$addr});
+ auto [id, args] = NVVM::MBarrierInvalOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ createIntrinsicCall(builder, id, args);
}];
- let assemblyFormat = "$addr attr-dict `:` type(operands)";
}
def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">,
@@ -2014,6 +1999,9 @@ class MMA_LDST_OPS<list<GEOM> Geom, list<string> Frags, list<string> Types> {
// llvm supports and can be extended as needed.
class NVVM_MMA_OPS {
// "wmma" operations
+ list<list<WMMA_REGS>> fp64_wmma_ops = MMA_OPS<
+ [GEOM<8, 8, 4>],
+ ["f64"], [], ["f64"], []>.ret;
list<list<WMMA_REGS>> tf32_wmma_ops = MMA_OPS<
[GEOM<16, 16, 8>],
["tf32"], [], ["f32"], []>.ret;
@@ -2024,6 +2012,7 @@ class NVVM_MMA_OPS {
[GEOM<16, 16, 16>, GEOM<32, 8, 16>, GEOM<8, 32, 16>],
["s8","u8"], [], ["s32"], []>.ret;
list<list<WMMA_REGS>> all_wmma_ops = !listconcat(
+ fp64_wmma_ops,
tf32_wmma_ops,
fp_wmma_ops,
i8_wmma_ops);
@@ -2040,9 +2029,17 @@ class NVVM_MMA_OPS {
list<WMMA_REGS> ldst_tf32_cd_ops = MMA_LDST_OPS<
[GEOM<16, 16, 8>],
["c", "d"], ["f32"]>.ret;
+ list<WMMA_REGS> ldst_f64_ab_ops = MMA_LDST_OPS<
+ [GEOM<8, 8, 4>],
+ ["a", "b"], ["f64"]>.ret;
+ list<WMMA_REGS> ldst_f64_cd_ops = MMA_LDST_OPS<
+ [GEOM<8, 8, 4>],
+ ["c", "d"], ["f64"]>.ret;
list<WMMA_REGS> all_ldst_ops = !listconcat(ldst_ab_ops, ldst_cd_ops,
ldst_tf32_ab_ops,
- ldst_tf32_cd_ops);
+ ldst_tf32_cd_ops,
+ ldst_f64_ab_ops,
+ ldst_f64_cd_ops);
// Separate A/B/C fragments (loads) from D (stores).
list<WMMA_REGS> all_ld_ops = !filter(op, all_ldst_ops, !ne(op.frag, "d"));
list<WMMA_REGS> all_st_ops = !filter(op, all_ldst_ops, !eq(op.frag, "d"));
@@ -2349,7 +2346,7 @@ def MMAFragAttr : EnumAttr<NVVM_Dialect, MMAFrag, "mma_frag"> {
}
def NVVM_WMMALoadOp: NVVM_Op<"wmma.load">,
- Results<(outs LLVM_AnyStruct:$res)>,
+ Results<(outs AnyTypeOf<[LLVM_AnyStruct, F64]>:$res)>,
Arguments<(ins LLVM_AnyPointer: $ptr, I32: $stride, I32Attr:$m,
I32Attr:$n, I32Attr:$k, MMALayoutAttr:$layout,
MMATypesAttr:$eltype, MMAFragAttr:$frag)> {
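With the `.shared` variants above folded into the generic ops, a single `nvvm.mbarrier.init` / `nvvm.mbarrier.inval` form now accepts either a generic or a shared::cta pointer, and the address space selects the intrinsic. A minimal sketch of the shared-memory case (SSA names are illustrative, not taken from this patch):

```mlir
// %bar points into shared::cta memory (LLVM addrspace 3), %count is an i32 thread count.
nvvm.mbarrier.init %bar, %count : !llvm.ptr<3>, i32
nvvm.mbarrier.inval %bar : !llvm.ptr<3>
```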
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index b39207f..e00f3c1 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -323,8 +323,8 @@ def MemRef_ReallocOp : MemRef_Op<"realloc"> {
```mlir
%new = memref.realloc %old : memref<64xf32> to memref<124xf32>
- %4 = memref.load %new[%index] // ok
- %5 = memref.load %old[%index] // undefined behavior
+ %4 = memref.load %new[%index] : memref<124xf32> // ok
+ %5 = memref.load %old[%index] : memref<64xf32> // undefined behavior
```
}];
@@ -445,9 +445,10 @@ def MemRef_AllocaScopeOp : MemRef_Op<"alloca_scope",
operation:
```mlir
- %result = memref.alloca_scope {
+ %result = memref.alloca_scope -> f32 {
+ %value = arith.constant 1.0 : f32
...
- memref.alloca_scope.return %value
+ memref.alloca_scope.return %value : f32
}
```
@@ -478,7 +479,7 @@ def MemRef_AllocaScopeReturnOp : MemRef_Op<"alloca_scope.return",
to indicate which values are going to be returned. For example:
```mlir
- memref.alloca_scope.return %value
+ memref.alloca_scope.return %value : f32
```
}];
@@ -543,11 +544,11 @@ def MemRef_CastOp : MemRef_Op<"cast", [
Example:
```mlir
- Cast to concrete shape.
- %4 = memref.cast %1 : memref<*xf32> to memref<4x?xf32>
+ // Cast to concrete shape.
+ %4 = memref.cast %1 : memref<*xf32> to memref<4x?xf32>
- Erase rank information.
- %5 = memref.cast %1 : memref<4x?xf32> to memref<*xf32>
+ // Erase rank information.
+ %5 = memref.cast %1 : memref<4x?xf32> to memref<*xf32>
```
}];
@@ -613,8 +614,8 @@ def MemRef_DeallocOp : MemRef_Op<"dealloc", [MemRefsNormalizable]> {
Example:
```mlir
- %0 = memref.alloc() : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1), 1>>
- memref.dealloc %0 : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1), 1>>
+ %0 = memref.alloc() : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1>
+ memref.dealloc %0 : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1>
```
}];
@@ -728,13 +729,13 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
space 1 at indices [%k, %l], would be specified as follows:
```mlir
- %num_elements = arith.constant 256
+ %num_elements = arith.constant 256 : index
%idx = arith.constant 0 : index
- %tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 4>
- dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] :
- memref<40 x 128 x f32>, affine_map<(d0) -> (d0)>, 0>,
- memref<2 x 1024 x f32>, affine_map<(d0) -> (d0)>, 1>,
- memref<1 x i32>, affine_map<(d0) -> (d0)>, 2>
+ %tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
+ memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] :
+ memref<40 x 128 x f32, affine_map<(d0, d1) -> (d0, d1)>, 0>,
+ memref<2 x 1024 x f32, affine_map<(d0, d1) -> (d0, d1)>, 1>,
+ memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
```
If %stride and %num_elt_per_stride are specified, the DMA is expected to
@@ -742,8 +743,8 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
memory space 0 until %num_elements are transferred.
```mlir
- dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride,
- %num_elt_per_stride :
+ memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride,
+ %num_elt_per_stride :
```
* TODO: add additional operands to allow source and destination striding, and
@@ -891,10 +892,10 @@ def MemRef_DmaWaitOp : MemRef_Op<"dma_wait"> {
Example:
```mlir
- dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] :
- memref<2048 x f32>, affine_map<(d0) -> (d0)>, 0>,
- memref<256 x f32>, affine_map<(d0) -> (d0)>, 1>
- memref<1 x i32>, affine_map<(d0) -> (d0)>, 2>
+ memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] :
+ memref<2048 x f32, affine_map<(d0) -> (d0)>, 0>,
+ memref<256 x f32, affine_map<(d0) -> (d0)>, 1>,
+ memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
...
...
dma_wait %tag[%index], %num_elements : memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
@@ -1004,8 +1005,8 @@ def MemRef_ExtractStridedMetadataOp : MemRef_Op<"extract_strided_metadata", [
```mlir
%base, %offset, %sizes:2, %strides:2 =
- memref.extract_strided_metadata %memref :
- memref<10x?xf32>, index, index, index, index, index
+ memref.extract_strided_metadata %memref : memref<10x?xf32>
+ -> memref<f32>, index, index, index, index, index
// After folding, the type of %m2 can be memref<10x?xf32> and further
// folded to %memref.
@@ -1013,7 +1014,7 @@ def MemRef_ExtractStridedMetadataOp : MemRef_Op<"extract_strided_metadata", [
offset: [%offset],
sizes: [%sizes#0, %sizes#1],
strides: [%strides#0, %strides#1]
- : memref<f32> to memref<?x?xf32, offset: ?, strides: [?, ?]>
+ : memref<f32> to memref<?x?xf32, strided<[?, ?], offset:?>>
```
}];
@@ -1182,10 +1183,10 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> {
```mlir
// Private variable with an initial value.
- memref.global "private" @x : memref<2xf32> = dense<0.0,2.0>
+ memref.global "private" @x : memref<2xf32> = dense<[0.0, 2.0]>
// Private variable with an initial value and an alignment (power of 2).
- memref.global "private" @x : memref<2xf32> = dense<0.0,2.0> {alignment = 64}
+ memref.global "private" @x : memref<2xf32> = dense<[0.0, 2.0]> {alignment = 64}
// Declaration of an external variable.
memref.global "private" @y : memref<4xi32>
@@ -1194,7 +1195,7 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> {
memref.global @z : memref<3xf16> = uninitialized
// Externally visible constant variable.
- memref.global constant @c : memref<2xi32> = dense<1, 4>
+ memref.global constant @c : memref<2xi32> = dense<[1, 4]>
```
}];
@@ -1555,7 +1556,8 @@ def MemRef_ReinterpretCastOp
%dst = memref.reinterpret_cast %src to
offset: [%offset],
sizes: [%sizes],
- strides: [%strides]
+ strides: [%strides] :
+ memref<*xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>>
```
means that `%dst`'s descriptor will be:
```mlir
@@ -1695,12 +1697,12 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [
```mlir
// Reshape statically-shaped memref.
%dst = memref.reshape %src(%shape)
- : (memref<4x1xf32>, memref<1xi32>) to memref<4xf32>
+ : (memref<4x1xf32>, memref<1xi32>) -> memref<4xf32>
%dst0 = memref.reshape %src(%shape0)
- : (memref<4x1xf32>, memref<2xi32>) to memref<2x2xf32>
+ : (memref<4x1xf32>, memref<2xi32>) -> memref<2x2xf32>
// Flatten unranked memref.
%dst = memref.reshape %src(%shape)
- : (memref<*xf32>, memref<1xi32>) to memref<?xf32>
+ : (memref<*xf32>, memref<1xi32>) -> memref<?xf32>
```
b. Source type is ranked or unranked. Shape argument has dynamic size.
@@ -1709,10 +1711,10 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [
```mlir
// Reshape dynamically-shaped 1D memref.
%dst = memref.reshape %src(%shape)
- : (memref<?xf32>, memref<?xi32>) to memref<*xf32>
+ : (memref<?xf32>, memref<?xi32>) -> memref<*xf32>
// Reshape unranked memref.
%dst = memref.reshape %src(%shape)
- : (memref<*xf32>, memref<?xi32>) to memref<*xf32>
+ : (memref<*xf32>, memref<?xi32>) -> memref<*xf32>
```
}];
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
index 6fb9a95..054c13a 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
@@ -26,4 +26,22 @@ def ComputeRegionOpInterface : OpInterface<"ComputeRegionOpInterface"> {
];
}
+def PartialEntityAccessOpInterface : OpInterface<"PartialEntityAccessOpInterface"> {
+ let cppNamespace = "::mlir::acc";
+
+ let description = [{
+ An interface for operations that access a partial entity such as
+ field or array element access.
+ }];
+
+ let methods = [
+ InterfaceMethod<"Get the base entity being accessed", "::mlir::Value",
+ "getBaseEntity", (ins)>,
+ InterfaceMethod<"Check if this is a complete view of the entity", "bool",
+ "isCompleteView", (ins), [{
+ return false;
+ }]>,
+ ];
+}
+
#endif // OPENACC_OPS_INTERFACES
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
index 563c1e0..9647357 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
@@ -47,6 +47,11 @@ std::string getVariableName(mlir::Value v);
/// Returns an empty string if not possible to generate a recipe name.
std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type);
+// Get the base entity from partial entity access. This is used for getting
+// the base `struct` from an operation that only accesses a field or the
+// base `array` from an operation that only accesses a subarray.
+mlir::Value getBaseEntity(mlir::Value val);
+
} // namespace acc
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 19a5231..40352b4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -379,28 +379,28 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
);
let builders = [
- AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $lane_layout,
+ AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $inst_data,
+ "llvm::ArrayRef<int32_t>": $lane_layout,
"llvm::ArrayRef<int32_t>": $lane_data),
[{
auto sg_layout = DenseI32ArrayAttr();
auto sg_data = DenseI32ArrayAttr();
- auto inst_data = DenseI32ArrayAttr();
auto order = DenseI32ArrayAttr();
- return $_get($_ctxt, sg_layout, sg_data, inst_data,
+ return $_get($_ctxt, sg_layout, sg_data,
+ DenseI32ArrayAttr::get($_ctxt, inst_data),
DenseI32ArrayAttr::get($_ctxt, lane_layout),
DenseI32ArrayAttr::get($_ctxt, lane_data), order);
}]>,
AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $lane_layout,
- "llvm::ArrayRef<int32_t>": $lane_data,
- "llvm::ArrayRef<int32_t>": $order),
+ "llvm::ArrayRef<int32_t>": $lane_data),
[{
- return $_get($_ctxt,
- /*sg_layout =*/ nullptr,
- /*sg_data =*/ nullptr,
- /*inst_data =*/ nullptr,
+ auto sg_layout = DenseI32ArrayAttr();
+ auto sg_data = DenseI32ArrayAttr();
+ auto inst_data = DenseI32ArrayAttr();
+ auto order = DenseI32ArrayAttr();
+ return $_get($_ctxt, sg_layout, sg_data, inst_data,
DenseI32ArrayAttr::get($_ctxt, lane_layout),
- DenseI32ArrayAttr::get($_ctxt, lane_data),
- DenseI32ArrayAttr::get($_ctxt, order));
+ DenseI32ArrayAttr::get($_ctxt, lane_data), order);
}]>,
AttrBuilder<(ins "DenseI32ArrayAttr": $lane_layout,
"DenseI32ArrayAttr": $lane_data,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h
deleted file mode 100644
index 8aa9536..0000000
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===- XeGPUTargetInfo.h - Target constants ---------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
-#define MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
-
-namespace mlir {
-namespace xegpu {
-/// HW dependent constants.
-/// TODO: These constants should be queried from the target information.
-namespace targetinfo {
-constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
-/// If DPAS A or B operands have low precision element types they must be packed
-/// according to the following sizes.
-constexpr unsigned packedSizeInBitsForDefault =
- 16; // Minimum packing size per register for DPAS A.
-constexpr unsigned packedSizeInBitsForDpasB =
- 32; // Minimum packing size per register for DPAS B.
-constexpr unsigned packedSizeInBitsForGatherScatter =
- 32; // Minimum packing size per register for Gather and Scatter ops.
-} // namespace targetinfo
-} // namespace xegpu
-} // namespace mlir
-
-#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 564d9c4..b7af541 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -43,7 +43,12 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
let options = [Option<
"printOnly", "print-analysis-only", "bool",
/*default=*/"false",
- "Print the result of layout propagation analysis and exit.">];
+ "Print the result of layout propagation analysis and exit.">,
+ Option<
+ "layoutKind", "layout-kind", "std::string",
+ /*default=*/"\"lane\"",
+ "Propagate a `sg` / `inst` / `lane` level of xegpu layouts.">
+ ];
}
def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> {
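For reference, the new `layout-kind` option controls how much of the propagated layout is attached to the IR: `lane` (the default) drops the inst-level data, while `inst` keeps it. A sketch of a layout attribute carrying both levels (the concrete values are illustrative only):

```mlir
// Kept as-is under layout-kind=inst; reduced to lane_layout/lane_data under layout-kind=lane.
#xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>
```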
diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
index dcb2ad5..b3231a1 100644
--- a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
@@ -270,6 +270,8 @@ inline const uArch *getUArch(llvm::StringRef archName) {
return PVCuArch::getInstance();
else if (archName.equals_insensitive("bmg"))
return BMGuArch::getInstance();
+ else
+ llvm_unreachable("No matching uArch found");
return nullptr;
}
diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
index ea33e88..8f23b89 100644
--- a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
@@ -29,6 +29,8 @@ namespace mlir {
namespace xegpu {
namespace uArch {
+constexpr unsigned generalPackedFormatBitSize{32};
+
// An enum class to represent the scope of an instruction
enum class InstructionScope { Lane, Subgroup, Workgroup, Cluster };
enum class InstructionKind {
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index a9efada..ec182f1 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -846,13 +846,8 @@ struct NVGPUMBarrierInitLowering
Value barrier = getMbarrierPtr(b, mbarrierType, adaptor.getBarriers(),
adaptor.getMbarId(), rewriter);
Value count = truncToI32(b, adaptor.getCount());
- if (isMbarrierShared(mbarrierType)) {
- rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(
- op, barrier, count, adaptor.getPredicate());
- } else {
- rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count,
- adaptor.getPredicate());
- }
+ rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count,
+ adaptor.getPredicate());
return success();
}
};
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 41d8d53..69a317ec 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -716,7 +716,7 @@ lowerReductionWithStartValue(ConversionPatternRewriter &rewriter, Location loc,
accumulator = getOrCreateAccumulator<ReductionNeutral>(rewriter, loc,
llvmType, accumulator);
return LLVMRedIntrinOp::create(rewriter, loc, llvmType,
- /*startValue=*/accumulator, vectorOperand,
+ /*start_value=*/accumulator, vectorOperand,
fmf);
}
@@ -743,7 +743,7 @@ static Value lowerPredicatedReductionWithStartValue(
Value vectorLength =
createVectorLengthValue(rewriter, loc, vectorOperand.getType());
return LLVMVPRedIntrinOp::create(rewriter, loc, llvmType,
- /*startValue=*/accumulator, vectorOperand,
+ /*start_value=*/accumulator, vectorOperand,
mask, vectorLength);
}
diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index e08cc6f..d428fbf 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -1106,10 +1106,7 @@ static bool isUniformDefinition(Value value,
return false;
}
- if (!value.getType().isIntOrIndexOrFloat())
- return false;
-
- return true;
+ return value.getType().isIntOrIndexOrFloat();
}
/// Generates a broadcast op for the provided uniform value using the
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 81c3069..ec1571a 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -416,13 +416,39 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
if (ci.clusterSize >= 32) {
if (chipset.majorVersion <= 9) {
// Broadcast last value from each row to next row.
- // Use row mask to avoid polluting rows 1 and 3.
+ // Use row mask to avoid polluting row 0 (and row 2 if wave-64).
dpp = amdgpu::DPPOp::create(rewriter, loc, res.getType(), res, res,
amdgpu::DPPPerm::row_bcast_15,
rewriter.getUnitAttr(), 0xa, allBanks,
/*bound_ctrl*/ false);
res = vector::makeArithReduction(
rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+
+ // For subgroupSize = 64, at this point lanes [16, 32) contain the full
+ // reduction over lanes [0, 32), but lanes [0, 16) do not. Similarly,
+ // lanes [48, 64) contain the full reduction over lanes [32, 64), but
+ // lanes [32, 48) do not.
+ //
+ // If subgroup size is 64 and cluster size is 64, we don't need lanes [0,
+ // 16) and [32, 48) to have the correct cluster-32 reduction values at
+ // this point, because only lane 63's value will ultimately be read in
+ // this full-cluster case.
+ //
+ // If subgroup size is 64 and cluster size is 32, we need to ensure that
+ // lanes [0, 16) and [32, 48) have the correct final cluster-32 reduction
+ // values (subgroup_reduce guarantees that all lanes within each cluster
+ // contain the final reduction value). We do this by broadcasting lane
+ // 31's value to lanes [0, 16) and lanes 63's value to lanes [32, 48).
+ //
+ // See https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations
+ // for an illustration of how this within-cluster broadcast works with a
+ // swizzle.
+ if (ci.subgroupSize == 64 && ci.clusterSize == 32) {
+ res =
+ amdgpu::SwizzleBitModeOp::create(rewriter, loc, res, /*and_mask=*/0,
+ /*or_mask=*/31,
+ /*xor_mask=*/0);
+ }
} else if (chipset.majorVersion <= 12) {
// Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
Value uint32Max = arith::ConstantOp::create(
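The added swizzle only fires for the wave-64, cluster-32 case described in the comment above, i.e. reductions of roughly the following shape (reduction kind and element type are illustrative):

```mlir
// Each 32-lane cluster of the 64-lane subgroup must end up with its own full sum,
// so lanes [0, 16) and [32, 48) need the extra within-cluster broadcast.
%r = gpu.subgroup_reduce add %v cluster(size = 32) : (f32) -> f32
```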
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index f0de4db..a5ffb9e 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -896,6 +896,12 @@ std::pair<mlir::Type, unsigned> NVVM::inferMMAType(NVVM::MMATypes type,
} else if (type == NVVM::MMATypes::f32) {
elementType = builder.getF32Type();
numberElements = 8;
+ } else if (type == NVVM::MMATypes::f64) {
+ elementType = builder.getF64Type();
+ if (frag == NVVM::MMAFrag::a || frag == NVVM::MMAFrag::b)
+ numberElements = 1;
+ else
+ numberElements = 2;
} else if (type == NVVM::MMATypes::tf32) {
elementType = builder.getI32Type();
numberElements = 4;
@@ -954,6 +960,14 @@ LogicalResult NVVM::WMMALoadOp::verify() {
return emitOpError() << "invalid attribute combination";
std::pair<Type, unsigned> typeInfo = inferMMATypeFromMNK(
getEltype(), getFrag(), getM(), getN(), getK(), getContext());
+ // Special case for f64 fragments
+ Type f64Ty = Float64Type::get(getContext());
+ if (typeInfo.first == f64Ty && typeInfo.second == 1) {
+ if (getType() != f64Ty)
+ return emitOpError("expected destination type to be f64");
+ return success();
+ }
+ // Everything else is a struct
Type dstType = LLVM::LLVMStructType::getLiteral(
getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first));
if (getType() != dstType)
@@ -1608,9 +1622,52 @@ void Tcgen05MmaSmemDescOp::createSmemDescriptor(Operation &op,
}
//===----------------------------------------------------------------------===//
+// getPtx methods
+//===----------------------------------------------------------------------===//
+
+std::string NVVM::MBarrierInitOp::getPtx() {
+ unsigned addressSpace =
+ llvm::cast<LLVM::LLVMPointerType>(getAddr().getType()).getAddressSpace();
+ return (addressSpace == NVVMMemorySpace::Shared)
+ ? std::string("mbarrier.init.shared.b64 [%0], %1;")
+ : std::string("mbarrier.init.b64 [%0], %1;");
+}
+
+//===----------------------------------------------------------------------===//
// getIntrinsicID/getIntrinsicIDAndArgs methods
//===----------------------------------------------------------------------===//
+mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::MBarrierInitOp>(op);
+ unsigned addressSpace =
+ llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType())
+ .getAddressSpace();
+ llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared)
+ ? llvm::Intrinsic::nvvm_mbarrier_init_shared
+ : llvm::Intrinsic::nvvm_mbarrier_init;
+
+ // Fill the Intrinsic Args
+ llvm::SmallVector<llvm::Value *> args;
+ args.push_back(mt.lookupValue(thisOp.getAddr()));
+ args.push_back(mt.lookupValue(thisOp.getCount()));
+
+ return {id, std::move(args)};
+}
+
+mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::MBarrierInvalOp>(op);
+ unsigned addressSpace =
+ llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType())
+ .getAddressSpace();
+ llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared)
+ ? llvm::Intrinsic::nvvm_mbarrier_inval_shared
+ : llvm::Intrinsic::nvvm_mbarrier_inval;
+
+ return {id, {mt.lookupValue(thisOp.getAddr())}};
+}
+
#define CP_ASYNC_ID_IMPL(mod, size, suffix) \
llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix
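Because `inferMMAType` reports a single f64 element for the A/B fragments, the verifier now accepts a plain `f64` result instead of a one-element struct. A hedged sketch of such a load, assuming the usual `nvvm.wmma.load` assembly (operand names and pointer address space are illustrative):

```mlir
// 8x8x4 f64 WMMA A-fragment load; the result is a plain f64 rather than a struct.
%a = nvvm.wmma.load %ptr, %stride
  {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<a>,
   m = 8 : i32, n = 8 : i32, k = 4 : i32, layout = #nvvm.mma_layout<row>}
  : (!llvm.ptr) -> f64
```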
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index b09112b..3a43382 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1958,7 +1958,7 @@ enum class OuterOrInnerPerm { Outer = 0, Inner = 1 };
/// Return true if either `op` or `permutation` are empty to allow a simpler
/// polymorphic implementation.
template <typename RelayoutOpTy>
-bool isValidPackingPermutation(
+static bool isValidPackingPermutation(
RelayoutOpTy op, ArrayRef<int64_t> permutation,
OuterOrInnerPerm outerOrInnerPerm = OuterOrInnerPerm::Outer) {
static_assert(
@@ -4322,9 +4322,10 @@ DiagnosedSilenceableFailure transform::TransposeMatmulOp::applyToOne(
// InsertSliceToCopyOp
//===----------------------------------------------------------------------===//
template <typename OpTy>
-DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target,
- transform::ApplyToEachResultList &results,
- transform::TransformState &state) {
+static DiagnosedSilenceableFailure
+doit(RewriterBase &rewriter, OpTy target,
+ transform::ApplyToEachResultList &results,
+ transform::TransformState &state) {
static_assert(llvm::is_one_of<OpTy, tensor::InsertSliceOp,
tensor::ParallelInsertSliceOp>() &&
"wrong op type");
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
index 660c313..fbac28e 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
@@ -145,3 +145,13 @@ std::string mlir::acc::getRecipeName(mlir::acc::RecipeKind kind,
return recipeName;
}
+
+mlir::Value mlir::acc::getBaseEntity(mlir::Value val) {
+ if (auto partialEntityAccessOp =
+ dyn_cast<PartialEntityAccessOpInterface>(val.getDefiningOp())) {
+ if (!partialEntityAccessOp.isCompleteView())
+ return partialEntityAccessOp.getBaseEntity();
+ }
+
+ return val;
+}
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index ad8255a..ae3423c 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4336,7 +4336,7 @@ OpFoldResult ExtractStridedSliceOp::fold(FoldAdaptor adaptor) {
// ExtractStridedSliceOp(splat ConstantOp) -> ConstantOp.
if (auto splat =
llvm::dyn_cast_if_present<SplatElementsAttr>(adaptor.getSource()))
- DenseElementsAttr::get(getType(), splat.getSplatValue<Attribute>());
+ return DenseElementsAttr::get(getType(), splat.getSplatValue<Attribute>());
// ExtractStridedSliceOp(non-splat ConstantOp) -> ConstantOp.
return foldExtractStridedSliceNonSplatConstant(*this, adaptor.getSource());
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index f9aa28d5..83406c8 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -11,7 +11,6 @@
#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/DialectImplementation.h"
@@ -229,8 +228,10 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
}
if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) {
- return emitError()
- << "expected inst_data and lane_layout to have the same rank";
+ return emitError() << "expected inst_data and lane_layout to have the same "
+ "rank, got inst_data "
+ << inst_data.size() << ", lane_layout "
+ << lane_layout.size();
}
// sg_data is optional for Workgroup layout, but its presence requires
@@ -569,8 +570,8 @@ TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
// for gather and scatter ops, Low-precision types are packed in 32-bit units.
unsigned bitWidth = elementType.getIntOrFloatBitWidth();
int chunkAlignmentFactor =
- bitWidth < targetinfo::packedSizeInBitsForGatherScatter
- ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth
+ bitWidth < xegpu::uArch::generalPackedFormatBitSize
+ ? xegpu::uArch::generalPackedFormatBitSize / bitWidth
: 1;
auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding);
if (scatterAttr) {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 8fab255..90eae87 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -14,7 +14,6 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/IR/Attributes.h"
@@ -37,6 +36,8 @@
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
+
namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT
@@ -104,6 +105,8 @@ public:
SmallVector<int> getLaneData() const;
+ SmallVector<int> getInstData() const;
+
bool isSliceLayout() const {
if (!isAssigned())
return false;
@@ -137,6 +140,13 @@ SmallVector<int> LayoutInfo::getLaneData() const {
[](int64_t val) { return static_cast<int>(val); });
}
+SmallVector<int> LayoutInfo::getInstData() const {
+ if (!isAssigned())
+ return {};
+ return llvm::map_to_vector(storage.getEffectiveInstDataAsInt(),
+ [](int64_t val) { return static_cast<int>(val); });
+}
+
void LayoutInfo::print(raw_ostream &os) const {
if (isAssigned()) {
os << storage;
@@ -174,12 +184,14 @@ LayoutInfo LayoutInfo::transpose(ArrayRef<int64_t> permutation) const {
SmallVector<int32_t> laneLayout;
SmallVector<int32_t> laneData;
+ SmallVector<int32_t> instData;
for (int64_t idx : permutation) {
laneLayout.push_back(static_cast<int32_t>(getLaneLayout()[idx]));
laneData.push_back(static_cast<int32_t>(getLaneData()[idx]));
+ instData.push_back(static_cast<int32_t>(getInstData()[idx]));
}
- return LayoutInfo(
- xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData));
+ return LayoutInfo(xegpu::LayoutAttr::get(storage.getContext(), instData,
+ laneLayout, laneData));
}
//===----------------------------------------------------------------------===//
@@ -192,6 +204,28 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> {
using Lattice::Lattice;
};
+/// Helper Function to find a proper instruction multiple for the user-supplied
+/// sg-level data shape. `candidates` are uArch allowed shapes.
+/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count).
+template <typename T>
+int getLargestDivisor(T dim, ArrayRef<T> candidates,
+ ArrayRef<T> candidateMultiples = {}) {
+ static_assert(std::is_integral<T>::value, "T must be an integer type");
+ int largest = -1;
+ SmallVector<T> multiples = {1};
+ if (!candidateMultiples.empty())
+ multiples =
+ SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
+ for (T candidate : candidates) {
+ for (T multiple : multiples) {
+ int value = static_cast<int>(candidate * multiple);
+ if (value != 0 && dim % value == 0 && value > largest)
+ largest = value;
+ }
+ }
+ return largest;
+}
+
/// Helper Functions to get default layouts. A `default layout` is a layout that
/// is assigned to a value when the layout is not fixed by some anchor operation
/// (like DPAS).
@@ -200,18 +234,32 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> {
/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
- unsigned rank) {
+ unsigned rank,
+ const xegpu::uArch::uArch *uArch,
+ ArrayRef<int> instData) {
assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
if (rank == 1) {
return LayoutInfo(
- xegpu::LayoutAttr::get(ctx, {xegpu::targetinfo::subgroupSize}, {1}));
+ xegpu::LayoutAttr::get(ctx, instData, {uArch->getSubgroupSize()}, {1}));
}
return LayoutInfo(xegpu::LayoutAttr::get(
- ctx, {1, xegpu::targetinfo::subgroupSize}, {1, 1}));
+ ctx, instData, {1, uArch->getSubgroupSize()}, {1, 1}));
+}
+
+static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
+ unsigned rank, int subgroupSize) {
+ assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
+ if (rank == 1) {
+ return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1}));
+ }
+ return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1}));
}
/// Helper to get the default layout for a vector type.
static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
+ const xegpu::uArch::uArch *uArch,
+ ArrayRef<int> instData,
+ unsigned packingSize,
bool isScattered = false) {
// Expecting a 1D or 2D vector.
assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
@@ -221,28 +269,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
"Expected int or float element type.");
// If the rank is 1, then return default layout for 1D vector.
if (vectorTy.getRank() == 1)
- return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1);
+ return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch, instData);
// Packing factor is determined by the element type bitwidth.
- int packingFactor = 1;
unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
+ int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
if (isScattered) {
- packingFactor =
- bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
- ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
- : 1;
- return LayoutInfo(xegpu::LayoutAttr::get(
- vectorTy.getContext(), {xegpu::targetinfo::subgroupSize, 1},
- {1, packingFactor}));
+ return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData,
+ {uArch->getSubgroupSize(), 1},
+ {1, packingFactor}));
}
- if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault)
- packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth;
- return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(),
- {1, xegpu::targetinfo::subgroupSize},
+ return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData,
+ {1, uArch->getSubgroupSize()},
{1, packingFactor}));
}
/// Helper to get the default layout for a vector type.
static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
+ const xegpu::uArch::uArch *uArch,
+ ArrayRef<int> instData,
+ unsigned packingSize,
bool isScattered = false) {
// Expecting a 1D or 2D vector.
assert((tdescTy.getRank() == 1 || tdescTy.getRank() == 2) &&
@@ -252,27 +297,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
"Expected int or float element type.");
// If the rank is 1, then return default layout for 1D vector.
if (tdescTy.getRank() == 1)
- return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1);
+ return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch, instData);
// Packing factor is determined by the element type bitwidth.
unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth();
-
+ int subgroupSize = uArch->getSubgroupSize();
+ int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
if (isScattered) {
- int packingFactor =
- bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
- ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
- : 1;
return LayoutInfo(xegpu::LayoutAttr::get(
- tdescTy.getContext(), {xegpu::targetinfo::subgroupSize, 1},
- {1, packingFactor}));
+ tdescTy.getContext(), instData, {subgroupSize, 1}, {1, packingFactor}));
}
- int packingFactor =
- (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault)
- ? xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth
- : 1;
- return LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(),
- {1, xegpu::targetinfo::subgroupSize},
- {1, packingFactor}));
+ return LayoutInfo(xegpu::LayoutAttr::get(
+ tdescTy.getContext(), instData, {1, subgroupSize}, {1, packingFactor}));
}
/// Helper Function to get the expected layouts for DPAS operands. `lane_data`
@@ -281,25 +317,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
/// `packedSizeInBitsForDefault`
/// * For B operand, the data must be packed in minimum
/// `packedSizeInBitsForDpasB`
-static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy,
- unsigned operandNum) {
+static LayoutInfo
+getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum,
+ const xegpu::uArch::uArch *uArch,
+ ArrayRef<int> instData, unsigned packingSize) {
Type elementTy = vectorTy.getElementType();
assert(elementTy.isIntOrFloat() &&
"Expected int or float type in DPAS operands");
- SmallVector<int32_t, 2> layout({1, xegpu::targetinfo::subgroupSize});
+ SmallVector<int32_t, 2> layout({1, uArch->getSubgroupSize()});
// For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
// must have the VNNI format.
- if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() <
- xegpu::targetinfo::packedSizeInBitsForDpasB) {
+ if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < packingSize) {
SmallVector<int32_t, 2> data(
- {static_cast<int32_t>(xegpu::targetinfo::packedSizeInBitsForDpasB /
- elementTy.getIntOrFloatBitWidth()),
+ {static_cast<int32_t>(packingSize / elementTy.getIntOrFloatBitWidth()),
1});
return LayoutInfo(
- xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data));
+ xegpu::LayoutAttr::get(vectorTy.getContext(), instData, layout, data));
}
// Otherwise, return the default layout for the vector type.
- return getDefaultSIMTLayoutInfo(vectorTy);
+ return getDefaultSIMTLayoutInfo(vectorTy, uArch, instData, packingSize);
}
//===----------------------------------------------------------------------===//
@@ -456,7 +492,37 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
// Here we assign the default layout to the tensor descriptor operand of
// prefetch.
auto tdescTy = prefetch.getTensorDescType();
- auto prefetchLayout = getDefaultSIMTLayoutInfo(tdescTy);
+
+ auto uArch = getUArch(getChipStr(prefetch).value_or(""));
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
+ uArch->getInstruction(
+ xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));
+
+ auto blockWHC =
+ uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType());
+ if (!blockWHC)
+ prefetch.emitWarning("No known block params found for the element type.");
+ auto [bWidth, bHeight, bCount] = blockWHC.value();
+ SmallVector<int> instData;
+ int instWidth = getLargestDivisor(
+ static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
+ bCount);
+ if (instWidth == -1)
+ prefetch.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ if (tdescTy.getRank() == 1)
+ instData = {instWidth};
+ else {
+ int instHeight = getLargestDivisor(
+ static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight);
+ if (instHeight == -1)
+ prefetch.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ instData = {instHeight, instWidth};
+ }
+ auto prefetchLayout = getDefaultSIMTLayoutInfo(
+ tdescTy, uArch, instData, uArchInstruction->getPackedFormatBitSize());
// Propagate the layout to the source tensor descriptor.
propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
}
@@ -475,10 +541,11 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
reduction.emitWarning("Expecting output type to be 1D vector.");
return;
}
+ auto uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
// Given that the result is 1D, the layout of the operand should be 2D with
// default layout.
- LayoutInfo operandLayout =
- getDefaultSIMTLayoutInfo(reduction->getContext(), 2);
+ LayoutInfo operandLayout = getDefaultSIMTLayoutInfo(
+ reduction->getContext(), 2, uArch->getSubgroupSize());
propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
// Accumulator should have the same layout as the result.
propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
@@ -557,15 +624,53 @@ void LayoutInfoPropagation::visitDpasOp(
ArrayRef<const LayoutInfoLattice *> results) {
VectorType aTy = dpas.getLhsType();
VectorType bTy = dpas.getRhsType();
- propagateIfChanged(
- operands[0], operands[0]->meet(getSIMTLayoutInfoForDPASOperand(aTy, 0)));
- propagateIfChanged(
- operands[1], operands[1]->meet(getSIMTLayoutInfoForDPASOperand(bTy, 1)));
+
+ auto uArch = getUArch(getChipStr(dpas).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
+ xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
+
+ const unsigned dataALen = aTy.getShape().front();
+ auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
+ const int maxALen =
+ getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen));
+ if (maxALen == -1)
+ dpas.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+
+ const unsigned dataBLen = bTy.getShape().back();
+ auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType());
+ const int maxBLen =
+ getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen));
+ if (maxBLen == -1)
+ dpas.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ SmallVector<int> instDataA = {maxALen, subgroupSize};
+ SmallVector<int> instDataB = {subgroupSize, maxBLen};
+
+ propagateIfChanged(operands[0],
+ operands[0]->meet(getSIMTLayoutInfoForDPASOperand(
+ aTy, 0, uArch, instDataA,
+ uArchInstruction->getPackedFormatBitSizeA())));
+ propagateIfChanged(operands[1],
+ operands[1]->meet(getSIMTLayoutInfoForDPASOperand(
+ bTy, 1, uArch, instDataB,
+ uArchInstruction->getPackedFormatBitSizeB())));
if (operands.size() > 2) {
VectorType cTy = dpas.getAccType();
- propagateIfChanged(
- operands[2],
- operands[2]->meet(getSIMTLayoutInfoForDPASOperand(cTy, 2)));
+ const unsigned dataCLen = bTy.getShape().back();
+ auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType());
+ const int maxCLen =
+ getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen));
+ if (maxCLen == -1)
+ dpas.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ SmallVector<int> instDataC = {maxALen, maxCLen};
+ propagateIfChanged(operands[2],
+ operands[2]->meet(getSIMTLayoutInfoForDPASOperand(
+ cTy, 2, uArch, instDataC,
+ uArchInstruction->getPackedFormatBitSizeB())));
}
}
@@ -573,7 +678,38 @@ void LayoutInfoPropagation::visitDpasOp(
void LayoutInfoPropagation::visitStoreNdOp(
xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType());
+
+ auto uArch = getUArch(getChipStr(store).value_or(""));
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
+ uArch->getInstruction(
+ xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
+ VectorType dataTy = store.getValueType();
+ auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
+ store.getValueType().getElementType());
+ if (!blockWHC)
+ store.emitWarning("No known block params found for the element type.");
+ auto [bWidth, bHeight, bCount] = blockWHC.value();
+ SmallVector<int> instData;
+ int instWidth = getLargestDivisor(
+ static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
+ bCount);
+ if (instWidth == -1)
+ store.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ if (dataTy.getRank() == 1)
+ instData = {instWidth};
+ else {
+ int instHeight = getLargestDivisor(
+ static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight);
+ if (instHeight == -1)
+ store.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ instData = {instHeight, instWidth};
+ }
+ LayoutInfo storeLayout =
+ getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData,
+ uArchInstruction->getPackedFormatBitSize());
// Both operands should have the same layout
for (LayoutInfoLattice *operand : operands)
propagateIfChanged(operand, operand->meet(storeLayout));
@@ -694,10 +830,23 @@ void LayoutInfoPropagation::visitLoadGatherOp(
load.emitWarning("Not propagating, non-vector payload supplied.");
return;
}
- LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true);
+ auto uArch = getUArch(getChipStr(load).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+ SmallVector<int> instData{subgroupSize};
+ if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1)
+ instData.push_back(chunkSize);
+ else if (auto srcTdescTy =
+ dyn_cast<xegpu::TensorDescType>(load.getSourceType())) {
+ if (srcTdescTy.getChunkSizeAsInt() > 1)
+ instData.push_back(srcTdescTy.getChunkSizeAsInt());
+ }
+ LayoutInfo layout = getDefaultSIMTLayoutInfo(
+ payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(),
+ /*scattered*/ true);
// Mask operand should have 1D default layout.
- LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1);
+ LayoutInfo maskLayout =
+ getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize);
// Propagate the new layout to the tensor descriptor operand.
if (isa<xegpu::TensorDescType>(load.getSourceType()))
@@ -717,8 +866,10 @@ void LayoutInfoPropagation::visitCreateDescOp(
// Need the layout of the descriptor to propagate to the operands.
if (!descLayout.isAssigned())
return;
+ auto uArch = getUArch(getChipStr(createDesc).value_or(""));
// For offset operand propagate 1D default layout.
- LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1);
+ LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1,
+ uArch->getSubgroupSize());
propagateIfChanged(operands[1], operands[1]->meet(layout));
}
@@ -735,18 +886,30 @@ void LayoutInfoPropagation::visitStoreScatterOp(
storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
return;
}
+ auto uArch = getUArch(getChipStr(storeScatter).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+
auto payloadShape = payloadTy.getShape();
if (payloadShape.size() > 1)
assert(
- payloadShape[0] == xegpu::targetinfo::subgroupSize &&
+ payloadShape[0] == subgroupSize &&
"Expected the first dimension of 2D tensor descriptor to be equal to "
"subgroup size.");
- LayoutInfo payloadLayout =
- getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true);
+ SmallVector<int> instData{subgroupSize};
+ if (auto chunkSize = storeScatter.getChunkSize().value_or(0); chunkSize > 1)
+ instData.push_back(chunkSize);
+ else if (auto dstTdescTy =
+ dyn_cast<xegpu::TensorDescType>(storeScatter.getDestType())) {
+ if (dstTdescTy.getChunkSizeAsInt() > 1)
+ instData.push_back(dstTdescTy.getChunkSizeAsInt());
+ }
+ LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo(
+ payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(),
+ /*scattered=*/true);
LayoutInfo maskLayout =
- getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1);
+ getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize);
// Propagate the payload operand layout
propagateIfChanged(operands[0], operands[0]->meet(payloadLayout));
// Propagate the destination (if tdesc) operand layout
@@ -1023,9 +1186,13 @@ void XeGPUPropagateLayoutPass::runOnOperation() {
LayoutInfo layout = analysis.getLayoutInfo(val);
if (!layout.isAssigned())
return {};
+ xegpu::DistributeLayoutAttr layoutAttr =
+ cast<xegpu::DistributeLayoutAttr>(layout.get());
+ if (this->layoutKind == "lane")
+ layoutAttr = layoutAttr.dropInstData();
if (layout.isSliceLayout())
- return cast<xegpu::SliceAttr>(layout.get());
- return cast<xegpu::LayoutAttr>(layout.get());
+ return cast<xegpu::SliceAttr>(layoutAttr);
+ return cast<xegpu::LayoutAttr>(layoutAttr);
};
mlir::OpBuilder builder(&getContext());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index d09dc19..5a3b27e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -11,10 +11,10 @@
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
@@ -159,17 +159,18 @@ static bool requirePacked(const xegpu::LayoutAttr layout) {
/// Helper function to check if the layout requires a transpose effect.
static bool requireTranspose(const xegpu::LayoutAttr layout,
- const std::string &chipStr) {
+ const xegpu::uArch::uArch *uArch) {
// Return false for unsupported targets.
// TODO: Add more support or move to target info.
- if (chipStr != "pvc" && chipStr != "bmg")
+  if (!uArch->getName().equals_insensitive("pvc") &&
+      !uArch->getName().equals_insensitive("bmg"))
return false;
if (!layout)
return false;
auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
if (laneLayout.size() != 2)
return false;
- return laneLayout[0] == xegpu::targetinfo::subgroupSize && laneLayout[1] == 1;
+ return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
}
/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
@@ -199,6 +200,11 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
PatternRewriter &rewriter) const override {
+ auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
+ if (!uArch)
+ return rewriter.notifyMatchFailure(
+          gpuFuncOp, "Subgroup distribution requires a target attribute "
+                     "attached to set the warp size");
// If the function only contains a single void return, skip.
if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
@@ -230,7 +236,7 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
auto warpOp = gpu::WarpExecuteOnLane0Op::create(
rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
- xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(),
+ uArch->getSubgroupSize(), newGpuFunc.getArguments(),
newGpuFunc.getArgumentTypes());
Block &warpBodyBlock = warpOp.getBodyRegion().front();
// Replace the ReturnOp of the original gpu function with a YieldOp.
@@ -495,14 +501,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
warpOp, "warp result is not a xegpu::LoadNd op");
auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+ auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
+ if (!uArch)
+ return rewriter.notifyMatchFailure(
+          loadOp, "xegpu::LoadNdOp requires a target attribute attached to "
+                  "determine the transpose requirement");
// Chip information is required to decide if the layout requires transpose
// effect.
- auto chipStr = xegpu::getChipStr(loadOp);
- if (!chipStr)
- return rewriter.notifyMatchFailure(
- loadOp,
- "xegpu::LoadNdOp require chip information to determine transpose "
- "requirement");
// Expecting offsets to be present.
SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
if (offsets.empty())
@@ -556,7 +562,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
// Set the packed attribute if the layout requires it.
newLoadOp.setPacked(requirePacked(layout));
// Set the transpose attribute if the layout requires it.
- if (requireTranspose(layout, chipStr.value()))
+ if (requireTranspose(layout, uArch))
newLoadOp.setTranspose(
DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
Value distributedVal = newWarpOp.getResult(operandIdx);
diff --git a/mlir/lib/Transforms/ViewOpGraph.cpp b/mlir/lib/Transforms/ViewOpGraph.cpp
index 08cac1f..5790a77 100644
--- a/mlir/lib/Transforms/ViewOpGraph.cpp
+++ b/mlir/lib/Transforms/ViewOpGraph.cpp
@@ -158,7 +158,8 @@ private:
/// Emit a cluster (subgraph). The specified builder generates the body of the
/// cluster. Return the anchor node of the cluster.
- Node emitClusterStmt(function_ref<void()> builder, std::string label = "") {
+ Node emitClusterStmt(function_ref<void()> builder,
+ const std::string &label = "") {
int clusterId = ++counter;
os << "subgraph cluster_" << clusterId << " {\n";
os.indent();
@@ -269,7 +270,7 @@ private:
}
/// Emit a node statement.
- Node emitNodeStmt(std::string label, StringRef shape = kShapeNode,
+ Node emitNodeStmt(const std::string &label, StringRef shape = kShapeNode,
StringRef background = "") {
int nodeId = ++counter;
AttributeMap attrs;
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 5755ca9..8cce630 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -486,7 +486,7 @@ func.func @mbarrier() {
// CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
- // CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
+ // CHECK: nvvm.mbarrier.init %[[barPtr]]
nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType
// CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
@@ -516,7 +516,7 @@ func.func @mbarrier_nocomplete() {
// CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
- // CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
+ // CHECK: nvvm.mbarrier.init %[[barPtr]]
nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType
// CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
@@ -592,7 +592,7 @@ func.func @mbarrier_txcount() {
// CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
- // CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
+ // CHECK: nvvm.mbarrier.init %[[barPtr]]
nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType
%tidxreg = nvvm.read.ptx.sreg.tid.x : i32
@@ -643,7 +643,7 @@ func.func @mbarrier_txcount_pred() {
// CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
- // CHECK: nvvm.mbarrier.init.shared %[[barPtr]], {{.*}}, predicate = %[[P]]
+ // CHECK: nvvm.mbarrier.init %[[barPtr]], {{.*}}, predicate = %[[P]]
nvgpu.mbarrier.init %barrier[%c0], %mine, predicate = %pred : !barrierType
%txcount = arith.constant 256 : index
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 6960e83..fbc4c0a 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -8,7 +8,7 @@
// CHECK-LABEL: @init_mbarrier
llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %count : i32, %pred : i1) {
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.shared.b64 [$0], $1;", "r,r,b"
- nvvm.mbarrier.init.shared %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1
+ nvvm.mbarrier.init %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b"
nvvm.mbarrier.init %barrier_gen, %count, predicate = %pred : !llvm.ptr, i32, i1
llvm.return
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 87a31ca..1adc418 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -8,11 +8,11 @@
// RUN: mlir-opt --allow-unregistered-dialect \
// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \
-// RUN: | FileCheck %s --check-prefix=CHECK-GFX9
+// RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX9
// RUN: mlir-opt --allow-unregistered-dialect \
// RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx1030" %s \
-// RUN: | FileCheck %s --check-prefix=CHECK-GFX10
+// RUN: | FileCheck %s --check-prefixes=CHECK-GFX,CHECK-GFX10
// CHECK-SUB: gpu.module @kernels {
// CHECK-SHFL: gpu.module @kernels {
@@ -24,8 +24,7 @@ gpu.module @kernels {
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>)
//
// CHECK-SHFL-LABEL: gpu.func @kernel0(
- // CHECK-GFX9-LABEL: gpu.func @kernel0(
- // CHECK-GFX10-LABEL: gpu.func @kernel0(
+ // CHECK-GFX-LABEL: gpu.func @kernel0(
gpu.func @kernel0(%arg0: vector<5xf16>) kernel {
// CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
// CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
@@ -56,8 +55,7 @@ gpu.module @kernels {
// CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4)
// CHECK-SUB: "test.consume"
- // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}}
- // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}}
+ // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}}
%sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>)
"test.consume"(%sum2) : (vector<5xf16>) -> ()
@@ -74,8 +72,7 @@ gpu.module @kernels {
// CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>)
//
// CHECK-SHFL-LABEL: gpu.func @kernel1(
- // CHECK-GFX9-LABEL: gpu.func @kernel1(
- // CHECK-GFX10-LABEL: gpu.func @kernel1(
+ // CHECK-GFX-LABEL: gpu.func @kernel1(
gpu.func @kernel1(%arg0: vector<1xf32>) kernel {
// CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32>
// CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
@@ -100,17 +97,14 @@ gpu.module @kernels {
// Note stride is dropped because it is == 1.
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32
// CHECK-SUB: "test.consume"
- // CHECK-GFX9-COUNT-2: amdgpu.dpp {{.+}} quad_perm
- // CHECK-GFX9: amdgpu.dpp {{.+}} row_half_mirror
- // CHECK-GFX10-COUNT-2: amdgpu.dpp {{.+}} quad_perm
- // CHECK-GFX10: amdgpu.dpp {{.+}} row_half_mirror
+ // CHECK-GFX-COUNT-2: amdgpu.dpp {{.+}} quad_perm
+ // CHECK-GFX: amdgpu.dpp {{.+}} row_half_mirror
%sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum2) : (vector<1xf32>) -> ()
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32
// CHECK-SUB: "test.consume"
- // CHECK-GFX9-NOT: amdgpu.dpp
- // CHECK-GFX10-NOT: amdgpu.dpp
+ // CHECK-GFX-NOT: amdgpu.dpp
// CHECK-GFX10-NOT: rocdl.permlanex16
%sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>)
"test.consume"(%sum3) : (vector<1xf32>) -> ()
@@ -126,11 +120,8 @@ gpu.module @kernels {
//
// CHECK-SHFL-LABEL: gpu.func @kernel2(
//
- // CHECK-GFX9-LABEL: gpu.func @kernel2(
- // CHECK-GFX9-NOT: amdgpu.dpp
- //
- // CHECK-GFX10-LABEL: gpu.func @kernel2(
- // CHECK-GFX10-NOT: amdgpu.dpp
+ // CHECK-GFX-LABEL: gpu.func @kernel2(
+ // CHECK-GFX-NOT: amdgpu.dpp
gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel {
// CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8>
// CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> ()
@@ -148,8 +139,7 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel3(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
- // CHECK-GFX9-LABEL: gpu.func @kernel3(
- // CHECK-GFX10-LABEL: gpu.func @kernel3(
+ // CHECK-GFX-LABEL: gpu.func @kernel3(
gpu.func @kernel3(%arg0: i32) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -169,9 +159,9 @@ gpu.module @kernels {
// CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32
// CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32
// CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> ()
-
+
// CHECK-GFX9-COUNT-6: amdgpu.dpp
-
+
// CHECK-GFX10-COUNT-4: amdgpu.dpp
// CHECK-GFX10: rocdl.permlanex16
// CHECK-GFX10-COUNT-2: rocdl.readlane
@@ -185,11 +175,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel3_clustered(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
//
- // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered(
- // CHECK-GFX9-SAME: %[[ARG0:.+]]: i32)
- //
- // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered(
- // CHECK-GFX10-SAME: %[[ARG0:.+]]: i32)
+ // CHECK-GFX-LABEL: gpu.func @kernel3_clustered(
+ // CHECK-GFX-SAME: %[[ARG0:.+]]: i32)
gpu.func @kernel3_clustered(%arg0: i32) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -204,19 +191,13 @@ gpu.module @kernels {
// CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32
// CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> ()
- // CHECK-GFX9: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
- // CHECK-GFX9: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
- // CHECK-GFX9: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
- // CHECK-GFX9: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
- // CHECK-GFX9: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
- // CHECK-GFX9: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
-
- // CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
- // CHECK-GFX10: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
- // CHECK-GFX10: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
- // CHECK-GFX10: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
- // CHECK-GFX10: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
- // CHECK-GFX10: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
+ // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32
+ // CHECK-GFX: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32
+ // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32
+ // CHECK-GFX: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32
+ // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32
+ // CHECK-GFX: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32
+
// CHECK-GFX10: "test.consume"(%[[A2]]) : (i32) -> ()
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32
"test.consume"(%sum0) : (i32) -> ()
@@ -228,11 +209,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i32)
//
- // CHECK-GFX9-LABEL: gpu.func @kernel3_clustered_strided(
- // CHECK-GFX9-NOT: amdgpu.dpp
- //
- // CHECK-GFX10-LABEL: gpu.func @kernel3_clustered_strided(
- // CHECK-GFX10-NOT: amdgpu.dpp
+ // CHECK-GFX-LABEL: gpu.func @kernel3_clustered_strided(
+ // CHECK-GFX-NOT: amdgpu.dpp
gpu.func @kernel3_clustered_strided(%arg0: i32) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32
@@ -256,11 +234,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel4(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
//
- // CHECK-GFX9-LABEL: gpu.func @kernel4(
- // CHECK-GFX9-NOT: amdgpu.dpp
- //
- // CHECK-GFX10-LABEL: gpu.func @kernel4(
- // CHECK-GFX10-NOT: amdgpu.dpp
+ // CHECK-GFX-LABEL: gpu.func @kernel4(
+ // CHECK-GFX-NOT: amdgpu.dpp
gpu.func @kernel4(%arg0: vector<2xf16>) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -298,11 +273,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel4_clustered(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>)
//
- // CHECK-GFX9-LABEL: gpu.func @kernel4_clustered(
- // CHECK-GFX9-NOT: amdgpu.dpp
- //
- // CHECK-GFX10-LABEL: gpu.func @kernel4_clustered(
- // CHECK-GFX10-NOT: amdgpu.dpp
+ // CHECK-GFX-LABEL: gpu.func @kernel4_clustered(
+ // CHECK-GFX-NOT: amdgpu.dpp
gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel {
// CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32
// CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32
@@ -319,10 +291,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel5(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
//
- // CHECK-GFX9-LABEL: gpu.func @kernel5(
- //
- // CHECK-GFX10-LABEL: gpu.func @kernel5(
- // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16)
+ // CHECK-GFX-LABEL: gpu.func @kernel5(
+ // CHECK-GFX-SAME: %[[ARG0:.+]]: i16)
gpu.func @kernel5(%arg0: i16) kernel {
// CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
// CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -334,7 +304,7 @@ gpu.module @kernels {
// CHECK-SHFL: arith.trunci {{.+}} : i32 to i16
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
-
+
// CHECK-GFX9-COUNT-6: amdgpu.dpp
// CHECK-GFX10: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
@@ -361,11 +331,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel5_clustered(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: i16)
//
- // CHECK-GFX9-LABEL: gpu.func @kernel5_clustered
- // CHECK-GFX9-SAME: %[[ARG0:.+]]: i16)
- //
- // CHECK-GFX10-LABEL: gpu.func @kernel5_clustered
- // CHECK-GFX10-SAME: %[[ARG0:.+]]: i16)
+ // CHECK-GFX-LABEL: gpu.func @kernel5_clustered
+ // CHECK-GFX-SAME: %[[ARG0:.+]]: i16)
gpu.func @kernel5_clustered(%arg0: i16) kernel {
// CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32
// CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32
@@ -378,25 +345,15 @@ gpu.module @kernels {
// CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16
// CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> ()
- // CHECK-GFX9: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
- // CHECK-GFX9: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
- // CHECK-GFX9: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
- // CHECK-GFX9: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
- // CHECK-GFX9: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
- // CHECK-GFX9: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
- // CHECK-GFX9: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
- // CHECK-GFX9: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
- // CHECK-GFX9: "test.consume"(%[[VAR7]]) : (i16) -> ()
-
- // CHECK-GFX10: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
- // CHECK-GFX10: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
- // CHECK-GFX10: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
- // CHECK-GFX10: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
- // CHECK-GFX10: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
- // CHECK-GFX10: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
- // CHECK-GFX10: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
- // CHECK-GFX10: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
- // CHECK-GFX10: "test.consume"(%[[VAR7]]) : (i16) -> ()
+ // CHECK-GFX: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16
+ // CHECK-GFX: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16
+ // CHECK-GFX: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16
+ // CHECK-GFX: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16
+ // CHECK-GFX: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16
+ // CHECK-GFX: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16
+ // CHECK-GFX: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16
+ // CHECK-GFX: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16
+ // CHECK-GFX: "test.consume"(%[[VAR7]]) : (i16) -> ()
%sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16
"test.consume"(%sum0) : (i16) -> ()
@@ -407,11 +364,8 @@ gpu.module @kernels {
// CHECK-SHFL-LABEL: gpu.func @kernel6(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
//
- // CHECK-GFX9-LABEL: gpu.func @kernel6(
- // CHECK-GFX9-NOT: amdgpu.dpp
- //
- // CHECK-GFX10-LABEL: gpu.func @kernel6(
- // CHECK-GFX10-NOT: amdgpu.dpp
+ // CHECK-GFX-LABEL: gpu.func @kernel6(
+ // CHECK-GFX-NOT: amdgpu.dpp
gpu.func @kernel6(%arg0: vector<3xi8>) kernel {
// CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8>
// CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8>
@@ -433,6 +387,44 @@ gpu.module @kernels {
gpu.return
}
+ // CHECK-GFX-LABEL: gpu.func @kernel7(
+ // CHECK-GFX-SAME: %[[ARG0:.+]]: f32)
+ //
+  // Checks common to gfx942 and gfx1030:
+  // (1) quad_perm followed by an add, reducing over 2 consecutive lanes,
+  // (2) quad_perm followed by an add, reducing over 4 consecutive lanes,
+  // (3) row_half_mirror followed by an add, reducing over 8 consecutive lanes, and
+  // (4) row_mirror followed by an add, reducing over 16 consecutive lanes.
+ // CHECK-GFX: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : f32
+ // CHECK-GFX: %[[A0:.+]] = arith.addf %[[ARG0]], %[[D0]] : f32
+ // CHECK-GFX: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : f32
+ // CHECK-GFX: %[[A1:.+]] = arith.addf %[[A0]], %[[D1]] : f32
+ // CHECK-GFX: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : f32
+ // CHECK-GFX: %[[A2:.+]] = arith.addf %[[A1]], %[[D2]] : f32
+ // CHECK-GFX: %[[D3:.+]] = amdgpu.dpp %[[A2]] %[[A2]] row_mirror(unit) {bound_ctrl = true} : f32
+ // CHECK-GFX: %[[A3:.+]] = arith.addf %[[A2]], %[[D3]] : f32
+ //
+ // Now, on gfx942:
+  // (1) Lane 15 gets broadcast to lanes [16, 32) and lane 31 gets broadcast to lanes [48, 64), after which
+ // the reduction in lanes [16, 32) is over the full cluster of the first 32 lanes, and the reduction in lanes
+ // [48, 64) is over the full cluster of the last 32 lanes.
+ // (2) Update the reduction value in lanes [0, 16) and [32, 48) with the final reduction result from
+ // lanes [16, 32) and [48, 64), respectively.
+ // CHECK-GFX9: %[[BCAST15:.+]] = amdgpu.dpp %[[A3]] %[[A3]] row_bcast_15(unit) {row_mask = 10 : i32} : f32
+ // CHECK-GFX9: %[[SUM:.+]] = arith.addf %[[A3]], %[[BCAST15]] : f32
+ // CHECK-GFX9: %[[SWIZ:.+]] = amdgpu.swizzle_bitmode %[[SUM]] 0 31 0 : f32
+ // CHECK-GFX9: "test.consume"(%[[SWIZ]]) : (f32) -> ()
+ //
+  // On gfx1030, the final step is to permute the lanes and perform the final reduction:
+ // CHECK-GFX10: rocdl.permlanex16
+ // CHECK-GFX10: arith.addf
+ // CHECK-GFX10: "test.consume"
+ gpu.func @kernel7(%arg0: f32) kernel {
+ %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (f32) -> (f32)
+ "test.consume"(%sum0) : (f32) -> ()
+ gpu.return
+ }
+
// CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size(
// CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>)
//
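
The kernel7 comments above describe a lane-coverage doubling (2, 4, 8, 16, then the full 32-lane cluster). The actual lowering uses amdgpu.dpp permutes (quad_perm, row_half_mirror, row_mirror, row_bcast) rather than plain xor shuffles, but the doubling is the same idea as an xor butterfly; a small illustrative model, not the lowering itself:

#include <array>
#include <cstdio>

// Model only: after step k of an xor-butterfly, every lane holds the sum of
// its 2^k-lane cluster, matching the coverage progression checked in kernel7.
int main() {
  std::array<float, 32> lanes{};
  for (int i = 0; i < 32; ++i)
    lanes[i] = static_cast<float>(i);
  for (int step = 1; step < 32; step <<= 1) {
    std::array<float, 32> next = lanes;
    for (int lane = 0; lane < 32; ++lane)
      next[lane] = lanes[lane] + lanes[lane ^ step]; // combine with partner
    lanes = next;
  }
  std::printf("%f\n", lanes[0]); // 496 = 0 + 1 + ... + 31, in every lane
  return 0;
}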
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index 0243f5e..2505e56 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -419,8 +419,8 @@ llvm.func private @mbarrier_init_generic(%barrier: !llvm.ptr) {
llvm.func private @mbarrier_init_shared(%barrier: !llvm.ptr<3>) {
%count = nvvm.read.ptx.sreg.ntid.x : i32
- // CHECK: nvvm.mbarrier.init.shared %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32
- nvvm.mbarrier.init.shared %barrier, %count : !llvm.ptr<3>, i32
+ // CHECK: nvvm.mbarrier.init %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32
+ nvvm.mbarrier.init %barrier, %count : !llvm.ptr<3>, i32
llvm.return
}
@@ -433,8 +433,8 @@ llvm.func private @mbarrier_inval_generic(%barrier: !llvm.ptr) {
llvm.func private @mbarrier_inval_shared(%barrier: !llvm.ptr<3>) {
- // CHECK: nvvm.mbarrier.inval.shared %{{.*}} : !llvm.ptr<3>
- nvvm.mbarrier.inval.shared %barrier : !llvm.ptr<3>
+ // CHECK: nvvm.mbarrier.inval %{{.*}} : !llvm.ptr<3>
+ nvvm.mbarrier.inval %barrier : !llvm.ptr<3>
llvm.return
}
diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
index d289d73..2780212 100644
--- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
+++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s
gpu.module @test {
gpu.func @empty() {
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
new file mode 100644
index 0000000..58461b8
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -0,0 +1,128 @@
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func.func @dpas_f16(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+gpu.module @test {
+
+func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ return
+}
+}
+
+// -----
+gpu.module @test_kernel {
+ gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+ %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %c1024 = arith.constant 1024 : index
+ %block_id_x = gpu.block_id x
+ %block_id_y = gpu.block_id y
+ %m = arith.muli %block_id_x, %c32 : index
+
+ %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+ %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+ %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+
+ %out:3 = scf.for %k = %c0 to %c1024 step %c32
+ iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
+ -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
+ //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+ //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
+ %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
+
+ //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x32xf16>
+ %c = arith.addf %a, %b : vector<16x32xf16>
+
+      //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
+
+ //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
+ %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
+ %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
+ scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
+ : !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
+ }
+ gpu.return
+ }
+}
+
+// -----
+gpu.module @test_kernel {
+ gpu.func @elementwise_with_inst_data_12(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+ %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %c1024 = arith.constant 1024 : index
+ %block_id_x = gpu.block_id x
+ %block_id_y = gpu.block_id y
+ %m = arith.muli %block_id_x, %c32 : index
+
+ %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
+ %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
+ %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
+
+ %out:3 = scf.for %k = %c0 to %c1024 step %c32
+ iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
+ -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) {
+ //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+ //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<12x32xf16>
+ %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
+ %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
+
+ //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<12x32xf16>
+ %c = arith.addf %a, %b : vector<12x32xf16>
+
+      //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.store_nd %c, %arg2: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16>
+
+ //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
+ %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
+ %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
+ scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
+ : !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>
+ }
+ gpu.return
+ }
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @scatter_ops_chunksize(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
+// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}>
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8], lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
+ %1 = arith.constant dense<1>: vector<16xi1>
+ %offset = arith.constant dense<12> : vector<16xindex>
+ %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
+ : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+ xegpu.store %3, %src[%offset], %1 <{chunk_size=8}>
+ : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+ return
+}
+}
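
In the new tests above, 16x32 tiles get inst_data [8, 16] while 12x32 tiles get [4, 16]. A plausible reading, stated here as an assumption rather than something this patch confirms, is that the pass picks the largest supported instruction height that evenly divides the tile height (16 -> 8, 12 -> 4). A tiny sketch of that heuristic with hypothetical candidate heights:

#include <initializer_list>

// Assumption-labeled sketch (not taken from the pass): largest candidate
// instruction height that evenly divides the tile height.
static int pickInstHeight(int tileHeight) {
  for (int h : {8, 4, 2, 1}) // hypothetical candidates
    if (tileHeight % h == 0)
      return h;
  return 1;
}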
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 30f785d..543e119 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt -xegpu-propagate-layout -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout -split-input-file %s | FileCheck %s
+gpu.module @test {
// CHECK-LABEL: func.func @dpas_f16(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
@@ -25,8 +26,10 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me
xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @dpas_i8(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) {
// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16],
@@ -37,8 +40,10 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre
xegpu.store_nd %0, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
return
}
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @load_with_transpose_effect(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
@@ -55,8 +60,10 @@ func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x
xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @vector_transpose(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16> to vector<16x16xf16>
@@ -73,8 +80,10 @@ func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %
xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @extf_truncf(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, %[[ARG1:[0-9a-zA-Z]+]]:
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>) -> vector<8x16xf32> {
@@ -88,8 +97,10 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor
%4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
return %4 : vector<8x16xf32>
}
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @load_gather_with_chunksize(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
@@ -113,8 +124,10 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256
xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @load_gather_1d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
@@ -132,8 +145,9 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf
xegpu.store_nd %1, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @store_scatter_with_chunksize(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) {
// CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> ->
@@ -148,8 +162,9 @@ func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) {
xegpu.store %cst, %0, %cst_0 : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @store_scatter_1d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
// CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>,
@@ -161,8 +176,9 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
xegpu.store %arg0, %0, %cst_0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @scatter_ops_chunksize(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
@@ -179,8 +195,9 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
: vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @scatter_ops(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
@@ -195,8 +212,9 @@ func.func @scatter_ops(%src: memref<256xf16>) {
xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16>
@@ -219,8 +237,9 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1
xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i32_to_f16(
// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
@@ -239,8 +258,9 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8
xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_i32(
// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
// CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16>
@@ -255,8 +275,9 @@ func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16
xegpu.store_nd %3, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle(
// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32>
// CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -270,9 +291,10 @@ func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %a
xegpu.store_nd %3, %1 : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @binary_op_one_use(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
@@ -291,8 +313,9 @@ func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.
xegpu.store_nd %4, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @binary_op_multiple_uses(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
@@ -312,8 +335,9 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !
xegpu.store_nd %2, %arg3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @for_op(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -353,8 +377,9 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me
xegpu.store_nd %2#2, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @if_single_use(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
@@ -381,8 +406,9 @@ func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tens
xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @if_multiple_uses(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
@@ -411,8 +437,9 @@ func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
xegpu.store_nd %1, %arg4 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @vector_outer_reduction(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
// CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [0] : vector<16x16xf32> to vector<16xf32>
@@ -422,8 +449,9 @@ func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor
xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @vector_inner_reduction(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
// CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [1] : vector<16x16xf32> to vector<16xf32>
@@ -433,8 +461,9 @@ func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor
xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @update_nd_offset_1d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
@@ -448,8 +477,9 @@ func.func @update_nd_offset_1d(%arg0: memref<256xf32>){
xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @update_nd_offset_2d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -463,8 +493,9 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @prefetch_2d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -475,8 +506,9 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){
xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @prefetch_1d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
@@ -487,8 +519,9 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){
xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @scf_while_and_condition(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
// CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>)
@@ -520,8 +553,9 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32
}
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
@@ -541,8 +575,9 @@ func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc
xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
-
+}
// -----
+gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
@@ -563,3 +598,4 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc
xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index 09b8f59..42aa221 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -621,3 +621,14 @@ func.func @invalid_range_equal_bounds() {
%0 = nvvm.read.ptx.sreg.warpsize range <i32, 32, 32> : i32
return
}
+
+// -----
+
+// Test that the destination type of an f64 wmma.load of fragment a is verified.
+llvm.func @nvvm_wmma_load_a_f64(%arg0: !llvm.ptr, %arg1 : i32) {
+ // expected-error @below {{'nvvm.wmma.load' op expected destination type to be f64}}
+ %0 = nvvm.wmma.load %arg0, %arg1
+ {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<a>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32}
+ : (!llvm.ptr) -> !llvm.struct<(f64)>
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 594ae48..9115de6 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -463,6 +463,43 @@ llvm.func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 :
llvm.return
}
+// CHECK-LABEL: @nvvm_wmma_load_a_f64
+llvm.func @nvvm_wmma_load_a_f64(%arg0: !llvm.ptr, %arg1 : i32) {
+ // CHECK: call double @llvm.nvvm.wmma.m8n8k4.load.a.row.stride.f64.p0(ptr %{{.*}}, i32 %{{.*}})
+ %0 = nvvm.wmma.load %arg0, %arg1
+ {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<a>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32}
+ : (!llvm.ptr) -> f64
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_wmma_load_c_f64
+llvm.func @nvvm_wmma_load_c_f64(%arg0: !llvm.ptr, %arg1 : i32) {
+ // CHECK: call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.row.stride.f64.p0(ptr %{{.*}}, i32 %{{.*}})
+ %0 = nvvm.wmma.load %arg0, %arg1
+ {eltype = #nvvm.mma_type<f64>, frag = #nvvm.mma_frag<c>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32}
+ : (!llvm.ptr) -> !llvm.struct<(f64, f64)>
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_wmma_mma_f64
+llvm.func @nvvm_wmma_mma_f64(%0 : f64, %1 : f64, %2 : f64, %3 : f64) {
+ // CHECK: { double, double } @llvm.nvvm.wmma.m8n8k4.mma.row.col.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}})
+ %r = nvvm.wmma.mma %0, %1, %2, %3
+ {eltypeA = #nvvm.mma_type<f64>, eltypeB = #nvvm.mma_type<f64>, k = 4 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, m = 8 : i32, n = 8 : i32}
+ : (f64, f64, f64, f64)
+ -> !llvm.struct<(f64, f64)>
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_wmma_store_d_f64
+llvm.func @nvvm_wmma_store_d_f64(%arg0: !llvm.ptr, %arg1 : i32, %arg2 : f64, %arg3 : f64) {
+ // CHECK: call void @llvm.nvvm.wmma.m8n8k4.store.d.row.stride.f64.p0(ptr %{{.*}}, double %{{.*}}, double %{{.*}}, i32 %{{.*}})
+ nvvm.wmma.store %arg0, %arg1, %arg2, %arg3
+ {eltype = #nvvm.mma_type<f64>, k = 4 : i32, layout = #nvvm.mma_layout<row>, m = 8 : i32, n = 8 : i32}
+ : !llvm.ptr, f64, f64
+ llvm.return
+}
+
// CHECK-LABEL: @cp_async
llvm.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) {
// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
diff --git a/mlir/test/mlir-runner/memref-reshape.mlir b/mlir/test/mlir-runner/memref-reshape.mlir
index 8c17f1fd..b264e02 100644
--- a/mlir/test/mlir-runner/memref-reshape.mlir
+++ b/mlir/test/mlir-runner/memref-reshape.mlir
@@ -65,8 +65,8 @@ func.func @reshape_ranked_memref_to_ranked(%input : memref<2x3xf32>,
func.func @reshape_unranked_memref_to_ranked(%input : memref<2x3xf32>,
%shape : memref<2xindex>) {
%unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32>
- %output = memref.reshape %input(%shape)
- : (memref<2x3xf32>, memref<2xindex>) -> memref<?x?xf32>
+ %output = memref.reshape %unranked_input(%shape)
+ : (memref<*xf32>, memref<2xindex>) -> memref<?x?xf32>
%unranked_output = memref.cast %output : memref<?x?xf32> to memref<*xf32>
call @printMemrefF32(%unranked_output) : (memref<*xf32>) -> ()
@@ -95,8 +95,8 @@ func.func @reshape_unranked_memref_to_unranked(%input : memref<2x3xf32>,
%shape : memref<2xindex>) {
%unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32>
%dyn_size_shape = memref.cast %shape : memref<2xindex> to memref<?xindex>
- %output = memref.reshape %input(%dyn_size_shape)
- : (memref<2x3xf32>, memref<?xindex>) -> memref<*xf32>
+ %output = memref.reshape %unranked_input(%dyn_size_shape)
+ : (memref<*xf32>, memref<?xindex>) -> memref<*xf32>
call @printMemrefF32(%output) : (memref<*xf32>) -> ()
// CHECK: rank = 2 offset = 0 sizes = [3, 2] strides = [2, 1] data =
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp
index f1fe53c..6f4e305 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp
@@ -570,3 +570,107 @@ TEST_F(OpenACCUtilsTest, getRecipeNamePrivateUnrankedMemref) {
getRecipeName(RecipeKind::private_recipe, unrankedMemrefTy);
EXPECT_EQ(recipeName, "privatization_memref_Zxi32_");
}
+
+//===----------------------------------------------------------------------===//
+// getBaseEntity Tests
+//===----------------------------------------------------------------------===//
+
+// Local implementation of PartialEntityAccessOpInterface for memref.subview.
+// This is implemented locally in the test rather than officially because memref
+// operations already have ViewLikeOpInterface, which serves a similar purpose
+// for walking through views to the base entity. This test demonstrates how
+// getBaseEntity() would work if the interface were attached to memref.subview.
+namespace {
+struct SubViewOpPartialEntityAccessOpInterface
+ : public acc::PartialEntityAccessOpInterface::ExternalModel<
+ SubViewOpPartialEntityAccessOpInterface, memref::SubViewOp> {
+ Value getBaseEntity(Operation *op) const {
+ auto subviewOp = cast<memref::SubViewOp>(op);
+ return subviewOp.getSource();
+ }
+
+ bool isCompleteView(Operation *op) const {
+    // For testing purposes, treat every subview as a partial view and return
+    // false. A real implementation would check the offsets, sizes, and strides.
+ return false;
+ }
+};
+} // namespace
+
+TEST_F(OpenACCUtilsTest, getBaseEntityFromSubview) {
+ // Register the local interface implementation for memref.subview
+ memref::SubViewOp::attachInterface<SubViewOpPartialEntityAccessOpInterface>(
+ context);
+
+ // Create a base memref
+ auto memrefTy = MemRefType::get({10, 20}, b.getF32Type());
+ OwningOpRef<memref::AllocaOp> allocOp =
+ memref::AllocaOp::create(b, loc, memrefTy);
+ Value baseMemref = allocOp->getResult();
+
+ // Create a subview of the base memref with non-zero offsets
+ // This creates a 5x10 view starting at [2, 3] in the original 10x20 memref
+ SmallVector<OpFoldResult> offsets = {b.getIndexAttr(2), b.getIndexAttr(3)};
+ SmallVector<OpFoldResult> sizes = {b.getIndexAttr(5), b.getIndexAttr(10)};
+ SmallVector<OpFoldResult> strides = {b.getIndexAttr(1), b.getIndexAttr(1)};
+
+ OwningOpRef<memref::SubViewOp> subviewOp =
+ memref::SubViewOp::create(b, loc, baseMemref, offsets, sizes, strides);
+ Value subview = subviewOp->getResult();
+
+ // Test that getBaseEntity returns the base memref, not the subview
+ Value baseEntity = getBaseEntity(subview);
+ EXPECT_EQ(baseEntity, baseMemref);
+}
+
+TEST_F(OpenACCUtilsTest, getBaseEntityNoInterface) {
+  // Create a memref whose defining op does not implement the interface
+ auto memrefTy = MemRefType::get({10}, b.getI32Type());
+ OwningOpRef<memref::AllocaOp> allocOp =
+ memref::AllocaOp::create(b, loc, memrefTy);
+ Value varPtr = allocOp->getResult();
+
+ // Test that getBaseEntity returns the value itself when there's no interface
+ Value baseEntity = getBaseEntity(varPtr);
+ EXPECT_EQ(baseEntity, varPtr);
+}
+
+TEST_F(OpenACCUtilsTest, getBaseEntityChainedSubviews) {
+ // Register the local interface implementation for memref.subview
+ memref::SubViewOp::attachInterface<SubViewOpPartialEntityAccessOpInterface>(
+ context);
+
+ // Create a base memref
+ auto memrefTy = MemRefType::get({100, 200}, b.getI64Type());
+ OwningOpRef<memref::AllocaOp> allocOp =
+ memref::AllocaOp::create(b, loc, memrefTy);
+ Value baseMemref = allocOp->getResult();
+
+ // Create first subview
+ SmallVector<OpFoldResult> offsets1 = {b.getIndexAttr(10), b.getIndexAttr(20)};
+ SmallVector<OpFoldResult> sizes1 = {b.getIndexAttr(50), b.getIndexAttr(80)};
+ SmallVector<OpFoldResult> strides1 = {b.getIndexAttr(1), b.getIndexAttr(1)};
+
+ OwningOpRef<memref::SubViewOp> subview1Op =
+ memref::SubViewOp::create(b, loc, baseMemref, offsets1, sizes1, strides1);
+ Value subview1 = subview1Op->getResult();
+
+ // Create second subview (subview of subview)
+ SmallVector<OpFoldResult> offsets2 = {b.getIndexAttr(5), b.getIndexAttr(10)};
+ SmallVector<OpFoldResult> sizes2 = {b.getIndexAttr(20), b.getIndexAttr(30)};
+ SmallVector<OpFoldResult> strides2 = {b.getIndexAttr(1), b.getIndexAttr(1)};
+
+ OwningOpRef<memref::SubViewOp> subview2Op =
+ memref::SubViewOp::create(b, loc, subview1, offsets2, sizes2, strides2);
+ Value subview2 = subview2Op->getResult();
+
+ // Test that getBaseEntity on the nested subview returns the first subview
+ // (since our implementation returns the immediate source, not the ultimate
+ // base)
+ Value baseEntity = getBaseEntity(subview2);
+ EXPECT_EQ(baseEntity, subview1);
+
+ // Test that calling getBaseEntity again returns the original base
+ Value ultimateBase = getBaseEntity(baseEntity);
+ EXPECT_EQ(ultimateBase, baseMemref);
+}
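
A minimal sketch of how a caller could recover the ultimate base from a chain of
partial views, given that getBaseEntity() (as exercised in the chained-subview
test above) strips only one level per call: iterate it to a fixed point.
getUltimateBaseEntity is a hypothetical helper named here for illustration; it
assumes the same unqualified getBaseEntity(Value) visible in the test file and
is not part of this change.

  // Hypothetical helper (illustration only): keep calling getBaseEntity()
  // until it returns its input unchanged, which in the tests above happens
  // once the underlying memref.alloca result is reached.
  static mlir::Value getUltimateBaseEntity(mlir::Value value) {
    mlir::Value base = getBaseEntity(value);
    while (base != value) {
      value = base;
      base = getBaseEntity(value);
    }
    return base;
  }

Applied to the chained-subview test, this would return baseMemref directly from
subview2 instead of requiring two explicit calls.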