diff options
author | Krzysztof Drewniak <Krzysztof.Drewniak@amd.com> | 2024-02-27 12:35:48 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-27 12:35:48 -0600 |
commit | 563f414e049dc06dcb955f565fcff3c663982ee4 (patch) | |
tree | c49554aaf4750cef03d65bd93a4f19c7b2b82c98 | |
parent | 7b11e2ec39ae01f53d53250551e207583bd51e80 (diff) | |
download | llvm-563f414e049dc06dcb955f565fcff3c663982ee4.zip llvm-563f414e049dc06dcb955f565fcff3c663982ee4.tar.gz llvm-563f414e049dc06dcb955f565fcff3c663982ee4.tar.bz2 |
[mlir][AMDGPU] Set uniform-work-group-size=true by default (#79077)
GPU kernels generated via typical MLIR mechanisms make the assumption
that all workgroups are of uniform size, and so, as in OpenMP, it is
appropriate to set the "uniform-work-group-size"="true" attribute on
these functions by default. This commit makes that choice.
In the event it is needed, this commit also adds
`rocdl.uniform_work_group_size`, an attribute that can be set on LLVM
functions to override the default.
In addition, this commit adds proper failure messages to the translation of ROCDL attributes to LLVM IR, replacing silent failures.
-rw-r--r-- | mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 16 | ||||
-rw-r--r-- | mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp | 45 | ||||
-rw-r--r-- | mlir/test/Target/LLVMIR/rocdl.mlir | 9 |
3 files changed, 61 insertions, 9 deletions
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 6b170c8..53e9f2d 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -28,6 +28,22 @@ def ROCDL_Dialect : Dialect { let hasOperationAttrVerify = 1; let extraClassDeclaration = [{ + /// Get the name of the attribute used to annotate external kernel + /// functions. + static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; } + static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() { + return ::llvm::StringLiteral("rocdl.flat_work_group_size"); + } + static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() { + return ::llvm::StringLiteral("rocdl.reqd_work_group_size"); + } + /// MLIR's gpu-related infrastructure effectively assume uniform workgroup + /// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions. + /// It is provided here to allow overriding this assumption. + static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() { + return ::llvm::StringLiteral("rocdl.uniform_work_group_size"); + } + /// The address space value that represents global memory. static constexpr unsigned kGlobalMemoryAddressSpace = 1; /// The address space value that represents shared memory. diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp index 93eb456..94423b3 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp @@ -88,7 +88,9 @@ public: if (dialect->getKernelAttrHelper().getName() == attribute.getName()) { auto func = dyn_cast<LLVM::LLVMFuncOp>(op); if (!func) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); + ; // For GPU kernels, // 1. Insert AMDGPU_KERNEL calling convention. 
@@ -100,6 +102,13 @@ public: if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) { llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256"); } + + // MLIR's GPU kernel APIs all assume and produce uniformly-sized + // workgroups, so the lowering of the `rocdl.kernel` marker encodes this + // assumption. This assumption may be overridden by setting + // `rocdl.uniform_work_group_size` on a given function. + if (!llvmFunc->hasFnAttribute("uniform-work-group-size")) + llvmFunc->addFnAttr("uniform-work-group-size", "true"); } // Override flat-work-group-size // TODO: update clients to rocdl.flat_work_group_size instead, @@ -108,10 +117,12 @@ public: attribute.getName()) { auto func = dyn_cast<LLVM::LLVMFuncOp>(op); if (!func) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); auto value = dyn_cast<IntegerAttr>(attribute.getValue()); if (!value) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " must be an integer"); llvm::Function *llvmFunc = moduleTranslation.lookupFunction(func.getName()); @@ -124,10 +135,12 @@ public: attribute.getName()) { auto func = dyn_cast<LLVM::LLVMFuncOp>(op); if (!func) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); auto value = dyn_cast<StringAttr>(attribute.getValue()); if (!value) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " must be a string"); llvm::Function *llvmFunc = moduleTranslation.lookupFunction(func.getName()); @@ -135,16 +148,32 @@ public: llvmAttrValue.append(value.getValue()); llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue); } - + if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() == + attribute.getName()) { + auto func = dyn_cast<LLVM::LLVMFuncOp>(op); + if (!func) + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); + auto value 
= dyn_cast<BoolAttr>(attribute.getValue()); + if (!value) + return op->emitOpError(Twine(attribute.getName()) + + " must be a boolean"); + llvm::Function *llvmFunc = + moduleTranslation.lookupFunction(func.getName()); + llvmFunc->addFnAttr("uniform-work-group-size", + value.getValue() ? "true" : "false"); + } // Set reqd_work_group_size metadata if (dialect->getReqdWorkGroupSizeAttrHelper().getName() == attribute.getName()) { auto func = dyn_cast<LLVM::LLVMFuncOp>(op); if (!func) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); auto value = dyn_cast<DenseI32ArrayAttr>(attribute.getValue()); if (!value) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " must be a dense i32 array attribute"); llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext(); SmallVector<llvm::Metadata *, 3> metadata; llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32); diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 06b7865..3ea6292 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -56,6 +56,12 @@ llvm.func @known_block_sizes() llvm.return } +llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} { + // CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups() + // CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]] + llvm.return +} + llvm.func @rocdl.lane_id() -> i32 { // CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) // CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]]) @@ -505,8 +511,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 { llvm.return %source5 : i32 } -// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" } +// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" 
"uniform-work-group-size"="true" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128" +// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" } // CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64} // CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2} |