aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKrzysztof Drewniak <Krzysztof.Drewniak@amd.com>2024-02-27 12:35:48 -0600
committerGitHub <noreply@github.com>2024-02-27 12:35:48 -0600
commit563f414e049dc06dcb955f565fcff3c663982ee4 (patch)
treec49554aaf4750cef03d65bd93a4f19c7b2b82c98
parent7b11e2ec39ae01f53d53250551e207583bd51e80 (diff)
downloadllvm-563f414e049dc06dcb955f565fcff3c663982ee4.zip
llvm-563f414e049dc06dcb955f565fcff3c663982ee4.tar.gz
llvm-563f414e049dc06dcb955f565fcff3c663982ee4.tar.bz2
[mlir][AMDGPU] Set uniform-work-group-size=true by default (#79077)
GPU kernels generated via typical MLIR mechanisms make the assumption that all workgroups are of uniform size, and so, as in OpenMP, it is appropriate to set the "uniform-work-group-size"="true" attribute on these functions by default. This commit makes that choice. In the event it is needed, this commit adds `rocdl.uniform_work_group_size` as an attribute to be set on LLVM functions that can be used to override the default. In addition, this commit adds proper failure messages to the translation.
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td16
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp45
-rw-r--r--mlir/test/Target/LLVMIR/rocdl.mlir9
3 files changed, 61 insertions, 9 deletions
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 6b170c8..53e9f2d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -28,6 +28,22 @@ def ROCDL_Dialect : Dialect {
let hasOperationAttrVerify = 1;
let extraClassDeclaration = [{
+ /// Get the name of the attribute used to annotate external kernel
+ /// functions.
+ static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
+ static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() {
+ return ::llvm::StringLiteral("rocdl.flat_work_group_size");
+ }
+ static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
+ return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
+ }
+ /// MLIR's gpu-related infrastructure effectively assumes uniform workgroup
+ /// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions.
+ /// It is provided here to allow overriding this assumption.
+ static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() {
+ return ::llvm::StringLiteral("rocdl.uniform_work_group_size");
+ }
+
/// The address space value that represents global memory.
static constexpr unsigned kGlobalMemoryAddressSpace = 1;
/// The address space value that represents shared memory.
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index 93eb456..94423b3 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -88,7 +88,9 @@ public:
if (dialect->getKernelAttrHelper().getName() == attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " is only supported on `llvm.func` operations");
+ ;
// For GPU kernels,
// 1. Insert AMDGPU_KERNEL calling convention.
@@ -100,6 +102,13 @@ public:
if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
}
+
+ // MLIR's GPU kernel APIs all assume and produce uniformly-sized
+ // workgroups, so the lowering of the `rocdl.kernel` marker encodes this
+ // assumption. This assumption may be overridden by setting
+ // `rocdl.uniform_work_group_size` on a given function.
+ if (!llvmFunc->hasFnAttribute("uniform-work-group-size"))
+ llvmFunc->addFnAttr("uniform-work-group-size", "true");
}
// Override flat-work-group-size
// TODO: update clients to rocdl.flat_work_group_size instead,
@@ -108,10 +117,12 @@ public:
attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " is only supported on `llvm.func` operations");
auto value = dyn_cast<IntegerAttr>(attribute.getValue());
if (!value)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " must be an integer");
llvm::Function *llvmFunc =
moduleTranslation.lookupFunction(func.getName());
@@ -124,10 +135,12 @@ public:
attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " is only supported on `llvm.func` operations");
auto value = dyn_cast<StringAttr>(attribute.getValue());
if (!value)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " must be a string");
llvm::Function *llvmFunc =
moduleTranslation.lookupFunction(func.getName());
@@ -135,16 +148,32 @@ public:
llvmAttrValue.append(value.getValue());
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
}
-
+ if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() ==
+ attribute.getName()) {
+ auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+ if (!func)
+ return op->emitOpError(Twine(attribute.getName()) +
+ " is only supported on `llvm.func` operations");
+ auto value = dyn_cast<BoolAttr>(attribute.getValue());
+ if (!value)
+ return op->emitOpError(Twine(attribute.getName()) +
+ " must be a boolean");
+ llvm::Function *llvmFunc =
+ moduleTranslation.lookupFunction(func.getName());
+ llvmFunc->addFnAttr("uniform-work-group-size",
+ value.getValue() ? "true" : "false");
+ }
// Set reqd_work_group_size metadata
if (dialect->getReqdWorkGroupSizeAttrHelper().getName() ==
attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " is only supported on `llvm.func` operations");
auto value = dyn_cast<DenseI32ArrayAttr>(attribute.getValue());
if (!value)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " must be a dense i32 array attribute");
llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
SmallVector<llvm::Metadata *, 3> metadata;
llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 06b7865..3ea6292 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -56,6 +56,12 @@ llvm.func @known_block_sizes()
llvm.return
}
+llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} {
+ // CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
+ // CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
+ llvm.return
+}
+
llvm.func @rocdl.lane_id() -> i32 {
// CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
// CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
@@ -505,8 +511,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
llvm.return %source5 : i32
}
-// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" }
+// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
+// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" }
// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}