aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGuray Ozen <guray.ozen@gmail.com>2024-01-08 14:49:19 +0100
committerGitHub <noreply@github.com>2024-01-08 14:49:19 +0100
commit763109e346b90193027b24743e266495d992b1c6 (patch)
treec13328a9e881252109f20383a938b7504f03c0d6
parent2edce427a8b17d1d2192c1ee4a2227b6eb2971a0 (diff)
downloadllvm-763109e346b90193027b24743e266495d992b1c6.zip
llvm-763109e346b90193027b24743e266495d992b1c6.tar.gz
llvm-763109e346b90193027b24743e266495d992b1c6.tar.bz2
[mlir][gpu] Use `known_block_size` to set `maxntid` for NVVM target (#77301)
Setting the thread block size with `maxntid` on the kernel has great performance benefits. In this way, the downstream PTX compiler can do better register allocation. MLIR's `gpu.launch` and `gpu.launch_func` already have an attribute (`known_block_size`) that keeps the thread block size when it is known. This PR simply uses this attribute to set `maxntid`.
-rw-r--r--mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp20
-rw-r--r--mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h13
-rw-r--r--mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp4
-rw-r--r--mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir9
4 files changed, 40 insertions, 6 deletions
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 6a005e6..eeb8fbb 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -85,8 +85,26 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
// Add a dialect specific kernel attribute in addition to GPU kernel
// attribute. The former is necessary for further translation while the
// latter is expected by gpu.launch_func.
- if (gpuFuncOp.isKernel())
+ if (gpuFuncOp.isKernel()) {
attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+
+ // Set the block size attribute if it is present.
+ if (kernelBlockSizeAttributeName.has_value()) {
+ std::optional<int32_t> dimX =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::x);
+ std::optional<int32_t> dimY =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::y);
+ std::optional<int32_t> dimZ =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::z);
+ if (dimX.has_value() || dimY.has_value() || dimZ.has_value()) {
+ // If any of the dimensions are missing, fill them in with 1.
+ attributes.emplace_back(
+ kernelBlockSizeAttributeName.value(),
+ rewriter.getI32ArrayAttr(
+ {dimX.value_or(1), dimY.value_or(1), dimZ.value_or(1)}));
+ }
+ }
+ }
auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index a77db4a..471a688 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -36,13 +36,15 @@ private:
};
struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
- GPUFuncOpLowering(const LLVMTypeConverter &converter,
- unsigned allocaAddrSpace, unsigned workgroupAddrSpace,
- StringAttr kernelAttributeName)
+ GPUFuncOpLowering(
+ const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
+ unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
+ std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
: ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
allocaAddrSpace(allocaAddrSpace),
workgroupAddrSpace(workgroupAddrSpace),
- kernelAttributeName(kernelAttributeName) {}
+ kernelAttributeName(kernelAttributeName),
+ kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
LogicalResult
matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -56,6 +58,9 @@ private:
/// The attribute name to use instead of `gpu.kernel`.
StringAttr kernelAttributeName;
+
+  /// The attribute name used to set the block size.
+ std::optional<StringAttr> kernelBlockSizeAttributeName;
};
/// The lowering of gpu.printf to a call to HIP hostcalls
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index e60fe5c..a7ac233 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -352,7 +352,9 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
/*workgroupAddrSpace=*/
static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
StringAttr::get(&converter.getContext(),
- NVVM::NVVMDialect::getKernelFuncAttrName()));
+ NVVM::NVVMDialect::getKernelFuncAttrName()),
+ StringAttr::get(&converter.getContext(),
+ NVVM::NVVMDialect::getMaxntidAttrName()));
populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
"__nv_fabs");
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 20a200e..c7f1d4f 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -627,6 +627,15 @@ gpu.module @test_module_31 {
}
}
+gpu.module @gpumodule {
+// CHECK-LABEL: func @kernel_with_block_size()
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = [128 : i32, 1 : i32, 1 : i32]}
+ gpu.func @kernel_with_block_size() kernel attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
+ gpu.return
+ }
+}
+
+
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
%gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module