aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGuray Ozen <guray.ozen@gmail.com>2024-01-08 14:49:19 +0100
committerGitHub <noreply@github.com>2024-01-08 14:49:19 +0100
commit763109e346b90193027b24743e266495d992b1c6 (patch)
treec13328a9e881252109f20383a938b7504f03c0d6
parent2edce427a8b17d1d2192c1ee4a2227b6eb2971a0 (diff)
downloadllvm-763109e346b90193027b24743e266495d992b1c6.zip
llvm-763109e346b90193027b24743e266495d992b1c6.tar.gz
llvm-763109e346b90193027b24743e266495d992b1c6.tar.bz2
[mlir][gpu] Use `known_block_size` to set `maxntid` for NVVM target (#77301)
Setting the thread block size with `maxntid` on the kernel has great performance benefits. In this way, the downstream PTX compiler can do better register allocation. MLIR's `gpu.launch` and `gpu.launch_func` already have an attribute (`known_block_size`) that keeps the thread block size when it is known. This PR simply uses this attribute to set `maxntid`.
-rw-r--r--mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp20
-rw-r--r--mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h13
-rw-r--r--mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp4
-rw-r--r--mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir9
4 files changed, 40 insertions, 6 deletions
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 6a005e6..eeb8fbb 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -85,8 +85,26 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
// Add a dialect specific kernel attribute in addition to GPU kernel
// attribute. The former is necessary for further translation while the
// latter is expected by gpu.launch_func.
- if (gpuFuncOp.isKernel())
+ if (gpuFuncOp.isKernel()) {
attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+
+ // Set the block size attribute if it is present.
+ if (kernelBlockSizeAttributeName.has_value()) {
+ std::optional<int32_t> dimX =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::x);
+ std::optional<int32_t> dimY =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::y);
+ std::optional<int32_t> dimZ =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::z);
+ if (dimX.has_value() || dimY.has_value() || dimZ.has_value()) {
+ // If any of the dimensions are missing, fill them in with 1.
+ attributes.emplace_back(
+ kernelBlockSizeAttributeName.value(),
+ rewriter.getI32ArrayAttr(
+ {dimX.value_or(1), dimY.value_or(1), dimZ.value_or(1)}));
+ }
+ }
+ }
auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index a77db4a..471a688 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -36,13 +36,15 @@ private:
};
struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
- GPUFuncOpLowering(const LLVMTypeConverter &converter,
- unsigned allocaAddrSpace, unsigned workgroupAddrSpace,
- StringAttr kernelAttributeName)
+ GPUFuncOpLowering(
+ const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
+ unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
+ std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
: ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
allocaAddrSpace(allocaAddrSpace),
workgroupAddrSpace(workgroupAddrSpace),
- kernelAttributeName(kernelAttributeName) {}
+ kernelAttributeName(kernelAttributeName),
+ kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
LogicalResult
matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -56,6 +58,9 @@ private:
/// The attribute name to use instead of `gpu.kernel`.
StringAttr kernelAttributeName;
+
+  /// The attribute name used to set the block size.
+ std::optional<StringAttr> kernelBlockSizeAttributeName;
};
/// The lowering of gpu.printf to a call to HIP hostcalls
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index e60fe5c..a7ac233 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -352,7 +352,9 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
/*workgroupAddrSpace=*/
static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
StringAttr::get(&converter.getContext(),
- NVVM::NVVMDialect::getKernelFuncAttrName()));
+ NVVM::NVVMDialect::getKernelFuncAttrName()),
+ StringAttr::get(&converter.getContext(),
+ NVVM::NVVMDialect::getMaxntidAttrName()));
populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
"__nv_fabs");
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 20a200e..c7f1d4f 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -627,6 +627,15 @@ gpu.module @test_module_31 {
}
}
+gpu.module @gpumodule {
+// CHECK-LABEL: func @kernel_with_block_size()
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = [128 : i32, 1 : i32, 1 : i32]}
+ gpu.func @kernel_with_block_size() kernel attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
+ gpu.return
+ }
+}
+
+
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
%gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module