diff options
author | Guray Ozen <guray.ozen@gmail.com> | 2025-02-05 12:38:37 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-02-05 12:38:37 +0100 |
commit | baf27862ddb23c3854cb6782a3f1675da4722a50 (patch) | |
tree | 9f7ce150b0a97e3c40c0af99a2732e873e55381c | |
parent | f0b8ff12519270adcfef93410abff76ab073476a (diff) | |
download | llvm-baf27862ddb23c3854cb6782a3f1675da4722a50.zip llvm-baf27862ddb23c3854cb6782a3f1675da4722a50.tar.gz llvm-baf27862ddb23c3854cb6782a3f1675da4722a50.tar.bz2 |
[MLIR][NVGPU] Move max threads/blocks size to dialect (NFC) (#124454)
This PR moves maximum number of threads in a block and block in a grid
to nvgpu dialect to avoid replicated code.
The limits are defined here:
https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
-rw-r--r-- | mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h | 12 | ||||
-rw-r--r-- | mlir/lib/Dialect/GPU/TransformOps/Utils.cpp | 25 |
2 files changed, 21 insertions, 16 deletions
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h index aad2ac6..db4c63b 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h @@ -22,8 +22,20 @@ #include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc" +// Maximum warp size constexpr int kWarpSize = 32; +// Maximum number of threads in a block and block in a grid +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability +constexpr int kMaxTotalBlockdim = 1024; +constexpr int kMaxBlockdimx = 1024; +constexpr int kMaxBlockdimy = 1024; +constexpr int kMaxBlockdimz = 64; +constexpr int kMaxTotalGriddim = 2147483647; +constexpr int kMaxGriddimx = 2147483647; +constexpr int kMaxGriddimy = 65535; +constexpr int kMaxGriddimz = 65535; + /// M size of wgmma.mma_async instruction constexpr int kWgmmaSizeM = 64; diff --git a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp index f4d3612..f5a6d08 100644 --- a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp +++ b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" @@ -237,25 +238,17 @@ DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp, std::optional<int64_t> blockDimZ) { // TODO: pass a configuration object to set the limits properly. - static constexpr int maxTotalBlockdim = 1024; - static constexpr int maxBlockdimx = 1024; - static constexpr int maxBlockdimy = 1024; - static constexpr int maxBlockdimz = 64; - static constexpr int maxTotalGriddim = 2147483647; - static constexpr int maxGriddimx = 2147483647; - static constexpr int maxGriddimy = 65535; - static constexpr int maxGriddimz = 65535; if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) > - maxTotalBlockdim || + kMaxTotalBlockdim || (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) > - maxTotalGriddim || - blockDimX.value_or(1) > maxBlockdimx || - blockDimY.value_or(1) > maxBlockdimy || - blockDimZ.value_or(1) > maxBlockdimz || - gridDimY.value_or(1) > maxGriddimy || - gridDimZ.value_or(1) > maxGriddimz || - gridDimX.value_or(1) > maxGriddimx) { + kMaxTotalGriddim || + blockDimX.value_or(1) > kMaxBlockdimx || + blockDimY.value_or(1) > kMaxBlockdimy || + blockDimZ.value_or(1) > kMaxBlockdimz || + gridDimY.value_or(1) > kMaxGriddimy || + gridDimZ.value_or(1) > kMaxGriddimz || + gridDimX.value_or(1) > kMaxGriddimx) { return transformOp.emitSilenceableError() << "Trying to launch a GPU kernel with grid_dims = (" << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", " |