about summary refs log tree commit diff
path: root/mlir/test/Conversion/GPUCommon
diff options
context:
space:
mode:
authorGuray Ozen <guray.ozen@gmail.com>2023-11-27 11:05:07 +0100
committerGitHub <noreply@github.com>2023-11-27 11:05:07 +0100
commitedf5cae7391cdb097a090ea142dfa7ac6ac03555 (patch)
tree423383047badea2aa92ebc6e60cd0ced1cea9c85 /mlir/test/Conversion/GPUCommon
parentd1652ff0803ac9f2f3ea99336f71edacdf95a721 (diff)
downloadllvm-edf5cae7391cdb097a090ea142dfa7ac6ac03555.zip
llvm-edf5cae7391cdb097a090ea142dfa7ac6ac03555.tar.gz
llvm-edf5cae7391cdb097a090ea142dfa7ac6ac03555.tar.bz2
[mlir][gpu] Support Cluster of Thread Blocks in `gpu.launch_func` (#72871)
NVIDIA Hopper architecture introduced the Cooperative Group Array (CGA). It is a new level of parallelism, allowing clustering of Cooperative Thread Arrays (CTA) to synchronize and communicate through shared memory while running concurrently. This PR enables support for CGA within the `gpu.launch_func` in the GPU dialect. It extends `gpu.launch_func` to accommodate this functionality. The GPU dialect remains architecture-agnostic, so we've added CGA functionality as optional parameters. We want to leverage mechanisms that we have in the GPU dialects such as outlining and kernel launching, making it a practical and convenient choice. An example of this implementation can be seen below: ``` gpu.launch_func @kernel_module::@kernel clusters in (%1, %0, %0) // <-- Optional blocks in (%0, %0, %0) threads in (%0, %0, %0) ``` The PR also introduces index and dimensions Ops specific to clusters, binding them to NVVM Ops: ``` %cidX = gpu.cluster_id x %cidY = gpu.cluster_id y %cidZ = gpu.cluster_id z %cdimX = gpu.cluster_dim x %cdimY = gpu.cluster_dim y %cdimZ = gpu.cluster_dim z ``` We will introduce cluster support in `gpu.launch` Op in an upcoming PR. See [the documentation](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-of-cooperative-thread-arrays) provided by NVIDIA for details.
Diffstat (limited to 'mlir/test/Conversion/GPUCommon')
-rw-r--r--mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir38
1 file changed, 38 insertions, 0 deletions
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
index f5462b5..c0b05ef 100644
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -96,3 +96,41 @@ module attributes {gpu.container_module} {
return
}
}
+
+
+// -----
+
+module attributes {gpu.container_module} {
+ // CHECK: gpu.module
+ gpu.module @kernel_module [#nvvm.target] {
+ llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr,
+ %arg2: !llvm.ptr, %arg3: i64, %arg4: i64,
+ %arg5: i64) attributes {gpu.kernel} {
+ llvm.return
+ }
+ }
+
+ func.func @foo(%buffer: memref<?xf32>) {
+ // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64
+ // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32
+ // CHECK: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32
+ // CHECK: [[C2:%.*]] = llvm.mlir.constant(2 : index) : i64
+ %c8 = arith.constant 8 : index
+ %c32 = arith.constant 32 : i32
+ %c256 = arith.constant 256 : i32
+ %c2 = arith.constant 2 : index
+
+ // CHECK: gpu.launch_func @kernel_module::@kernel
+ // CHECK: clusters in ([[C2]], [[C2]], [[C2]])
+ // CHECK: blocks in ([[C8]], [[C8]], [[C8]]) threads in ([[C8]], [[C8]], [[C8]]) : i64
+ // CHECK: dynamic_shared_memory_size [[C256]]
+ // CHECK: args([[C32]] : i32, %{{.*}} : !llvm.ptr, %{{.*}} : !llvm.ptr, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64)
+ gpu.launch_func @kernel_module::@kernel
+ clusters in (%c2, %c2, %c2)
+ blocks in (%c8, %c8, %c8)
+ threads in (%c8, %c8, %c8)
+ dynamic_shared_memory_size %c256
+ args(%c32 : i32, %buffer : memref<?xf32>)
+ return
+ }
+}