aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKrzysztof Drewniak <Krzysztof.Drewniak@amd.com>2026-02-12 01:40:39 +0000
committerKrzysztof Drewniak <Krzysztof.Drewniak@amd.com>2026-02-12 02:25:06 +0000
commitdf92c4c8bf7e19aeae2bc7ecd815acd1cdbe2c6f (patch)
treeaef87a07e1fd7d954c43afe3921bb7a8aa8f854f
parent2ada4b8fb0914ebdddc386130db24bcc074b3554 (diff)
downloadllvm-users/krzysz00/rocdl-async-lds.tar.gz
llvm-users/krzysz00/rocdl-async-lds.tar.bz2
llvm-users/krzysz00/rocdl-async-lds.zip
[mlir][ROCDL] Add async variants of pre-gfx12 LDS load intrinsicsusers/krzysz00/rocdl-async-lds
These are MLIR wrappers around #180466. ----- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td92
-rw-r--r--mlir/test/Dialect/LLVMIR/rocdl.mlir24
-rw-r--r--mlir/test/Target/LLVMIR/rocdl.mlir24
3 files changed, 140 insertions, 0 deletions
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index c3af1bd32ebd..7f6961ee3e06 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -1102,6 +1102,49 @@ def ROCDL_LoadToLDSOp :
}];
}
+def ROCDL_LoadAsyncToLDSOp :
+ ROCDL_IntrOp<"load.async.to.lds", [], [0], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
+ dag args = (ins Arg<LLVM_AnyPointer, "", [MemRead]>:$globalPtr,
+ Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+ I32Attr:$size,
+ I32Attr:$offset,
+ I32Attr:$aux);
+ let arguments = !con(args, baseArgs);
+ let assemblyFormat = [{
+ $globalPtr `,` $ldsPtr `,` $size `,` $offset `,` $aux
+ attr-dict `:` qualified(type($globalPtr)) `,` qualified(type($ldsPtr))
+ }];
+ let extraClassDefinition = [{
+ ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+ return {getGlobalPtr(), getLdsPtr()};
+ }
+ }];
+
+ let summary = "Gathering load to LDS that requires explicit async memory tracking";
+ let description = [{
+ Load `size` bytes (the valid sizes vary by architecture) from the global memory
+ pointed to by `globalPtr` and put them at `ldsPtr`, concatenating (and applying
+ padding for sizes less than 4 bytes, along with padding out 12-byte reads
+ to 16-byte writes). The value of `globalPtr` can vary between lanes, while
+ `ldsPtr` must be subgroup-uniform (the values from each lane are concatenated
+ before being written to LDS with appropriate padding applied).
+
+ `offset` is a constant offset applied to **both** pointers, and `aux` sets the cache
+ policy. Unlike `rocdl.load.to.lds`, the compiler will not automatically insert waits
+ for this load to complete at the point it thinks you're using a region of LDS you've
+ stored values to - you need to use the `rocdl.asyncmark` and `rocdl.wait.asyncmark`
+ operations to explicitly group these operations and wait for their completion.
+
+ Available on gfx10 and earlier with varying supported values of `size`.
+
+ Example:
+ ```mlir
+ rocdl.load.async.to.lds %global, %shared, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+ rocdl.load.async.to.lds %fatBuffer, %shared, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+ ```
+ }];
+}
+
def ROCDL_GlobalLoadLDSOp :
ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
@@ -1121,6 +1164,27 @@ def ROCDL_GlobalLoadLDSOp :
}];
}
+def ROCDL_GlobalLoadAsyncLDSOp :
+ ROCDL_IntrOp<"global.load.async.lds", [], [], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
+ dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
+ Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+ I32Attr:$size,
+ I32Attr:$offset,
+ I32Attr:$aux);
+ let arguments = !con(args, baseArgs);
+ let assemblyFormat = [{
+ $globalPtr `,` $ldsPtr `,` $size `,` $offset `,` $aux
+ attr-dict `:` qualified(type($globalPtr)) `,` qualified(type($ldsPtr))
+ }];
+ let extraClassDefinition = [{
+ ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+ return {getGlobalPtr(), getLdsPtr()};
+ }
+ }];
+
+ let summary = "Version of rocdl.load.async.to.lds specialized to global pointers";
+}
+
//===---------------------------------------------------------------------===//
// Async load to LDS intrinsic (available in GFX1250)
//===---------------------------------------------------------------------===//
@@ -1293,6 +1357,34 @@ def ROCDL_RawPtrBufferLoadLdsOp :
}];
}
+def ROCDL_RawPtrBufferLoadAsyncLdsOp :
+ ROCDL_IntrOp<"raw.ptr.buffer.load.async.lds", [], [], [], 0, 0, 1> {
+ dag args = (ins Arg<ROCDLBufferRsrc, "", [MemRead]>:$rsrc,
+ Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+ I32:$size,
+ I32:$voffset,
+ I32:$soffset,
+ I32:$offset,
+ I32:$aux);
+ let arguments = !con(args, baseArgs);
+ let assemblyFormat = "operands attr-dict";
+ let extraClassDefinition = [{
+ ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+ return {getRsrc(), getLdsPtr()};
+ }
+ }];
+ let summary = "Async variant of raw.ptr.buffer.load.lds";
+ let description = [{
+ Load from a buffer resource `rsrc` to `ldsPtr`, which must be uniform.
+
+ See `rocdl.load.async.to.lds` for overall semantics of such loads, noting that
+ here `voffset` can be lane-varying and that `rsrc` (which holds the base address)
+ must, as always, be uniform.
+
+ Available on gfx9 and gfx10.
+ }];
+}
+
def ROCDL_RawPtrBufferStoreOp :
ROCDL_IntrOp<"raw.ptr.buffer.store", [], [0], [], 0, 0, 1> {
dag args = (ins LLVM_Type:$vdata,
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 2adb5bc90915..1ab668a64179 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -763,6 +763,13 @@ llvm.func @rocdl.load.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
llvm.return
}
+llvm.func @rocdl.load.async.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
+ // CHECK-LABEL: @rocdl.load.async.to.lds
+ //CHECK: rocdl.load.async.to.lds %{{.*}}, %{{.*}}, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+ rocdl.load.async.to.lds %src, %dst, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+ llvm.return
+}
+
llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
// CHECK-LABEL @rocdl.global.load.lds
//CHECK: rocdl.global.load.lds %{{.*}}, %{{.*}}, 4, 0, 0
@@ -770,6 +777,13 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
llvm.return
}
+llvm.func @rocdl.global.load.async.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ // CHECK-LABEL: @rocdl.global.load.async.lds
+ //CHECK: rocdl.global.load.async.lds %{{.*}}, %{{.*}}, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+ rocdl.global.load.async.lds %src, %dst, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+ llvm.return
+}
+
llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
// CHECK-LABEL @rocdl.global.load.async.to.lds
// CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0
@@ -877,6 +891,16 @@ llvm.func @rocdl.raw.ptr.buffer.load.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.p
llvm.return
}
+llvm.func @rocdl.raw.ptr.buffer.load.async.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.ptr<3>,
+ %size: i32, %voffset : i32, %soffset : i32, %offset : i32,
+ %aux : i32) {
+ // CHECK-LABEL: rocdl.raw.ptr.buffer.load.async.lds
+ // CHECK: rocdl.raw.ptr.buffer.load.async.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+ rocdl.raw.ptr.buffer.load.async.lds %rsrc, %dstLds, %size, %voffset, %soffset, %offset, %aux
+
+ llvm.return
+}
+
llvm.func @rocdl.raw.ptr.buffer.i32(%rsrc : !llvm.ptr<8>,
%offset : i32, %soffset : i32,
%aux : i32, %vdata1 : i32,
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 7a7e76410e4d..0d6e0c8aea50 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1224,12 +1224,24 @@ llvm.func @rocdl.load.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
llvm.return
}
+llvm.func @rocdl.load.async.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
+ //CHECK: call void @llvm.amdgcn.load.async.to.lds.p7
+ rocdl.load.async.to.lds %src, %dst, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+ llvm.return
+}
+
llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
//CHECK: call void @llvm.amdgcn.global.load.lds
rocdl.global.load.lds %src, %dst, 4, 0, 0
llvm.return
}
+llvm.func @rocdl.global.load.async.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+ //CHECK: call void @llvm.amdgcn.global.load.async.lds
+ rocdl.global.load.async.lds %src, %dst, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+ llvm.return
+}
+
// CHECK-LABEL: rocdl.global.load.async.to.lds
llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
// CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8
@@ -1390,6 +1402,18 @@ llvm.func @rocdl.raw.ptr.buffer.load.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.p
llvm.return
}
+llvm.func @rocdl.raw.ptr.buffer.load.async.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.ptr<3>,
+ %voffset : i32, %soffset : i32) {
+ %size = llvm.mlir.constant(4 : i32) : i32
+ %offset = llvm.mlir.constant(128 : i32) : i32
+ %aux = llvm.mlir.constant(1 : i32) : i32
+ // CHECK-LABEL: rocdl.raw.ptr.buffer.load.async.lds
+ // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %{{.*}}, ptr addrspace(3) %{{.*}}, i32 4, i32 %{{.*}}, i32 %{{.*}}, i32 128, i32 1
+ rocdl.raw.ptr.buffer.load.async.lds %rsrc, %dstLds, %size, %voffset, %soffset, %offset, %aux
+
+ llvm.return
+}
+
llvm.func @rocdl.global.prefetch(%ptr : !llvm.ptr<1>) {
// CHECK-LABEL: rocdl.global.prefetch
// CHECK: call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %{{.*}}, i32 0)