diff options
| author | Krzysztof Drewniak <Krzysztof.Drewniak@amd.com> | 2026-02-12 01:40:39 +0000 |
|---|---|---|
| committer | Krzysztof Drewniak <Krzysztof.Drewniak@amd.com> | 2026-02-12 02:25:06 +0000 |
| commit | df92c4c8bf7e19aeae2bc7ecd815acd1cdbe2c6f (patch) | |
| tree | aef87a07e1fd7d954c43afe3921bb7a8aa8f854f | |
| parent | 2ada4b8fb0914ebdddc386130db24bcc074b3554 (diff) | |
| download | llvm-users/krzysz00/rocdl-async-lds.tar.gz llvm-users/krzysz00/rocdl-async-lds.tar.bz2 llvm-users/krzysz00/rocdl-async-lds.zip | |
[mlir][ROCDL] Add async variants of pre-gfx12 LDS load intrinsics (branch: users/krzysz00/rocdl-async-lds)
These are MLIR wrappers around #180466.
-----
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
| -rw-r--r-- | mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 92 | ||||
| -rw-r--r-- | mlir/test/Dialect/LLVMIR/rocdl.mlir | 24 | ||||
| -rw-r--r-- | mlir/test/Target/LLVMIR/rocdl.mlir | 24 |
3 files changed, 140 insertions, 0 deletions
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index c3af1bd32ebd..7f6961ee3e06 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -1102,6 +1102,49 @@ def ROCDL_LoadToLDSOp :
   }];
 }
 
+def ROCDL_LoadAsyncToLDSOp :
+  ROCDL_IntrOp<"load.async.to.lds", [], [0], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
+  dag args = (ins Arg<LLVM_AnyPointer, "", [MemRead]>:$globalPtr,
+                  Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+                  I32Attr:$size,
+                  I32Attr:$offset,
+                  I32Attr:$aux);
+  let arguments = !con(args, baseArgs);
+  let assemblyFormat = [{
+    $globalPtr `,` $ldsPtr `,` $size `,` $offset `,` $aux
+    attr-dict `:` qualified(type($globalPtr)) `,` qualified(type($ldsPtr))
+  }];
+  let extraClassDefinition = [{
+    ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+      return {getGlobalPtr(), getLdsPtr()};
+    }
+  }];
+
+  let summary = "Gathering load to LDS that requires explicit async memory tracking";
+  let description = [{
+    Load `size` bytes (the valid sizes vary by architecture) from the global memory
+    pointed to by `globalPtr` and put them at `ldsPtr`, concatenating (and applying
+    padding for sizes less than 4 bytes, along with padding out 12-byte reads
+    to 16-byte writes). The value of `globalPtr` can vary between lanes, while
+    `ldsPtr` must be subgroup-uniform (the values from each lane are concatenated
+    before being written to LDS with appropriate padding applied.)
+
+    `offset` is a constant offset applied to **both** pointers, and `aux` sets the cache
+    policy. Unlike `rocdl.load.to.lds`, the compiler will not automatically insert waits
+    for this load to complete at the point it thinks you're using a region of LDS you've
+    stored values to - you need to use the `rocdl.asyncmark` and `rocdl.wait.asyncmark`
+    operations to explicitly group these operations and wait for their completion.
+
+    Available on gfx10 and earlier with varying supported values of `size`.
+
+    Example:
+    ```mlir
+    rocdl.load.async.to.lds %global, %shared, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+    rocdl.load.async.to.lds %fatBuffer, %shared, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+    ```
+  }];
+}
+
 def ROCDL_GlobalLoadLDSOp :
   ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
   dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
@@ -1121,6 +1164,27 @@ def ROCDL_GlobalLoadLDSOp :
   }];
 }
 
+def ROCDL_GlobalLoadAsyncLDSOp :
+  ROCDL_IntrOp<"global.load.async.lds", [], [], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
+  dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
+                  Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+                  I32Attr:$size,
+                  I32Attr:$offset,
+                  I32Attr:$aux);
+  let arguments = !con(args, baseArgs);
+  let assemblyFormat = [{
+    $globalPtr `,` $ldsPtr `,` $size `,` $offset `,` $aux
+    attr-dict `:` qualified(type($globalPtr)) `,` qualified(type($ldsPtr))
+  }];
+  let extraClassDefinition = [{
+    ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+      return {getGlobalPtr(), getLdsPtr()};
+    }
+  }];
+
+  let summary = "Version of rocdl.load.async.to.lds specialized to global pointers";
+}
+
 //===---------------------------------------------------------------------===//
 // Async load to LDS intrinsic (available in GFX1250)
 //===---------------------------------------------------------------------===//
@@ -1293,6 +1357,34 @@ def ROCDL_RawPtrBufferLoadLdsOp :
   }];
 }
 
+def ROCDL_RawPtrBufferLoadAsyncLdsOp :
+  ROCDL_IntrOp<"raw.ptr.buffer.load.async.lds", [], [], [], 0, 0, 1> {
+  dag args = (ins Arg<ROCDLBufferRsrc, "", [MemRead]>:$rsrc,
+                  Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+                  I32:$size,
+                  I32:$voffset,
+                  I32:$soffset,
+                  I32:$offset,
+                  I32:$aux);
+  let arguments = !con(args, baseArgs);
+  let assemblyFormat = "operands attr-dict";
+  let extraClassDefinition = [{
+    ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+      return {getRsrc(), getLdsPtr()};
+    }
+  }];
+  let summary = "Async variant of raw.ptr.buffer.load.lds";
+  let description = [{
+    Load from a buffer resource `rsrc` to `ldsPtr`, which must be uniform.
+
+    See `rocdl.load.async.to.lds` for overall semantics of such loads, noting that
+    here `voffset` can be lane-varying and that `rsrc` (which holds the base address)
+    must, as always, be uniform.
+
+    Available on gfx9 and gfx10.
+  }];
+}
+
 def ROCDL_RawPtrBufferStoreOp :
   ROCDL_IntrOp<"raw.ptr.buffer.store", [], [0], [], 0, 0, 1> {
   dag args = (ins LLVM_Type:$vdata,
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 2adb5bc90915..1ab668a64179 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -763,6 +763,13 @@ llvm.func @rocdl.load.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
   llvm.return
 }
 
+llvm.func @rocdl.load.async.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
+  // CHECK-LABEL: @rocdl.load.async.to.lds
+  //CHECK: rocdl.load.async.to.lds %{{.*}}, %{{.*}}, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+  rocdl.load.async.to.lds %src, %dst, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+  llvm.return
+}
+
 llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   // CHECK-LABEL @rocdl.global.load.lds
   //CHECK: rocdl.global.load.lds %{{.*}}, %{{.*}}, 4, 0, 0
@@ -770,6 +777,13 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   llvm.return
 }
 
+llvm.func @rocdl.global.load.async.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+  // CHECK-LABEL: @rocdl.global.load.async.lds
+  //CHECK: rocdl.global.load.async.lds %{{.*}}, %{{.*}}, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+  rocdl.global.load.async.lds %src, %dst, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+  llvm.return
+}
+
 llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   // CHECK-LABEL @rocdl.global.load.async.to.lds
   // CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0
@@ -877,6 +891,16 @@ llvm.func @rocdl.raw.ptr.buffer.load.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.p
   llvm.return
 }
 
+llvm.func @rocdl.raw.ptr.buffer.load.async.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.ptr<3>,
+                                     %size: i32, %voffset : i32, %soffset : i32, %offset : i32,
+                                     %aux : i32) {
+  // CHECK-LABEL: rocdl.raw.ptr.buffer.load.async.lds
+  // CHECK: rocdl.raw.ptr.buffer.load.async.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+  rocdl.raw.ptr.buffer.load.async.lds %rsrc, %dstLds, %size, %voffset, %soffset, %offset, %aux
+
+  llvm.return
+}
+
 llvm.func @rocdl.raw.ptr.buffer.i32(%rsrc : !llvm.ptr<8>,
                        %offset : i32, %soffset : i32,
                        %aux : i32, %vdata1 : i32,
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 7a7e76410e4d..0d6e0c8aea50 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1224,12 +1224,24 @@ llvm.func @rocdl.load.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
   llvm.return
 }
 
+llvm.func @rocdl.load.async.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
+  //CHECK: call void @llvm.amdgcn.load.async.to.lds.p7
+  rocdl.load.async.to.lds %src, %dst, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+  llvm.return
+}
+
 llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   //CHECK: call void @llvm.amdgcn.global.load.lds
   rocdl.global.load.lds %src, %dst, 4, 0, 0
   llvm.return
 }
 
+llvm.func @rocdl.global.load.async.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+  //CHECK: call void @llvm.amdgcn.global.load.async.lds
+  rocdl.global.load.async.lds %src, %dst, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+  llvm.return
+}
+
 // CHECK-LABEL: rocdl.global.load.async.to.lds
 llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8
@@ -1390,6 +1402,18 @@ llvm.func @rocdl.raw.ptr.buffer.load.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.p
   llvm.return
 }
 
+llvm.func @rocdl.raw.ptr.buffer.load.async.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.ptr<3>,
+                                     %voffset : i32, %soffset : i32) {
+  %size = llvm.mlir.constant(4 : i32) : i32
+  %offset = llvm.mlir.constant(128 : i32) : i32
+  %aux = llvm.mlir.constant(1 : i32) : i32
+  // CHECK-LABEL: rocdl.raw.ptr.buffer.load.async.lds
+  // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %{{.*}}, ptr addrspace(3) %{{.*}}, i32 4, i32 %{{.*}}, i32 %{{.*}}, i32 128, i32 1
+  rocdl.raw.ptr.buffer.load.async.lds %rsrc, %dstLds, %size, %voffset, %soffset, %offset, %aux
+
+  llvm.return
+}
+
 llvm.func @rocdl.global.prefetch(%ptr : !llvm.ptr<1>) {
   // CHECK-LABEL: rocdl.global.prefetch
   // CHECK: call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %{{.*}}, i32 0)
