author     Ivan Butygin <ivan.butygin@gmail.com>  2025-07-22 22:37:56 +0200
committer  GitHub <noreply@github.com>            2025-07-22 23:37:56 +0300
commit     4977100624c5320e50d1adce341042b966b36124 (patch)
tree       9295b67c1df9112195c641a9286a02ac321048b4
parent     921287e126465d6850954855ded640f0f78d72fd (diff)
[mlir][amdgpu] Add `rocdl.s.waitcnt` wrapper (#149670)
The main motivation is to pass vmcnt/expcnt/lgkmcnt values directly (similar to the asm format) and delegate the architecture-dependent bitpacking to the amdgpu->rocdl lowering.

---------

Signed-off-by: Ivan Butygin <ivan.butygin@gmail.com>
-rw-r--r--  mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td  23
-rw-r--r--  mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp  112
-rw-r--r--  mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir  42
-rw-r--r--  mlir/test/Dialect/AMDGPU/ops.mlir  17
4 files changed, 191 insertions, 3 deletions
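To make the division of labor concrete, here is a small sketch (the function name is illustrative; the counter value and the lowered forms are taken from the new memory_counter_wait.mlir test added in this patch): counter values are written directly on the amdgpu op, and the conversion pass does the architecture-dependent packing.

  func.func @wait_load_example() {
    // Counters are specified directly, in asm-like form.
    amdgpu.memory_counter_wait load(0)
    // -convert-amdgpu-to-rocdl=chipset=gfx942 bitpacks this into
    //   rocdl.s.waitcnt 3952
    // while chipset=gfx1201 needs no packing and emits
    //   rocdl.s.wait.loadcnt 0
    func.return
  }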
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 5a53b15..b237f7b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -719,6 +719,29 @@ def AMDGPU_SchedBarrierOp :
}];
}
+def AMDGPU_MemoryCounterWaitOp :
+ AMDGPU_Op<"memory_counter_wait">,
+ Arguments<(ins
+ OptionalAttr<I32Attr>:$load,
+ OptionalAttr<I32Attr>:$store,
+ OptionalAttr<I32Attr>:$ds,
+ OptionalAttr<I32Attr>:$exp
+ )>
+ {
+ let summary = "Wait for specified hardware counters";
+ let description = [{
+    Wait for the specified counters to be less than or equal to the provided
+    values before continuing.
+
+    Counters can lower to different instructions on different architectures,
+    including clamping to the maximum value supported by the hardware or
+    combining multiple counters into one.
+ }];
+ let assemblyFormat = [{
+ oilist( `load` `(` $load `)` | `store` `(` $store `)` | `ds` `(` $ds `)` | `exp` `(` $exp `)` ) attr-dict
+ }];
+}
+
def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB",
"The possible permutations of the lanes storing B available in an MFMA",
[
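The oilist in the assembly format above makes every counter clause optional and order-independent. A round-trip sketch (function name illustrative; the clause-ordering behavior mirrors the ops.mlir update further down):

  func.func @wait_clause_order_example() {
    // Clauses are optional and may appear in any order; the printer emits
    // them in the canonical load/store/ds/exp order, so this round-trips as
    //   amdgpu.memory_counter_wait load(4) store(2) ds(3) exp(1)
    amdgpu.memory_counter_wait exp(1) store(2) ds(3) load(4)
    func.return
  }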
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index ef35ee2..309476c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -419,6 +419,112 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
}
};
+// TODO: The AMDGPU backend already has all this bitpacking logic; we should
+// move it to some common place.
+/// Vmcnt, Expcnt and Lgkmcnt are decoded as follows:
+/// Vmcnt = Waitcnt[3:0] (pre-gfx9)
+/// Vmcnt = Waitcnt[15:14,3:0] (gfx9,10)
+/// Vmcnt = Waitcnt[15:10] (gfx11)
+/// Expcnt = Waitcnt[6:4] (pre-gfx11)
+/// Expcnt = Waitcnt[2:0] (gfx11)
+/// Lgkmcnt = Waitcnt[11:8] (pre-gfx10)
+/// Lgkmcnt = Waitcnt[13:8] (gfx10)
+/// Lgkmcnt = Waitcnt[9:4] (gfx11)
+static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
+ unsigned expcnt, unsigned lgkmcnt) {
+ if (chipset.majorVersion < 9) {
+ vmcnt = std::min(15u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(15u, lgkmcnt);
+ return vmcnt | (expcnt << 4) | (lgkmcnt << 8);
+ }
+ if (chipset.majorVersion == 9) {
+ vmcnt = std::min(63u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(15u, lgkmcnt);
+ unsigned lowBits = vmcnt & 0xF;
+ unsigned highBits = (vmcnt >> 4) << 14;
+ unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
+ return lowBits | highBits | otherCnts;
+ }
+ if (chipset.majorVersion == 10) {
+ vmcnt = std::min(63u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(63u, lgkmcnt);
+ unsigned lowBits = vmcnt & 0xF;
+ unsigned highBits = (vmcnt >> 4) << 14;
+ unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
+ return lowBits | highBits | otherCnts;
+ }
+ if (chipset.majorVersion == 11) {
+ vmcnt = std::min(63u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(63u, lgkmcnt);
+ return (vmcnt << 10) | expcnt | (lgkmcnt << 4);
+ }
+ return failure();
+}
+
+struct MemoryCounterWaitOpLowering
+ : public ConvertOpToLLVMPattern<MemoryCounterWaitOp> {
+ MemoryCounterWaitOpLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<MemoryCounterWaitOp>(converter),
+ chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(MemoryCounterWaitOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset.majorVersion >= 12) {
+ Location loc = op.getLoc();
+ if (std::optional<int> ds = adaptor.getDs())
+ rewriter.create<ROCDL::WaitDscntOp>(loc, *ds);
+
+ if (std::optional<int> load = adaptor.getLoad())
+ rewriter.create<ROCDL::WaitLoadcntOp>(loc, *load);
+
+ if (std::optional<int> store = adaptor.getStore())
+ rewriter.create<ROCDL::WaitStorecntOp>(loc, *store);
+
+ if (std::optional<int> exp = adaptor.getExp())
+ rewriter.create<ROCDL::WaitExpcntOp>(loc, *exp);
+
+ rewriter.eraseOp(op);
+ return success();
+ }
+
+ auto getVal = [](Attribute attr) -> unsigned {
+ if (attr)
+ return cast<IntegerAttr>(attr).getInt();
+
+ // This value will be clamped to the maximum value for the chipset.
+ return 1024;
+ };
+ unsigned ds = getVal(adaptor.getDsAttr());
+ unsigned exp = getVal(adaptor.getExpAttr());
+
+ unsigned vmcnt = 1024;
+ Attribute load = adaptor.getLoadAttr();
+ Attribute store = adaptor.getStoreAttr();
+ if (load && store) {
+ vmcnt = getVal(load) + getVal(store);
+ } else if (load) {
+ vmcnt = getVal(load);
+ } else if (store) {
+ vmcnt = getVal(store);
+ }
+
+ FailureOr<unsigned> waitcnt = encodeWaitcnt(chipset, vmcnt, exp, ds);
+ if (failed(waitcnt))
+ return op.emitOpError("unsupported chipset");
+
+ rewriter.replaceOpWithNewOp<ROCDL::SWaitcntOp>(op, *waitcnt);
+ return success();
+ }
+};
+
struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
LDSBarrierOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
: ConvertOpToLLVMPattern<LDSBarrierOp>(converter), chipset(chipset) {}
@@ -1825,9 +1931,9 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ROCDL::RawPtrBufferAtomicUminOp>,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawPtrBufferAtomicCmpSwap>,
- AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering,
- MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering,
- ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
+ AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
+ SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
+ WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
TransposeLoadOpLowering>(converter, chipset);
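As a sanity check on encodeWaitcnt, the sketch below works the gfx9 arithmetic for the no-operand form by hand (function name illustrative); note that when both load and store are present, the lowering first sums them into vmcnt. The result matches the GFX9 CHECK line in the new conversion test:

  func.func @wait_default_example() {
    // Unspecified counters default to 1024 and are clamped on gfx9 to
    // vmcnt=63, expcnt=7, lgkmcnt=15, then packed as:
    //   lowBits   = 63 & 0xF             = 15
    //   highBits  = (63 >> 4) << 14      = 49152
    //   otherCnts = (7 << 4) | (15 << 8) = 3952
    //   waitcnt   = 15 | 49152 | 3952    = 53119
    amdgpu.memory_counter_wait
    // -convert-amdgpu-to-rocdl=chipset=gfx942 lowers this to
    //   rocdl.s.waitcnt 53119
    func.return
  }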
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
new file mode 100644
index 0000000..1016ee8
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
@@ -0,0 +1,42 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12
+
+// CHECK-LABEL: func @memory_counter_wait
+func.func @memory_counter_wait() {
+ // GFX9: rocdl.s.waitcnt 53119
+ // GFX10: rocdl.s.waitcnt 65407
+ // GFX11: rocdl.s.waitcnt 65527
+ // GFX12-NOT: rocdl.s.wait.loadcnt
+ // GFX12-NOT: rocdl.s.wait.storecnt
+ // GFX12-NOT: rocdl.s.wait.expcnt
+ // GFX12-NOT: rocdl.s.wait.dscnt
+ amdgpu.memory_counter_wait
+
+ // GFX9: rocdl.s.waitcnt 3952
+ // GFX10: rocdl.s.waitcnt 16240
+ // GFX11: rocdl.s.waitcnt 1015
+ // GFX12: rocdl.s.wait.loadcnt 0
+ amdgpu.memory_counter_wait load(0)
+
+ // GFX9: rocdl.s.waitcnt 3952
+ // GFX10: rocdl.s.waitcnt 16240
+ // GFX11: rocdl.s.waitcnt 1015
+ // GFX12: rocdl.s.wait.storecnt 0
+ amdgpu.memory_counter_wait store(0)
+
+ // GFX9: rocdl.s.waitcnt 53007
+ // GFX10: rocdl.s.waitcnt 65295
+ // GFX11: rocdl.s.waitcnt 65520
+ // GFX12: rocdl.s.wait.expcnt 0
+ amdgpu.memory_counter_wait exp(0)
+
+ // GFX9: rocdl.s.waitcnt 49279
+ // GFX10: rocdl.s.waitcnt 49279
+ // GFX11: rocdl.s.waitcnt 64519
+ // GFX12: rocdl.s.wait.dscnt 0
+ amdgpu.memory_counter_wait ds(0)
+
+ return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index fe2b32b..fe78b53 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -548,3 +548,20 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %
amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
func.return
}
+
+// CHECK-LABEL: func @memory_counter_wait
+func.func @memory_counter_wait() {
+ // CHECK: amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
+ // CHECK: amdgpu.memory_counter_wait load(4) store(2) ds(3) exp(1)
+ // CHECK: amdgpu.memory_counter_wait load(1)
+ // CHECK: amdgpu.memory_counter_wait store(2)
+ // CHECK: amdgpu.memory_counter_wait ds(3)
+ // CHECK: amdgpu.memory_counter_wait exp(4)
+ amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
+ amdgpu.memory_counter_wait exp(1) store(2) ds(3) load(4)
+ amdgpu.memory_counter_wait load(1)
+ amdgpu.memory_counter_wait store(2)
+ amdgpu.memory_counter_wait ds(3)
+ amdgpu.memory_counter_wait exp(4)
+ func.return
+}