aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
diff options
context:
space:
mode:
authorPierre van Houtryve <pierre.vanhoutryve@amd.com>2025-09-10 10:18:11 +0200
committerGitHub <noreply@github.com>2025-09-10 10:18:11 +0200
commitbed9be954d5a8e4166629e489052c96e8cb24f99 (patch)
tree30243a6d72a6506559aef761895adc0bbab8a76a /llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
parent4d9a7fa9bad0d3f7eba801b784d4caa896d224d7 (diff)
downloadllvm-bed9be954d5a8e4166629e489052c96e8cb24f99.zip
llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.tar.gz
llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.tar.bz2
[AMDGPU][gfx1250] Implement SIMemoryLegalizer (#154726)
Implements the base of the MemoryLegalizer for a roughly correct GFX1250 memory model. Documentation will come later, and some remaining changes still have to be added, but this is the backbone of the model.
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp75
1 file changed, 56 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6ab4eb4..c20fcac 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -606,7 +606,11 @@ protected:
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
public:
- SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+ SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
+ // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
+ // the behavior is the same if assuming GFX12.0 in CU mode.
+ assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
+ }
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2198,7 +2202,8 @@ bool SIGfx10CacheControl::insertBarrierStart(
// mode. This is because a CU mode release fence does not emit any wait, which
// is fine when only dealing with vmem, but isn't sufficient in the presence
// of barriers which do not go through vmem.
- if (!ST.isCuModeEnabled())
+ // GFX12.5 does not require this additional wait.
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
return false;
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
@@ -2378,12 +2383,16 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
STORECnt |= true;
break;
case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore need to wait for operations to complete to ensure
- // they are visible to waves in the other CU as the L0 is per CU.
- // Otherwise in CU mode and all waves of a work-group are on the same CU
- // which shares the same L0.
- if (!ST.isCuModeEnabled()) {
+ // GFX12.0:
+ // In WGP mode the waves of a work-group can be executing on either CU
+ // of the WGP. Therefore need to wait for operations to complete to
+ // ensure they are visible to waves in the other CU as the L0 is per CU.
+ // Otherwise in CU mode and all waves of a work-group are on the same CU
+ // which shares the same L0.
+ //
+ // GFX12.5:
+ // TODO DOCS
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2435,7 +2444,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
//
// This also applies to fences. Fences cannot pair with an instruction
// tracked with bvh/samplecnt as we don't have any atomics that do that.
- if (Order != AtomicOrdering::Acquire) {
+ if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
}
@@ -2487,10 +2496,14 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
break;
case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore we need to invalidate the L0 which is per CU.
- // Otherwise in CU mode all waves of a work-group are on the same CU, and so
- // the L0 does not need to be invalidated.
+ // GFX12.0:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore we need to invalidate the L0 which is per CU.
+ // Otherwise in CU mode all waves of a work-group are on the same CU, and
+ // so the L0 does not need to be invalidated.
+ //
+ // GFX12.5
+ // TODO DOCS
if (ST.isCuModeEnabled())
return false;
@@ -2535,7 +2548,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- // global_wb is only necessary at system scope for gfx120x targets.
+ // global_wb is only necessary at system scope for GFX12.0,
+ // they're also necessary at device scope for GFX12.5.
//
// Emitting it for lower scopes is a slow no-op, so we omit it
// for performance.
@@ -2545,6 +2559,12 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
.addImm(AMDGPU::CPol::SCOPE_SYS);
break;
case SIAtomicScope::AGENT:
+ // TODO DOCS
+ if (ST.hasGFX1250Insts()) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
+ .addImm(AMDGPU::CPol::SCOPE_DEV);
+ }
+ break;
case SIAtomicScope::WORKGROUP:
// No WB necessary, but we still have to wait.
break;
@@ -2607,17 +2627,32 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
}
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
- MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
- if (!CPol)
- return false;
+ assert(MI.mayStore() && "Not a Store inst");
+ const bool IsRMW = (MI.mayLoad() && MI.mayStore());
+ bool Changed = false;
+
+ // GFX12.5 only: xcnt wait is needed before flat and global atomics
+ // stores/rmw.
+ if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
+ Changed = true;
+ }
+
+ // Remaining fixes do not apply to RMWs.
+ if (IsRMW)
+ return Changed;
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ if (!CPol) // Some vmem operations do not have a scope and are not concerned.
+ return Changed;
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
// GFX12.0 only: Extra waits needed before system scope stores.
if (!ST.hasGFX1250Insts()) {
if (!Atomic && Scope == CPol::SCOPE_SYS)
return insertWaitsBeforeSystemScopeStore(MI);
- return false;
+ return Changed;
}
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
@@ -2627,7 +2662,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
(!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
return setScope(MI, CPol::SCOPE_SE);
- return false;
+ return Changed;
}
bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
@@ -2839,6 +2874,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
+ MachineInstr &RMWMI = *MI;
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
@@ -2873,6 +2909,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
Position::AFTER);
}
+ Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
return Changed;
}