diff options
author | Pierre van Houtryve <pierre.vanhoutryve@amd.com> | 2025-09-10 10:18:11 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-09-10 10:18:11 +0200 |
commit | bed9be954d5a8e4166629e489052c96e8cb24f99 (patch) | |
tree | 30243a6d72a6506559aef761895adc0bbab8a76a /llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | |
parent | 4d9a7fa9bad0d3f7eba801b784d4caa896d224d7 (diff) | |
download | llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.zip llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.tar.gz llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.tar.bz2 |
[AMDGPU][gfx1250] Implement SIMemoryLegalizer (#154726)
Implements the base of the MemoryLegalizer for a roughly correct GFX1250 memory model.
Documentation will come later, and some remaining changes still have to be added, but this is the backbone of the model.
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 75 |
1 files changed, 56 insertions, 19 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 6ab4eb4..c20fcac 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -606,7 +606,11 @@ protected: SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} + SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { + // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases + // the behavior is the same if assuming GFX12.0 in CU mode. + assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); + } bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, @@ -2198,7 +2202,8 @@ bool SIGfx10CacheControl::insertBarrierStart( // mode. This is because a CU mode release fence does not emit any wait, which // is fine when only dealing with vmem, but isn't sufficient in the presence // of barriers which do not go through vmem. - if (!ST.isCuModeEnabled()) + // GFX12.5 does not require this additional wait. + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) return false; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), @@ -2378,12 +2383,16 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, STORECnt |= true; break; case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore need to wait for operations to complete to ensure - // they are visible to waves in the other CU as the L0 is per CU. - // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // GFX12.0: + // In WGP mode the waves of a work-group can be executing on either CU + // of the WGP. Therefore need to wait for operations to complete to + // ensure they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode and all waves of a work-group are on the same CU + // which shares the same L0. + // + // GFX12.5: + // TODO DOCS + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2435,7 +2444,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // // This also applies to fences. Fences cannot pair with an instruction // tracked with bvh/samplecnt as we don't have any atomics that do that. - if (Order != AtomicOrdering::Acquire) { + if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); } @@ -2487,10 +2496,14 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, ScopeImm = AMDGPU::CPol::SCOPE_DEV; break; case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore we need to invalidate the L0 which is per CU. - // Otherwise in CU mode all waves of a work-group are on the same CU, and so - // the L0 does not need to be invalidated. + // GFX12.0: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore we need to invalidate the L0 which is per CU. + // Otherwise in CU mode all waves of a work-group are on the same CU, and + // so the L0 does not need to be invalidated. + // + // GFX12.5 + // TODO DOCS if (ST.isCuModeEnabled()) return false; @@ -2535,7 +2548,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - // global_wb is only necessary at system scope for gfx120x targets. + // global_wb is only necessary at system scope for GFX12.0, + // they're also necessary at device scope for GFX12.5. // // Emitting it for lower scopes is a slow no-op, so we omit it // for performance. @@ -2545,6 +2559,12 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, .addImm(AMDGPU::CPol::SCOPE_SYS); break; case SIAtomicScope::AGENT: + // TODO DOCS + if (ST.hasGFX1250Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) + .addImm(AMDGPU::CPol::SCOPE_DEV); + } + break; case SIAtomicScope::WORKGROUP: // No WB necessary, but we still have to wait. break; @@ -2607,17 +2627,32 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( } bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { - MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); - if (!CPol) - return false; + assert(MI.mayStore() && "Not a Store inst"); + const bool IsRMW = (MI.mayLoad() && MI.mayStore()); + bool Changed = false; + + // GFX12.5 only: xcnt wait is needed before flat and global atomics + // stores/rmw. + if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { + MachineBasicBlock &MBB = *MI.getParent(); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); + Changed = true; + } + + // Remaining fixes do not apply to RMWs. + if (IsRMW) + return Changed; + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) // Some vmem operations do not have a scope and are not concerned. + return Changed; const unsigned Scope = CPol->getImm() & CPol::SCOPE; // GFX12.0 only: Extra waits needed before system scope stores. if (!ST.hasGFX1250Insts()) { if (!Atomic && Scope == CPol::SCOPE_SYS) return insertWaitsBeforeSystemScopeStore(MI); - return false; + return Changed; } // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address @@ -2627,7 +2662,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) return setScope(MI, CPol::SCOPE_SE); - return false; + return Changed; } bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const { @@ -2839,6 +2874,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; + MachineInstr &RMWMI = *MI; if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); @@ -2873,6 +2909,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, Position::AFTER); } + Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true); return Changed; } |