author | Petar Avramovic <Petar.Avramovic@amd.com> | 2024-02-28 16:18:04 +0100
committer | GitHub <noreply@github.com> | 2024-02-28 16:18:04 +0100
commit | 3e35ba53e20dbbd3ccc191d71ed75d52dc36ec59 (patch)
tree | 0ca422fe5ab27352eddc16be3b5e19b17f59a44c /llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
parent | cb6c0f1d28c0d1915d1ca9a198254e3828af2384 (diff)
AMDGPU/GFX12: Insert waitcnts before stores with scope_sys (#82996)
Insert waitcnts for loads and atomics before stores with system scope.
Scope is a field in the instruction encoding that corresponds to the
desired coherence level in the cache hierarchy.
Intrinsic stores can set the scope in the cache-policy operand.
If the volatile keyword is used on a generic store, the memory legalizer
sets the scope to system; by default, generic stores get the lowest
scope level.
Waitcnts are not required if it is guaranteed that the memory is cached;
Vulkan shaders, for example, can guarantee this.
TODO: implement a flag for frontends to give us a hint not to insert
waits. The Vulkan flag is expected to be implemented as a vulkan:private
MMRA.
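
For illustration, here is a minimal standalone sketch of the scope check this patch keys on. The `CPol::SCOPE` mask and `CPol::SCOPE_SYS` value mirror the names used in the patch, but the bit positions are assumptions made for the sketch, and `needsWaitsBeforeStore` is a hypothetical helper, not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

// Assumed layout for the sketch: the scope occupies two bits of the
// cache-policy (cpol) immediate; SCOPE_SYS is the highest value.
namespace CPol {
constexpr uint32_t SCOPE = 0x3u << 3;
constexpr uint32_t SCOPE_SYS = 0x3u << 3;
} // namespace CPol

// Hypothetical helper mirroring the check in expandSystemScopeStore():
// the extra waits are needed only when the store's cpol operand selects
// system scope.
bool needsWaitsBeforeStore(uint32_t CPolImm) {
  return (CPolImm & CPol::SCOPE) == CPol::SCOPE_SYS;
}

int main() {
  assert(needsWaitsBeforeStore(CPol::SCOPE_SYS));
  assert(!needsWaitsBeforeStore(0)); // default/lowest scope: no waits
  return 0;
}
```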
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 47
1 file changed, 47 insertions, 0 deletions
```diff
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index f62e808..4069a36 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -312,6 +312,10 @@ public:
                                               SIMemOp Op, bool IsVolatile,
                                               bool IsNonTemporal) const = 0;
 
+  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+    return false;
+  };
+
   /// Inserts any necessary instructions at position \p Pos relative
   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
   /// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -589,6 +593,15 @@ protected:
   bool setScope(const MachineBasicBlock::iterator MI,
                 AMDGPU::CPol::CPol Value) const;
 
+  // Stores with system scope (SCOPE_SYS) need to wait for:
+  // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
+  // - non-returning-atomics - wait for STORECNT==0
+  //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
+  //   since it does not distinguish atomics-with-return from regular stores.
+  // There is no need to wait if memory is cached (mtype != UC).
+  bool
+  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
+
 public:
   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
 
@@ -603,6 +616,8 @@ public:
                                        SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                        bool IsVolatile,
                                        bool IsNonTemporal) const override;
+
+  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
 };
 
 class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2194,6 +2209,22 @@ bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
   return false;
 }
 
+bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
+    const MachineBasicBlock::iterator MI) const {
+  // TODO: implement flag for frontend to give us a hint not to insert waits.
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  const DebugLoc &DL = MI->getDebugLoc();
+
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
+
+  return true;
+}
+
 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2364,6 +2395,9 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   if (IsVolatile) {
     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
 
+    if (Op == SIMemOp::STORE)
+      Changed |= insertWaitsBeforeSystemScopeStore(MI);
+
     // Ensure operation has completed at system scope to cause all volatile
     // operations to be visible outside the program in a global order. Do not
     // request cross address space as only the global address space can be
@@ -2381,6 +2415,15 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx12CacheControl::expandSystemScopeStore(
+    MachineBasicBlock::iterator &MI) const {
+  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+  if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
+    return insertWaitsBeforeSystemScopeStore(MI);
+
+  return false;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;
@@ -2467,6 +2510,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
     Changed |= CC->enableVolatileAndOrNonTemporal(
         MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
         MOI.isNonTemporal());
+
+    // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
+    // instruction field, do not confuse it with atomic scope.
+    Changed |= CC->expandSystemScopeStore(MI);
     return Changed;
   }
```
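
As a design note, the new `expandSystemScopeStore` hook gets a no-op default on the base cache-control class, so pre-GFX12 subtargets are untouched and only `SIGfx12CacheControl` overrides it. Below is a minimal self-contained sketch of that shape, with simplified hypothetical names rather than the real LLVM classes:

```cpp
#include <iostream>

// Base interface: older subtargets need no expansion, so the default
// implementation is a no-op that reports "nothing changed".
struct CacheControl {
  virtual ~CacheControl() = default;
  virtual bool expandSystemScopeStore() const { return false; }
};

// GFX12-style subclass: performs the expansion and reports a change.
struct Gfx12CacheControl : CacheControl {
  bool expandSystemScopeStore() const override {
    std::cout << "inserting s_wait_*_soft sequence before store\n";
    return true;
  }
};

int main() {
  Gfx12CacheControl CC;
  bool Changed = false;
  // Mirrors how SIMemoryLegalizer::expandStore accumulates whether any
  // cache-control hook modified the instruction stream.
  Changed |= CC.expandSystemScopeStore();
  std::cout << (Changed ? "changed\n" : "unchanged\n");
}
```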