Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 47 |
1 file changed, 47 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index f62e808..4069a36 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -312,6 +312,10 @@ public:
                                               SIMemOp Op, bool IsVolatile,
                                               bool IsNonTemporal) const = 0;
 
+  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+    return false;
+  };
+
   /// Inserts any necessary instructions at position \p Pos relative
   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
   /// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -589,6 +593,15 @@ protected:
   bool setScope(const MachineBasicBlock::iterator MI,
                 AMDGPU::CPol::CPol Value) const;
 
+  // Stores with system scope (SCOPE_SYS) need to wait for:
+  // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
+  // - non-returning-atomics - wait for STORECNT==0
+  // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
+  // since it does not distinguish atomics-with-return from regular stores.
+  // There is no need to wait if memory is cached (mtype != UC).
+  bool
+  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
+
 public:
   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
 
@@ -603,6 +616,8 @@ public:
                                        SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                        bool IsVolatile,
                                        bool IsNonTemporal) const override;
+
+  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
 };
 
 class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2194,6 +2209,22 @@ bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
   return false;
 }
 
+bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
+    const MachineBasicBlock::iterator MI) const {
+  // TODO: implement flag for frontend to give us a hint not to insert waits.
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  const DebugLoc &DL = MI->getDebugLoc();
+
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
+
+  return true;
+}
+
 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2364,6 +2395,9 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   if (IsVolatile) {
     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
 
+    if (Op == SIMemOp::STORE)
+      Changed |= insertWaitsBeforeSystemScopeStore(MI);
+
     // Ensure operation has completed at system scope to cause all volatile
     // operations to be visible outside the program in a global order. Do not
     // request cross address space as only the global address space can be
@@ -2381,6 +2415,15 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx12CacheControl::expandSystemScopeStore(
+    MachineBasicBlock::iterator &MI) const {
+  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+  if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
+    return insertWaitsBeforeSystemScopeStore(MI);
+
+  return false;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;
@@ -2467,6 +2510,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
   Changed |= CC->enableVolatileAndOrNonTemporal(
       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
       MOI.isNonTemporal());
+
+  // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
+  // instruction field, do not confuse it with atomic scope.
+  Changed |= CC->expandSystemScopeStore(MI);
   return Changed;
 }
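The essence of the change is the scope test in expandSystemScopeStore() plus the five soft waits emitted in front of the store. The standalone C++ sketch below models that decision so it can be compiled and run outside of LLVM; the CPol bit layout is an illustrative stand-in, not the actual encoding from SIDefines.h, and needsWaitsBeforeStore() / waitsBeforeSystemScopeStore() are hypothetical helpers that only mirror the patch's logic.

#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <string>
#include <vector>

namespace CPol {
// Hypothetical cache-policy immediate layout: a 2-bit scope field.
// Illustrative values only; the real encoding lives in SIDefines.h.
constexpr uint32_t SCOPE     = 0x3u << 3; // mask covering the scope bits
constexpr uint32_t SCOPE_CU  = 0x0u << 3; // compute-unit scope
constexpr uint32_t SCOPE_SE  = 0x1u << 3; // shader-engine scope
constexpr uint32_t SCOPE_DEV = 0x2u << 3; // device scope
constexpr uint32_t SCOPE_SYS = 0x3u << 3; // system scope
} // namespace CPol

// Mirrors expandSystemScopeStore(): extra waits are needed only when the
// store's cpol operand requests system scope.
bool needsWaitsBeforeStore(uint32_t CPolImm) {
  return (CPolImm & CPol::SCOPE) == CPol::SCOPE_SYS;
}

// Stand-ins for the five soft waits the patch emits via BuildMI().
std::vector<std::string> waitsBeforeSystemScopeStore() {
  return {"S_WAIT_LOADCNT_soft 0", "S_WAIT_SAMPLECNT_soft 0",
          "S_WAIT_BVHCNT_soft 0",  "S_WAIT_KMCNT_soft 0",
          "S_WAIT_STORECNT_soft 0"};
}

int main() {
  for (uint32_t Imm : {CPol::SCOPE_DEV, CPol::SCOPE_SYS}) {
    std::printf("cpol=0x%02x -> %s\n", Imm,
                needsWaitsBeforeStore(Imm) ? "insert waits" : "no extra waits");
    if (needsWaitsBeforeStore(Imm))
      for (const std::string &W : waitsBeforeSystemScopeStore())
        std::printf("  %s\n", W.c_str());
  }
  return 0;
}

In the real pass the *_soft waits are placeholders that SIInsertWaitcnts can later strengthen or drop; per the TODO in the patch, the STORECNT wait is the one it cannot always remove, since that pass does not distinguish atomics-with-return from ordinary stores.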