diff options
author | Jay Foad <jay.foad@amd.com> | 2024-01-18 10:47:45 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-18 10:47:45 +0000 |
commit | ba52f06f9d92c7ca04b440f618f8d352ea121fcc (patch) | |
tree | 3c93f0f8895d5e47c3f4d563b1add318659f5ca2 /llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | |
parent | 9ca36932b5350a9d8d7ddf6c26ff8c1a81467430 (diff) | |
download | llvm-ba52f06f9d92c7ca04b440f618f8d352ea121fcc.zip llvm-ba52f06f9d92c7ca04b440f618f8d352ea121fcc.tar.gz llvm-ba52f06f9d92c7ca04b440f618f8d352ea121fcc.tar.bz2 |
[AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (#77438)
Update SIMemoryLegalizer and SIInsertWaitcnts to use separate wait
instructions per counter (e.g. S_WAIT_LOADCNT) and split VMCNT into
separate LOADCNT, SAMPLECNT and BVHCNT counters.
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 180 |
1 file changed, 180 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 6d749ad..84b9330 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -579,11 +579,30 @@ public: }; class SIGfx12CacheControl : public SIGfx11CacheControl { +protected: + // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. + // \returns Returns true if \p MI is modified, false otherwise. + bool setTH(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const; + // Sets Scope policy to \p Value if CPol operand is present in instruction \p + // MI. \returns Returns true if \p MI is modified, false otherwise. + bool setScope(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const; + public: SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} + bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsCrossAddrSpaceOrdering, Position Pos) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; }; class SIMemoryLegalizer final : public MachineFunctionPass { @@ -2142,6 +2161,132 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( return Changed; } +bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const { + MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); + if (!CPol) + return false; + + uint64_t NewTH = Value & AMDGPU::CPol::TH; + if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { + CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); + return true; + } + + return false; +} + +bool SIGfx12CacheControl::setScope(const 
MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const { + MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); + if (!CPol) + return false; + + uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; + if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { + CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); + return true; + } + + return false; +} + +bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + bool LOADCnt = false; + bool DSCnt = false; + bool STORECnt = false; + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != + SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + LOADCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + STORECnt |= true; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to wait for operations to complete to ensure + // they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode and all waves of a work-group are on the same CU + // which shares the same L0. + if (!ST.isCuModeEnabled()) { + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + LOADCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + STORECnt |= true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The L0 cache keeps all memory operations in order for + // work-items in the same wavefront. 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is + // not needed as LDS operations for all waves are executed in a total + // global ordering as observed by all waves. Required if also + // synchronizing with global/GDS memory as LDS operations could be + // reordered with respect to later global/GDS memory operations of the + // same wave. + DSCnt |= IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The LDS keeps all memory operations in order for + // the same wavefront. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (LOADCnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); + Changed = true; + } + + if (STORECnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); + Changed = true; + } + + if (DSCnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); + Changed = true; + } + + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, @@ -2198,6 +2343,41 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return true; } +bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( + MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, bool IsNonTemporal) const { + + // Only handle load and store, not atomic read-modify-write instructions. 
+ assert(MI->mayLoad() ^ MI->mayStore()); + + // Only update load and store, not LLVM IR atomic read-modify-write + // instructions. The latter are always marked as volatile so cannot sensibly + // handle it as do not want to pessimize all atomics. Also they do not support + // the nontemporal attribute. + assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); + + bool Changed = false; + + if (IsVolatile) { + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); + + // Ensure operation has completed at system scope to cause all volatile + // operations to be visible outside the program in a global order. Do not + // request cross address space as only the global address space can be + // observable outside the program, so no need to cause a waitcnt for LDS + // address space operations. + Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, + Position::AFTER); + } + + if (IsNonTemporal) { + // Set non-temporal hint for all cache levels. + Changed |= setTH(MI, AMDGPU::CPol::TH_NT); + } + + return Changed; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; |