path: root/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r--   llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp   180
1 file changed, 180 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6d749ad..84b9330 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -579,11 +579,30 @@ public:
};
class SIGfx12CacheControl : public SIGfx11CacheControl {
+protected:
+  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
+  // \returns true if \p MI is modified, false otherwise.
+  bool setTH(const MachineBasicBlock::iterator MI,
+             AMDGPU::CPol::CPol Value) const;
+  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
+  // MI. \returns true if \p MI is modified, false otherwise.
+  bool setScope(const MachineBasicBlock::iterator MI,
+                AMDGPU::CPol::CPol Value) const;
+
public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+                  bool IsCrossAddrSpaceOrdering, Position Pos) const override;
+
  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+
+  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+                                      bool IsVolatile,
+                                      bool IsNonTemporal) const override;
};
class SIMemoryLegalizer final : public MachineFunctionPass {
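Note: the setTH/setScope helpers declared above only rewrite a single field of the cpol immediate. The following is a minimal standalone C++ sketch of that mask-and-merge pattern; the mask values and the setField helper are illustrative placeholders, not the LLVM API or the real AMDGPU::CPol encodings.

// Standalone sketch of the mask-and-merge update performed by setTH/setScope:
// clear the selected field of the cpol immediate, then OR in the new value.
// TH_MASK/SCOPE_MASK are placeholders, not the real AMDGPU::CPol encodings.
#include <cstdint>
#include <cstdio>

constexpr uint64_t TH_MASK = 0x7;     // placeholder for AMDGPU::CPol::TH
constexpr uint64_t SCOPE_MASK = 0x18; // placeholder for AMDGPU::CPol::SCOPE

// Returns Imm with only the bits selected by FieldMask replaced by Value,
// mirroring CPol->setImm((Imm & ~Mask) | (Value & Mask)) in the patch.
uint64_t setField(uint64_t Imm, uint64_t FieldMask, uint64_t Value) {
  return (Imm & ~FieldMask) | (Value & FieldMask);
}

int main() {
  uint64_t CPol = 0x08;                    // pre-existing policy bits
  CPol = setField(CPol, TH_MASK, 0x3);     // analogous to setTH(MI, TH_NT)
  CPol = setField(CPol, SCOPE_MASK, 0x18); // analogous to setScope(MI, SCOPE_SYS)
  std::printf("cpol = 0x%llx\n", static_cast<unsigned long long>(CPol));
}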
@@ -2142,6 +2161,132 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
  return Changed;
}
+bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
+                                AMDGPU::CPol::CPol Value) const {
+  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+  if (!CPol)
+    return false;
+
+  uint64_t NewTH = Value & AMDGPU::CPol::TH;
+  if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
+    CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
+    return true;
+  }
+
+  return false;
+}
+
+bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
+                                   AMDGPU::CPol::CPol Value) const {
+  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+  if (!CPol)
+    return false;
+
+  uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
+  if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
+    CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
+    return true;
+  }
+
+  return false;
+}
+
+bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+                                     SIAtomicScope Scope,
+                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+                                     bool IsCrossAddrSpaceOrdering,
+                                     Position Pos) const {
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  bool LOADCnt = false;
+  bool DSCnt = false;
+  bool STORECnt = false;
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
+      SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+        LOADCnt |= true;
+      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+        STORECnt |= true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU
+      // of the WGP. Therefore we need to wait for operations to complete to
+      // ensure they are visible to waves in the other CU, as the L0 is per
+      // CU. Otherwise, in CU mode, all waves of a work-group are on the same
+      // CU, which shares the same L0.
+      if (!ST.isCuModeEnabled()) {
+        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+          LOADCnt |= true;
+        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+          STORECnt |= true;
+      }
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The L0 cache keeps all memory operations in order for
+      // work-items in the same wavefront.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+    case SIAtomicScope::WORKGROUP:
+      // If there is no cross address space ordering then an "S_WAIT_DSCNT 0"
+      // is not needed, as LDS operations for all waves are executed in a
+      // total global ordering as observed by all waves. It is required if
+      // also synchronizing with global/GDS memory, as LDS operations could
+      // be reordered with respect to later global/GDS memory operations of
+      // the same wave.
+      DSCnt |= IsCrossAddrSpaceOrdering;
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The LDS keeps all memory operations in order for
+      // the same wavefront.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if (LOADCnt) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
+    Changed = true;
+  }
+
+  if (STORECnt) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
+    Changed = true;
+  }
+
+  if (DSCnt) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
+    Changed = true;
+  }
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return Changed;
+}
+
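Note: the insertWait implementation above reduces to a small decision table: which counters must be waited on for a given scope, address space and memory-op kind. Below is a standalone sketch of that selection logic, using illustrative enum and struct names rather than the LLVM types.

// Standalone sketch of the GFX12 counter selection: global/scratch accesses
// need LOADcnt/STOREcnt waits at agent/system scope (and at workgroup scope
// only in WGP mode), while LDS accesses need a DScnt wait only when ordering
// across address spaces.
#include <cstdio>

enum class Scope { SingleThread, Wavefront, Workgroup, Agent, System };

struct Waits {
  bool LoadCnt = false;
  bool StoreCnt = false;
  bool DsCnt = false;
};

Waits selectWaits(Scope S, bool GlobalOrScratch, bool LDS, bool IsLoad,
                  bool IsStore, bool CuMode, bool CrossAddrSpaceOrdering) {
  Waits W;
  if (GlobalOrScratch) {
    // In WGP mode the L0 is per CU, so workgroup scope also needs the waits.
    bool NeedsWait = S == Scope::System || S == Scope::Agent ||
                     (S == Scope::Workgroup && !CuMode);
    if (NeedsWait) {
      W.LoadCnt = IsLoad;
      W.StoreCnt = IsStore;
    }
  }
  if (LDS && S >= Scope::Workgroup)
    W.DsCnt = CrossAddrSpaceOrdering; // only needed across address spaces
  return W;
}

int main() {
  Waits W = selectWaits(Scope::Agent, /*GlobalOrScratch=*/true, /*LDS=*/false,
                        /*IsLoad=*/true, /*IsStore=*/false, /*CuMode=*/false,
                        /*CrossAddrSpaceOrdering=*/false);
  std::printf("loadcnt=%d storecnt=%d dscnt=%d\n", W.LoadCnt, W.StoreCnt,
              W.DsCnt);
}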
bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
@@ -2198,6 +2343,41 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
  return true;
}
+bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
+    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+    bool IsVolatile, bool IsNonTemporal) const {
+
+  // Only handle load and store, not atomic read-modify-write instructions.
+  assert(MI->mayLoad() ^ MI->mayStore());
+
+  // Only update load and store, not LLVM IR atomic read-modify-write
+  // instructions. The latter are always marked as volatile, so they cannot be
+  // handled sensibly here without pessimizing all atomics. They also do not
+  // support the nontemporal attribute.
+  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+  bool Changed = false;
+
+  if (IsVolatile) {
+    Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
+
+    // Ensure the operation has completed at system scope to cause all
+    // volatile operations to be visible outside the program in a global
+    // order. Do not request cross address space ordering, as only the global
+    // address space can be observable outside the program, so there is no
+    // need to cause a waitcnt for LDS address space operations.
+    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+                          Position::AFTER);
+  }
+
+  if (IsNonTemporal) {
+    // Set non-temporal hint for all cache levels.
+    Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
+  }
+
+  return Changed;
+}
+
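Note: the enableVolatileAndOrNonTemporal hook above applies two independent policies: volatile accesses are promoted to system scope and followed by a wait, and nontemporal accesses receive the TH_NT hint. A standalone sketch of that classification, using illustrative names rather than the LLVM API:

// Standalone sketch of the GFX12 volatile/nontemporal policy choice. The
// struct fields only record which of the hooks in the patch would fire.
#include <cstdio>

struct Gfx12AccessPolicy {
  bool ScopeSys = false;        // corresponds to setScope(MI, SCOPE_SYS)
  bool WaitAfter = false;       // corresponds to insertWait(..., Position::AFTER)
  bool NonTemporalHint = false; // corresponds to setTH(MI, TH_NT)
};

Gfx12AccessPolicy classify(bool IsVolatile, bool IsNonTemporal) {
  Gfx12AccessPolicy P;
  if (IsVolatile) {
    P.ScopeSys = true;
    P.WaitAfter = true; // make the access visible outside the program
  }
  if (IsNonTemporal)
    P.NonTemporalHint = true; // non-temporal hint for all cache levels
  return P;
}

int main() {
  Gfx12AccessPolicy P = classify(/*IsVolatile=*/true, /*IsNonTemporal=*/false);
  std::printf("scope_sys=%d wait_after=%d th_nt=%d\n", P.ScopeSys, P.WaitAfter,
              P.NonTemporalHint);
}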
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;