diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 201 |
1 file changed, 121 insertions, 80 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 484861d..07264d9 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/MemoryModelRelaxationAnnotations.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" @@ -277,6 +278,12 @@ public: /// rmw operation, "std::nullopt" otherwise. std::optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; + + /// \returns DMA to LDS info if \p MI is as a direct-to/from-LDS load/store, + /// along with an indication of whether this is a load or store. If it is not + /// a direct-to-LDS operation, returns std::nullopt. + std::optional<SIMemOpInfo> + getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const; }; class SICacheControl { @@ -360,11 +367,13 @@ public: /// between memory instructions to enforce the order they become visible as /// observed by other memory instructions executing in memory scope \p Scope. /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between - /// address spaces. Returns true iff any instructions inserted. + /// address spaces. If \p AtomicsOnly is true, only insert waits for counters + /// that are used by atomic instructions. + /// Returns true iff any instructions inserted. 
virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const = 0; + AtomicOrdering Order, bool AtomicsOnly) const = 0; /// Inserts any necessary instructions at position \p Pos relative to /// instruction \p MI to ensure any subsequent memory instructions of this @@ -388,12 +397,6 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos) const = 0; - /// Inserts any necessary instructions before the barrier start instruction - /// \p MI in order to support pairing of barriers and fences. - virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { - return false; - }; - /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; }; @@ -437,7 +440,7 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -484,7 +487,7 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -572,14 +575,10 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; - - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; - bool 
insertBarrierStart(MachineBasicBlock::iterator &MI) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, Position Pos) const override; }; class SIGfx11CacheControl : public SIGfx10CacheControl { @@ -629,7 +628,7 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; @@ -701,6 +700,9 @@ private: /// instructions are added/deleted or \p MI is modified, false otherwise. bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); + /// Expands LDS DMA operation \p MI. Returns true if instructions are + /// added/deleted or \p MI is modified, false otherwise. + bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); public: SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {}; @@ -830,6 +832,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { return SIAtomicAddrSpace::SCRATCH; if (AS == AMDGPUAS::REGION_ADDRESS) return SIAtomicAddrSpace::GDS; + if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE || + AS == AMDGPUAS::BUFFER_STRIDED_POINTER) + return SIAtomicAddrSpace::GLOBAL; return SIAtomicAddrSpace::OTHER; } @@ -985,6 +990,16 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( return constructFromMIWithMMO(MI); } +std::optional<SIMemOpInfo> +SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!SIInstrInfo::isLDSDMA(*MI)) + return std::nullopt; + + return constructFromMIWithMMO(MI); +} + SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { 
TII = ST.getInstrInfo(); IV = getIsaVersion(ST.getCPU()); @@ -1097,7 +1112,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1120,7 +1135,8 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -1140,7 +1156,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const { + AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); @@ -1294,7 +1311,8 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, bool IsCrossAddrSpaceOrdering, Position Pos) const { return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); } bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, @@ -1424,7 +1442,7 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. 
The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1447,7 +1465,8 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -1467,8 +1486,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, - Position Pos, - AtomicOrdering Order) const { + Position Pos, AtomicOrdering Order, + bool AtomicsOnly) const { if (ST.isTgSplitEnabled()) { // In threadgroup split mode the waves of a work-group can be executing on // different CUs. Therefore need to wait for global or GDS memory operations @@ -1488,7 +1507,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, AddrSpace &= ~SIAtomicAddrSpace::LDS; } return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, - IsCrossAddrSpaceOrdering, Pos, Order); + IsCrossAddrSpaceOrdering, Pos, Order, + AtomicsOnly); } bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, @@ -1726,7 +1746,7 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. 
- assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1747,7 +1767,8 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -1904,7 +1925,8 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other // S_WAITCNT needed. Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); return Changed; } @@ -1959,7 +1981,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1984,7 +2006,8 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. 
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -2007,7 +2030,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, - Position Pos, AtomicOrdering Order) const { + Position Pos, AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); @@ -2035,8 +2059,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, // the WGP. Therefore need to wait for operations to complete to ensure // they are visible to waves in the other CU as the L0 is per CU. // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. + if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) VMCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2191,22 +2218,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx10CacheControl::insertBarrierStart( - MachineBasicBlock::iterator &MI) const { - // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU - // mode. This is because a CU mode release fence does not emit any wait, which - // is fine when only dealing with vmem, but isn't sufficient in the presence - // of barriers which do not go through vmem. - // GFX12.5 does not require this additional wait. 
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) - return false; - - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); - return true; -} - bool SIGfx11CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { @@ -2255,7 +2266,7 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -2281,7 +2292,8 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -2354,7 +2366,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, - Position Pos, AtomicOrdering Order) const { + Position Pos, AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); @@ -2383,15 +2396,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // In WGP mode the waves of a work-group can be executing on either CU // of the WGP. Therefore need to wait for operations to complete to // ensure they are visible to waves in the other CU as the L0 is per CU. 
+ // // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. // // GFX12.5: // CU$ has two ports. To ensure operations are visible at the workgroup // level, we need to ensure all operations in this port have completed // so the other SIMDs in the WG can see them. There is no ordering // guarantee between the ports. - if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() || + isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2444,7 +2462,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // // This also applies to fences. Fences cannot pair with an instruction // tracked with bvh/samplecnt as we don't have any atomics that do that. - if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { + if (!AtomicsOnly && ST.hasImageInsts()) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); } @@ -2587,7 +2605,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), // we of course need to wait for that as well. 
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); return Changed; } @@ -2597,7 +2616,7 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { // Only handle load and store, not atomic read-modify-write instructions. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -2624,7 +2643,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); } return Changed; @@ -2748,13 +2768,15 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), SIMemOp::LOAD | SIMemOp::STORE, MOI.getIsCrossAddressSpaceOrdering(), - Position::BEFORE, Order); + Position::BEFORE, Order, /*AtomicsOnly=*/false); if (Order == AtomicOrdering::Acquire || Order == AtomicOrdering::SequentiallyConsistent) { - Changed |= CC->insertWait( - MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD, - MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); + // The wait below only needs to wait on the prior atomic. 
+ Changed |= + CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(), + SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(), + Position::AFTER, Order, /*AtomicsOnly=*/true); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); @@ -2830,9 +2852,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); if (Order == AtomicOrdering::Acquire) { - Changed |= CC->insertWait( - MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order); + // Acquire fences only need to wait on the previous atomic they pair with. + Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace, + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE, Order, /*AtomicsOnly=*/true); } if (Order == AtomicOrdering::Release || @@ -2897,10 +2921,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, Order == AtomicOrdering::SequentiallyConsistent || MOI.getFailureOrdering() == AtomicOrdering::Acquire || MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= CC->insertWait( - MI, MOI.getScope(), MOI.getInstrAddrSpace(), - isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); + // Only wait on the previous atomic. + Changed |= + CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(), + isAtomicRet(*MI) ? 
SIMemOp::LOAD : SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, + Order, /*AtomicsOnly=*/true); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); @@ -2913,6 +2939,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } +bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoad() && MI->mayStore()); + + // The volatility or nontemporal-ness of the operation is a + // function of the global memory, not the LDS. + SIMemOp OpKind = + SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE; + + // Handle volatile and/or nontemporal markers on direct-to-LDS loads and + // stores. The operation is treated as a volatile/nontemporal store + // to its second argument. + return CC->enableVolatileAndOrNonTemporal( + MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(), + MOI.isNonTemporal(), MOI.isLastUse()); +} + bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) { const MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); @@ -2956,22 +2999,20 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { MI = II->getIterator(); } - if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { - Changed |= CC->insertBarrierStart(MI); - continue; - } - if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; - if (const auto &MOI = MOA.getLoadInfo(MI)) + if (const auto &MOI = MOA.getLoadInfo(MI)) { Changed |= expandLoad(*MOI, MI); - else if (const auto &MOI = MOA.getStoreInfo(MI)) { + } else if (const auto &MOI = MOA.getStoreInfo(MI)) { Changed |= expandStore(*MOI, MI); - } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) + } else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) { + Changed |= expandLDSDMA(*MOI, MI); + } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) { Changed |= expandAtomicFence(*MOI, MI); - else if (const auto &MOI = 
MOA.getAtomicCmpxchgOrRmwInfo(MI)) + } else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) { Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); + } } } |