Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp      |  73
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h        |  21
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp |  16
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp       |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt                |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h                   |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp            |  13
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp               | 140
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h                 |   5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td             |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp         | 114
11 files changed, 269 insertions, 125 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
new file mode 100644
index 0000000..30a1f05
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -0,0 +1,73 @@
+//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to add latency to
+/// barrier edges between ATOMIC_FENCE instructions and preceding
+/// memory accesses potentially affected by the fence.
+/// This encourages the scheduling of more instructions before
+/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
+/// introduce wait counting or indicate an impending S_BARRIER
+/// wait. Having more instructions in-flight across these
+/// constructs improves latency hiding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUBarrierLatency.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+using namespace llvm;
+
+namespace {
+
+class BarrierLatency : public ScheduleDAGMutation {
+public:
+  BarrierLatency() = default;
+  void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
+  constexpr unsigned SyntheticLatency = 2000;
+  for (SUnit &SU : DAG->SUnits) {
+    const MachineInstr *MI = SU.getInstr();
+    if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
+      continue;
+
+    // Update latency on barrier edges of ATOMIC_FENCE.
+    // We don't consider the scope of the fence or type of instruction
+    // involved in the barrier edge.
+    for (SDep &PredDep : SU.Preds) {
+      if (!PredDep.isBarrier())
+        continue;
+      SUnit *PredSU = PredDep.getSUnit();
+      MachineInstr *MI = PredSU->getInstr();
+      // Only consider memory loads
+      if (!MI->mayLoad() || MI->mayStore())
+        continue;
+      SDep ForwardD = PredDep;
+      ForwardD.setSUnit(&SU);
+      for (SDep &SuccDep : PredSU->Succs) {
+        if (SuccDep == ForwardD) {
+          SuccDep.setLatency(SuccDep.getLatency() + SyntheticLatency);
+          break;
+        }
+      }
+      PredDep.setLatency(PredDep.getLatency() + SyntheticLatency);
+      PredSU->setDepthDirty();
+      SU.setDepthDirty();
+    }
+  }
+}
+
+} // end namespace
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAMDGPUBarrierLatencyDAGMutation() {
+  return std::make_unique<BarrierLatency>();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
new file mode 100644
index 0000000..c23f0b9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
@@ -0,0 +1,21 @@
+//===- AMDGPUBarrierLatency.h - AMDGPU Export Clustering --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 12915c73..97c2c9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3446,10 +3446,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
                  : 0); // swz
 
   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+  // Don't set the offset value here because the pointer points to the base of
+  // the buffer.
   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
-  LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
+
   MachinePointerInfo StorePtrI = LoadPtrI;
-  StorePtrI.V = nullptr;
+  LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
+                                                 AMDGPUAS::BUFFER_RESOURCE));
+  LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
   auto F = LoadMMO->getFlags() &
@@ -3627,13 +3631,17 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
   if (isSGPR(Addr))
     MIB.addReg(VOffset);
 
-  MIB.add(MI.getOperand(4)) // offset
-      .add(MI.getOperand(5)); // cpol
+  MIB.add(MI.getOperand(4)); // offset
+
+  unsigned Aux = MI.getOperand(5).getImm();
+  MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
 
   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
   LoadPtrI.Offset = MI.getOperand(4).getImm();
   MachinePointerInfo StorePtrI = LoadPtrI;
+  LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
+                                                 AMDGPUAS::GLOBAL_ADDRESS));
   LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
   auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4958a20..996b55f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -17,6 +17,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "AMDGPU.h"
 #include "AMDGPUAliasAnalysis.h"
+#include "AMDGPUBarrierLatency.h"
 #include "AMDGPUCtorDtorLowering.h"
 #include "AMDGPUExportClustering.h"
 #include "AMDGPUExportKernelRuntimeHandles.h"
@@ -639,6 +640,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
   return DAG;
 }
 
@@ -659,6 +661,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
   return DAG;
 }
 
@@ -1197,6 +1200,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
       EnableVOPD)
     DAG->addMutation(createVOPDPairingMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
   return DAG;
 }
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 13f727b68..a1e0e52 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -52,6 +52,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAsmPrinter.cpp
   AMDGPUAtomicOptimizer.cpp
   AMDGPUAttributor.cpp
+  AMDGPUBarrierLatency.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index ecc2824..b7a92a0 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -423,6 +423,9 @@ enum CPol {
   // Volatile (used to preserve/signal operation volatility for buffer
   // operations not a real instruction bit)
   VOLATILE = 1 << 31,
+  // The set of "cache policy" bits used for compiler features that
+  // do not correspond to handware features.
+  VIRTUAL_BITS = VOLATILE,
 };
 
 } // namespace CPol
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a2841c11..a757421 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1651,6 +1651,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
     Info.ptrVal = CI.getArgOperand(1);
     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+    auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
+    if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+      Info.flags |= MachineMemOperand::MOVolatile;
     return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
@@ -11219,8 +11222,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 
     MachinePointerInfo StorePtrI = LoadPtrI;
     LoadPtrI.V = PoisonValue::get(
-        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
-    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+        PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
+    LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
     auto F = LoadMMO->getFlags() &
@@ -11307,7 +11310,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     }
 
     Ops.push_back(Op.getOperand(5)); // Offset
-    Ops.push_back(Op.getOperand(6)); // CPol
+
+    unsigned Aux = Op.getConstantOperandVal(6);
+    Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
+                                        MVT::i32)); // CPol
+
     Ops.push_back(M0Val.getValue(0)); // Chain
     Ops.push_back(M0Val.getValue(1)); // Glue
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 50447f4..2ff2d2f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4032,28 +4032,31 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
   }
 }
 
+/// Helper struct for the implementation of 3-address conversion to communicate
+/// updates made to instruction operands.
+struct SIInstrInfo::ThreeAddressUpdates {
+  /// Other instruction whose def is no longer used by the converted
+  /// instruction.
+  MachineInstr *RemoveMIUse = nullptr;
+};
+
 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
                                                  LiveVariables *LV,
                                                  LiveIntervals *LIS) const {
   MachineBasicBlock &MBB = *MI.getParent();
-  unsigned Opc = MI.getOpcode();
+  ThreeAddressUpdates U;
+  MachineInstr *NewMI = convertToThreeAddressImpl(MI, U);
 
-  // Handle MFMA.
-  int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
-  if (NewMFMAOpc != -1) {
-    MachineInstrBuilder MIB =
-        BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
-      MIB.add(MI.getOperand(I));
-    updateLiveVariables(LV, MI, *MIB);
+  if (NewMI) {
+    updateLiveVariables(LV, MI, *NewMI);
     if (LIS) {
-      LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+      LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
       // SlotIndex of defs needs to be updated when converting to early-clobber
-      MachineOperand &Def = MIB->getOperand(0);
+      MachineOperand &Def = NewMI->getOperand(0);
       if (Def.isEarlyClobber() && Def.isReg() &&
           LIS->hasInterval(Def.getReg())) {
-        SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
-        SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
+        SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
+        SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
         auto &LI = LIS->getInterval(Def.getReg());
         auto UpdateDefIndex = [&](LiveRange &LR) {
           auto *S = LR.find(OldIndex);
@@ -4068,6 +4071,58 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
           UpdateDefIndex(SR);
       }
     }
+  }
+
+  if (U.RemoveMIUse) {
+    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    // The only user is the instruction which will be killed.
+    Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
+
+    if (MRI.hasOneNonDBGUse(DefReg)) {
+      // We cannot just remove the DefMI here, calling pass will crash.
+      U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
+      U.RemoveMIUse->getOperand(0).setIsDead(true);
+      for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
+        U.RemoveMIUse->removeOperand(I);
+      if (LV)
+        LV->getVarInfo(DefReg).AliveBlocks.clear();
+    }
+
+    if (LIS) {
+      LiveInterval &DefLI = LIS->getInterval(DefReg);
+
+      // We cannot delete the original instruction here, so hack out the use
+      // in the original instruction with a dummy register so we can use
+      // shrinkToUses to deal with any multi-use edge cases. Other targets do
+      // not have the complexity of deleting a use to consider here.
+      Register DummyReg = MRI.cloneVirtualRegister(DefReg);
+      for (MachineOperand &MIOp : MI.uses()) {
+        if (MIOp.isReg() && MIOp.getReg() == DefReg) {
+          MIOp.setIsUndef(true);
+          MIOp.setReg(DummyReg);
+        }
+      }
+
+      LIS->shrinkToUses(&DefLI);
+    }
+  }
+
+  return NewMI;
+}
+
+MachineInstr *
+SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
+                                       ThreeAddressUpdates &U) const {
+  MachineBasicBlock &MBB = *MI.getParent();
+  unsigned Opc = MI.getOpcode();
+
+  // Handle MFMA.
+  int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
+  if (NewMFMAOpc != -1) {
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
+    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+      MIB.add(MI.getOperand(I));
     return MIB;
   }
 
@@ -4077,11 +4132,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
         .setMIFlags(MI.getFlags());
     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
       MIB->addOperand(MI.getOperand(I));
-
-    updateLiveVariables(LV, MI, *MIB);
-    if (LIS)
-      LIS->ReplaceMachineInstrInMaps(MI, *MIB);
-
     return MIB;
   }
 
@@ -4152,39 +4202,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
       (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
       !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
    MachineInstr *DefMI;
-    const auto killDef = [&]() -> void {
-      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-      // The only user is the instruction which will be killed.
-      Register DefReg = DefMI->getOperand(0).getReg();
-
-      if (MRI.hasOneNonDBGUse(DefReg)) {
-        // We cannot just remove the DefMI here, calling pass will crash.
-        DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
-        DefMI->getOperand(0).setIsDead(true);
-        for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
-          DefMI->removeOperand(I);
-        if (LV)
-          LV->getVarInfo(DefReg).AliveBlocks.clear();
-      }
-
-      if (LIS) {
-        LiveInterval &DefLI = LIS->getInterval(DefReg);
-
-        // We cannot delete the original instruction here, so hack out the use
-        // in the original instruction with a dummy register so we can use
-        // shrinkToUses to deal with any multi-use edge cases. Other targets do
-        // not have the complexity of deleting a use to consider here.
-        Register DummyReg = MRI.cloneVirtualRegister(DefReg);
-        for (MachineOperand &MIOp : MI.uses()) {
-          if (MIOp.isReg() && MIOp.getReg() == DefReg) {
-            MIOp.setIsUndef(true);
-            MIOp.setReg(DummyReg);
-          }
-        }
-
-        LIS->shrinkToUses(&DefLI);
-      }
-    };
 
     int64_t Imm;
     if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
@@ -4196,10 +4213,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
                  .add(*Src1)
                  .addImm(Imm)
                  .setMIFlags(MI.getFlags());
-      updateLiveVariables(LV, MI, *MIB);
-      if (LIS)
-        LIS->ReplaceMachineInstrInMaps(MI, *MIB);
-      killDef();
+      U.RemoveMIUse = DefMI;
       return MIB;
     }
   }
@@ -4212,11 +4226,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
                  .addImm(Imm)
                  .add(*Src2)
                  .setMIFlags(MI.getFlags());
-      updateLiveVariables(LV, MI, *MIB);
-
-      if (LIS)
-        LIS->ReplaceMachineInstrInMaps(MI, *MIB);
-      killDef();
+      U.RemoveMIUse = DefMI;
       return MIB;
     }
   }
@@ -4235,12 +4245,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
                  .addImm(Imm)
                  .add(*Src2)
                  .setMIFlags(MI.getFlags());
-      updateLiveVariables(LV, MI, *MIB);
-
-      if (LIS)
-        LIS->ReplaceMachineInstrInMaps(MI, *MIB);
-      if (DefMI)
-        killDef();
+      U.RemoveMIUse = DefMI;
       return MIB;
     }
   }
@@ -4269,9 +4274,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
       .setMIFlags(MI.getFlags());
   if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
     MIB.addImm(OpSel ? OpSel->getImm() : 0);
-  updateLiveVariables(LV, MI, *MIB);
-  if (LIS)
-    LIS->ReplaceMachineInstrInMaps(MI, *MIB);
   return MIB;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index df27ec1..e1d7a07 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -88,6 +88,8 @@ private:
 };
 
 class SIInstrInfo final : public AMDGPUGenInstrInfo {
+  struct ThreeAddressUpdates;
+
 private:
   const SIRegisterInfo RI;
   const GCNSubtarget &ST;
@@ -190,6 +192,9 @@ private:
 
   bool resultDependsOnExec(const MachineInstr &MI) const;
 
+  MachineInstr *convertToThreeAddressImpl(MachineInstr &MI,
+                                          ThreeAddressUpdates &Updates) const;
+
 protected:
   /// If the specific machine instruction is a instruction that moves/copies
   /// value from one register to another register return destination and source
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 74d4153..6f1feb1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2223,8 +2223,8 @@ def : GCNPat <
 
 def : GCNPat <
   (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)),
-  (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src,
-                11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0),
+  (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src,
+                !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0),
                 0, 0, 0, 0, 0)
 > {
   let SubtargetPredicate = HasPackedFP32Ops;
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 362ef14..07264d9 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/TargetParser/TargetParser.h"
 
@@ -277,6 +278,12 @@ public:
   /// rmw operation, "std::nullopt" otherwise.
   std::optional<SIMemOpInfo>
   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
+
+  /// \returns DMA to LDS info if \p MI is as a direct-to/from-LDS load/store,
+  /// along with an indication of whether this is a load or store. If it is not
+  /// a direct-to-LDS operation, returns std::nullopt.
+  std::optional<SIMemOpInfo>
+  getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
 };
 
 class SICacheControl {
@@ -390,12 +397,6 @@ public:
                              bool IsCrossAddrSpaceOrdering,
                              Position Pos) const = 0;
 
-  /// Inserts any necessary instructions before the barrier start instruction
-  /// \p MI in order to support pairing of barriers and fences.
-  virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
-    return false;
-  };
-
   /// Virtual destructor to allow derivations to be deleted.
   virtual ~SICacheControl() = default;
 };
@@ -576,12 +577,8 @@ public:
                   bool IsCrossAddrSpaceOrdering, Position Pos,
                   AtomicOrdering Order,
                   bool AtomicsOnly) const override;
-  bool insertAcquire(MachineBasicBlock::iterator &MI,
-                     SIAtomicScope Scope,
-                     SIAtomicAddrSpace AddrSpace,
-                     Position Pos) const override;
-
-  bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
+  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;
 };
 
 class SIGfx11CacheControl : public SIGfx10CacheControl {
@@ -703,6 +700,9 @@ private:
   /// instructions are added/deleted or \p MI is modified, false otherwise.
   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                 MachineBasicBlock::iterator &MI);
+  /// Expands LDS DMA operation \p MI. Returns true if instructions are
+  /// added/deleted or \p MI is modified, false otherwise.
+  bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
 
 public:
   SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
@@ -832,6 +832,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
     return SIAtomicAddrSpace::SCRATCH;
   if (AS == AMDGPUAS::REGION_ADDRESS)
     return SIAtomicAddrSpace::GDS;
+  if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
+      AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
+    return SIAtomicAddrSpace::GLOBAL;
 
   return SIAtomicAddrSpace::OTHER;
 }
@@ -987,6 +990,16 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
   return constructFromMIWithMMO(MI);
 }
 
+std::optional<SIMemOpInfo>
+SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
+  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+
+  if (!SIInstrInfo::isLDSDMA(*MI))
+    return std::nullopt;
+
+  return constructFromMIWithMMO(MI);
+}
+
 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
   TII = ST.getInstrInfo();
   IV = getIsaVersion(ST.getCPU());
@@ -1099,7 +1112,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
   // Only handle load and store, not atomic read-modify-write insructions. The
   // latter use glc to indicate if the atomic returns a result and so must not
   // be used for cache control.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -1429,7 +1442,7 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
   // Only handle load and store, not atomic read-modify-write insructions. The
   // latter use glc to indicate if the atomic returns a result and so must not
   // be used for cache control.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -1733,7 +1746,7 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
   // Only handle load and store, not atomic read-modify-write insructions. The
   // latter use glc to indicate if the atomic returns a result and so must not
   // be used for cache control.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -1968,7 +1981,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
   // Only handle load and store, not atomic read-modify-write insructions. The
   // latter use glc to indicate if the atomic returns a result and so must not
   // be used for cache control.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -2046,8 +2059,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
       // the WGP. Therefore need to wait for operations to complete to ensure
       // they are visible to waves in the other CU as the L0 is per CU.
       // Otherwise in CU mode and all waves of a work-group are on the same CU
-      // which shares the same L0.
-      if (!ST.isCuModeEnabled()) {
+      // which shares the same L0. Note that we still need to wait when
+      // performing a release in this mode to respect the transitivity of
+      // happens-before, e.g. other waves of the workgroup must be able to
+      // release the memory from another wave at a wider scope.
+      if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
           VMCnt |= true;
         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2202,22 +2218,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   return Changed;
 }
 
-bool SIGfx10CacheControl::insertBarrierStart(
-    MachineBasicBlock::iterator &MI) const {
-  // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
-  // mode. This is because a CU mode release fence does not emit any wait, which
-  // is fine when only dealing with vmem, but isn't sufficient in the presence
-  // of barriers which do not go through vmem.
-  // GFX12.5 does not require this additional wait.
-  if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
-    return false;
-
-  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
-      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
-  return true;
-}
-
 bool SIGfx11CacheControl::enableLoadCacheBypass(
     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
     SIAtomicAddrSpace AddrSpace) const {
@@ -2266,7 +2266,7 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
   // Only handle load and store, not atomic read-modify-write insructions. The
   // latter use glc to indicate if the atomic returns a result and so must not
   // be used for cache control.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -2396,15 +2396,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
       // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore need to wait for operations to complete to
      // ensure they are visible to waves in the other CU as the L0 is per CU.
+      //
       // Otherwise in CU mode and all waves of a work-group are on the same CU
-      // which shares the same L0.
+      // which shares the same L0. Note that we still need to wait when
+      // performing a release in this mode to respect the transitivity of
+      // happens-before, e.g. other waves of the workgroup must be able to
+      // release the memory from another wave at a wider scope.
       //
       // GFX12.5:
       // CU$ has two ports. To ensure operations are visible at the workgroup
       // level, we need to ensure all operations in this port have completed
       // so the other SIMDs in the WG can see them. There is no ordering
       // guarantee between the ports.
-      if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
+      if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
+          isReleaseOrStronger(Order)) {
         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
           LOADCnt |= true;
         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2611,7 +2616,7 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
 
   // Only handle load and store, not atomic read-modify-write instructions.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -2934,6 +2939,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
   return Changed;
 }
 
+bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
+                                     MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoad() && MI->mayStore());
+
+  // The volatility or nontemporal-ness of the operation is a
+  // function of the global memory, not the LDS.
+  SIMemOp OpKind =
+      SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
+
+  // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
+  // stores. The operation is treated as a volatile/nontemporal store
+  // to its second argument.
+  return CC->enableVolatileAndOrNonTemporal(
+      MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
+      MOI.isNonTemporal(), MOI.isLastUse());
+}
+
 bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
   const MachineModuleInfo &MMI =
       getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
@@ -2977,22 +2999,20 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
         MI = II->getIterator();
       }
 
-      if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
-        Changed |= CC->insertBarrierStart(MI);
-        continue;
-      }
-
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;
 
-      if (const auto &MOI = MOA.getLoadInfo(MI))
+      if (const auto &MOI = MOA.getLoadInfo(MI)) {
        Changed |= expandLoad(*MOI, MI);
-      else if (const auto &MOI = MOA.getStoreInfo(MI)) {
+      } else if (const auto &MOI = MOA.getStoreInfo(MI)) {
        Changed |= expandStore(*MOI, MI);
-      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
+      } else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
+        Changed |= expandLDSDMA(*MOI, MI);
+      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
        Changed |= expandAtomicFence(*MOI, MI);
-      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
+      } else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
        Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
+      }
     }
   }
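A note on the cpol handling in this patch (this note is commentary, not part of the diff): the new CPol::VIRTUAL_BITS mask groups cache-policy bits that exist only for the compiler. The intrinsic's aux operand may carry CPol::VOLATILE (bit 31); getTgtMemIntrinsic turns it into MOVolatile on the memory operand so SIMemoryLegalizer can act on it, and instruction selection strips it with ~CPol::VIRTUAL_BITS before the real cpol immediate is emitted. A minimal standalone sketch of that flow, assuming only the two CPol constants from the patch; every other name below is illustrative and not LLVM API:

// Sketch only: mirrors the masking done in selectGlobalLoadLds and
// LowerINTRINSIC_VOID above; helper names are made up for illustration.
#include <cstdint>

namespace CPol {
constexpr uint32_t VOLATILE = 1u << 31;     // compiler-only bit (SIDefines.h)
constexpr uint32_t VIRTUAL_BITS = VOLATILE; // all compiler-only cpol bits
} // namespace CPol

struct MemOpFlags {
  bool IsVolatile = false; // stands in for MachineMemOperand::MOVolatile
};

// Consume the virtual bit for the memory operand, then drop it so that only
// real hardware cache-policy bits reach the selected instruction's cpol field.
uint32_t lowerAux(uint32_t Aux, MemOpFlags &Flags) {
  if (Aux & CPol::VOLATILE)
    Flags.IsVolatile = true;
  return Aux & ~CPol::VIRTUAL_BITS;
}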