Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 2
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 38
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 63
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 6
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 1
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 311
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 53
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 23
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 16
-rw-r--r-- | llvm/lib/Target/AMDGPU/SOPInstructions.td | 2
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP1Instructions.td | 11
12 files changed, 224 insertions, 312 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index f01d5f6..a4ef524 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -647,7 +647,7 @@ public: ModuleScopeVariables.insert(GV); } else if (K.second.size() == 1) { KernelAccessVariables.insert(GV); - } else if (set_is_subset(K.second, HybridModuleRootKernels)) { + } else if (K.second == HybridModuleRootKernels) { ModuleScopeVariables.insert(GV); } else { TableLookupVariables.insert(GV); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 92a587b..280fbe2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); + TargetPassConfig::addCodeGenPrepare(); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(createLoadStoreVectorizerPass()); + if (TM->getTargetTriple().isAMDGCN()) { // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). @@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). addPass(createAMDGPULowerBufferFatPointersPass()); addPass(createAMDGPULowerIntrinsicsLegacyPass()); // In accordance with the above FIXME, manually force all the @@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { addPass(new DummyCGSCCPass()); } - TargetPassConfig::addCodeGenPrepare(); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(createLoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it // here seems better that these blocks would get cleaned up by @@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { if (EnableLowerKernelArguments) addPass(AMDGPULowerKernelArgumentsPass(TM)); + Base::addCodeGenPrepare(addPass); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(LoadStoreVectorizerPass()); + // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). // It could be placed anywhere before uniformity annotations (an analysis @@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. 
- // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). addPass(AMDGPULowerBufferFatPointersPass(TM)); addPass.requireCGSCCOrder(); addPass(AMDGPULowerIntrinsicsPass(TM)); - Base::addCodeGenPrepare(addPass); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(LoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these // blocks would get cleaned up by UnreachableBlockElim inserted next in the diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fab78a9..bdc0810 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -29,6 +29,7 @@ #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/ErrorHandling.h" @@ -1633,64 +1634,6 @@ void GCNSchedStage::revertScheduling() { DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); } -bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, - SlotIndex OriginalIdx, - SlotIndex RematIdx) const { - - LiveIntervals *LIS = DAG.LIS; - MachineRegisterInfo &MRI = DAG.MRI; - OriginalIdx = OriginalIdx.getRegSlot(true); - RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true)); - for (const MachineOperand &MO : InstToRemat->operands()) { - if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) - continue; - - if (!MO.getReg().isVirtual()) { - // Do not attempt to reason about PhysRegs - // TODO: better analysis of PhysReg livness - if (!DAG.MRI.isConstantPhysReg(MO.getReg()) && - !DAG.TII->isIgnorableUse(MO)) - return false; - - // Constant PhysRegs and IgnorableUses are okay - continue; - } - - LiveInterval &LI = LIS->getInterval(MO.getReg()); - const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx); - assert(OVNI); - - // Don't allow rematerialization immediately after the original def. - // It would be incorrect if InstToRemat redefines the register. - // See PR14098. - if (SlotIndex::isSameInstr(OriginalIdx, RematIdx)) - return false; - - if (OVNI != LI.getVNInfoAt(RematIdx)) - return false; - - // Check that subrange is live at RematIdx. - if (LI.hasSubRanges()) { - const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); - unsigned SubReg = MO.getSubReg(); - LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg) - : MRI.getMaxLaneMaskForVReg(MO.getReg()); - for (LiveInterval::SubRange &SR : LI.subranges()) { - if ((SR.LaneMask & LM).none()) - continue; - if (!SR.liveAt(RematIdx)) - return false; - - // Early exit if all used lanes are checked. No need to continue. 
- LM &= ~SR.LaneMask; - if (LM.none()) - break; - } - } - } - return true; -} - bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { const Function &F = MF.getFunction(); @@ -1812,9 +1755,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { // Do not rematerialize an instruction it it uses registers that aren't // available at its use. This ensures that we are not extending any live // range while rematerializing. - SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI); SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true); - if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx)) + if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI, + *DAG.TII)) continue; REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 06b9b64..8ea4267 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -496,12 +496,6 @@ private: /// stage to their pre-stage values. void finalizeGCNSchedStage() override; - /// \p Returns true if all the uses in \p InstToRemat defined at \p - /// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual - /// reg uses. - bool allUsesAvailableAt(const MachineInstr *InstToRemat, - SlotIndex OriginalIdx, SlotIndex RematIdx) const; - public: bool initGCNSchedStage() override; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 7c5d4fc..e4b3528 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -924,6 +924,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::SGPRSpill: return true; case TargetStackID::ScalableVector: + case TargetStackID::ScalablePredicateVector: case TargetStackID::WasmLocal: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1653008..f7265c5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -64,14 +64,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing( cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false)); -// TODO: This option should be removed once we switch to always using PTRADD in -// the SelectionDAG. 
-static cl::opt<bool> UseSelectionDAGPTRADD( - "amdgpu-use-sdag-ptradd", cl::Hidden, - cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " - "SelectionDAG ISel"), - cl::init(false)); - static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); @@ -11466,7 +11458,7 @@ static bool isNoUnsignedWrap(SDValue Addr) { bool SITargetLowering::shouldPreservePtrArith(const Function &F, EVT PtrVT) const { - return UseSelectionDAGPTRADD && PtrVT == MVT::i64; + return PtrVT == MVT::i64; } bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index f291191..5e27b37 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -418,15 +418,14 @@ public: class SIInsertWaitcnts { public: const GCNSubtarget *ST; + const SIInstrInfo *TII = nullptr; + const SIRegisterInfo *TRI = nullptr; + const MachineRegisterInfo *MRI = nullptr; InstCounterType SmemAccessCounter; InstCounterType MaxCounter; const unsigned *WaitEventMaskForInst; private: - const SIInstrInfo *TII = nullptr; - const SIRegisterInfo *TRI = nullptr; - const MachineRegisterInfo *MRI = nullptr; - DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; MachineLoopInfo *MLI; @@ -495,13 +494,6 @@ public: bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; bool run(MachineFunction &MF); - bool isForceEmitWaitcnt() const { - for (auto T : inst_counter_types()) - if (ForceEmitWaitcnt[T]) - return true; - return false; - } - void setForceEmitWaitcnt() { // For non-debug builds, ForceEmitWaitcnt has been initialized to false; // For debug builds, get the debug counter info and adjust if need be @@ -570,10 +562,6 @@ public: return VmemReadMapping[getVmemType(Inst)]; } - bool hasXcnt() const { return ST->hasWaitXCnt(); } - - bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; - bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -591,7 +579,6 @@ public: WaitcntBrackets &ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); - static bool asynchronouslyWritesSCC(unsigned Opcode); }; // This objects maintains the current score brackets of each wait counter, and @@ -643,8 +630,6 @@ public: bool merge(const WaitcntBrackets &Other); RegInterval getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, const MachineOperand &Op) const; bool counterOutOfOrder(InstCounterType T) const; @@ -662,9 +647,7 @@ public: void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); void applyXcnt(const AMDGPU::Waitcnt &Wait); - void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, WaitEventType E, - MachineInstr &MI); + void updateByEvent(WaitEventType E, MachineInstr &MI); unsigned hasPendingEvent() const { return PendingEvents; } unsigned hasPendingEvent(WaitEventType E) const { @@ -773,10 +756,8 @@ private: void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, unsigned Score); - void setScoreByOperand(const 
MachineInstr *MI, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - const MachineOperand &Op, InstCounterType CntTy, - unsigned Val); + void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op, + InstCounterType CntTy, unsigned Val); const SIInsertWaitcnts *Context; @@ -833,12 +814,13 @@ public: } // end anonymous namespace RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, const MachineOperand &Op) const { if (Op.getReg() == AMDGPU::SCC) return {SCC, SCC + 1}; + const SIRegisterInfo *TRI = Context->TRI; + const MachineRegisterInfo *MRI = Context->MRI; + if (!TRI->isInAllocatableClass(Op.getReg())) return {-1, -1}; @@ -903,11 +885,9 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval, } void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, - const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, const MachineOperand &Op, InstCounterType CntTy, unsigned Score) { - RegInterval Interval = getRegInterval(MI, MRI, TRI, Op); + RegInterval Interval = getRegInterval(MI, Op); setScoreByInterval(Interval, CntTy, Score); } @@ -939,10 +919,7 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes( return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER); } -void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - WaitEventType E, MachineInstr &Inst) { +void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); unsigned UB = getScoreUB(T); @@ -955,6 +932,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, PendingEvents |= 1 << E; setScoreUB(T, CurrScore); + const SIRegisterInfo *TRI = Context->TRI; + const MachineRegisterInfo *MRI = Context->MRI; + const SIInstrInfo *TII = Context->TII; + if (T == EXP_CNT) { // Put score on the source vgprs. If this is a store, just use those // specific register(s). @@ -962,59 +943,56 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, // All GDS operations must protect their address register (same as // export.) 
if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr)) - setScoreByOperand(&Inst, TRI, MRI, *AddrOp, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore); if (Inst.mayStore()) { if (const auto *Data0 = TII->getNamedOperand(Inst, AMDGPU::OpName::data0)) - setScoreByOperand(&Inst, TRI, MRI, *Data0, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore); if (const auto *Data1 = TII->getNamedOperand(Inst, AMDGPU::OpName::data1)) - setScoreByOperand(&Inst, TRI, MRI, *Data1, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && Inst.getOpcode() != AMDGPU::DS_APPEND && Inst.getOpcode() != AMDGPU::DS_CONSUME && Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); } } } else if (TII->isFLAT(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, TRI, MRI, + setScoreByOperand(&Inst, *TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, TRI, MRI, + setScoreByOperand(&Inst, *TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMIMG(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT, - CurrScore); + setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, TRI, MRI, + setScoreByOperand(&Inst, *TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMTBUF(Inst)) { if (Inst.mayStore()) - setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT, - CurrScore); + setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); } else if (TII->isMUBUF(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT, - CurrScore); + setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, TRI, MRI, + setScoreByOperand(&Inst, *TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isLDSDIR(Inst)) { // LDSDIR instructions attach the score to the destination. - setScoreByOperand(&Inst, TRI, MRI, + setScoreByOperand(&Inst, *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), EXP_CNT, CurrScore); } else { @@ -1025,18 +1003,27 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, // score. for (MachineOperand &DefMO : Inst.all_defs()) { if (TRI->isVGPR(*MRI, DefMO.getReg())) { - setScoreByOperand(&Inst, TRI, MRI, DefMO, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore); } } } for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); } } } else if (T == X_CNT) { + WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP; + if (PendingEvents & (1 << OtherEvent)) { + // Hardware inserts an implicit xcnt between interleaved + // SMEM and VMEM operations. So there will never be + // outstanding address translations for both SMEM and + // VMEM at the same time. 
+ setScoreLB(T, CurrScore - 1); + PendingEvents &= ~(1 << OtherEvent); + } for (const MachineOperand &Op : Inst.all_uses()) - setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore); + setScoreByOperand(&Inst, Op, T, CurrScore); } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { // Match the score to the destination registers. // @@ -1048,7 +1035,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, // Special cases where implicit register defs exists, such as M0 or VCC, // but none with memory instructions. for (const MachineOperand &Op : Inst.defs()) { - RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op); + RegInterval Interval = getRegInterval(&Inst, Op); if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { if (Interval.first >= NUM_ALL_VGPRS) continue; @@ -1109,7 +1096,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, setRegScore(FIRST_LDS_VGPR, T, CurrScore); } - if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) { + if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { setRegScore(SCC, T, CurrScore); PendingSCCWrite = &Inst; } @@ -1831,12 +1818,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( return Modified; } -static bool readsVCCZ(const MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && - !MI.getOperand(1).isUndef(); -} - /// \returns true if the callee inserts an s_waitcnt 0 on function entry. static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { // Currently all conventions wait, but this may not always be the case. @@ -1871,26 +1852,24 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, assert(!MI.isMetaInstruction()); AMDGPU::Waitcnt Wait; + const unsigned Opc = MI.getOpcode(); // FIXME: This should have already been handled by the memory legalizer. // Removing this currently doesn't affect any lit tests, but we need to // verify that nothing was relying on this. The number of buffer invalidates // being handled here should not be expanded. - if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || - MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || - MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { + if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC || + Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV || + Opc == AMDGPU::BUFFER_GL1_INV) { Wait.LoadCnt = 0; } // All waits must be resolved at call return. // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. - if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || - MI.getOpcode() == AMDGPU::SI_RETURN || - MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || - MI.getOpcode() == AMDGPU::S_SETPC_B64_return || + if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN || + Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || + Opc == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); } @@ -1902,8 +1881,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // send a message to explicitly release all VGPRs before the stores have // completed, but it is only safe to do this if there are no outstanding // scratch stores. 
- else if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) { if (!WCG->isOptNone() && (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() || (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && @@ -1912,8 +1890,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ReleaseVGPRInsts.insert(&MI); } // Resolve vm waits before gs-done. - else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || - MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && + else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) && ST->hasLegacyGeometry() && ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { @@ -1938,7 +1915,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Wait for any pending GDS instruction to complete before any // "Always GDS" instruction. - if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS()) + if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS()) addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait()); if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { @@ -1950,7 +1927,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); if (CallAddrOp.isReg()) { RegInterval CallAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp); + ScoreBrackets.getRegInterval(&MI, CallAddrOp); ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval, Wait); @@ -1958,13 +1935,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (const auto *RtnAddrOp = TII->getNamedOperand(MI, AMDGPU::OpName::dst)) { RegInterval RtnAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp); + ScoreBrackets.getRegInterval(&MI, *RtnAddrOp); ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval, Wait); } } - } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) { + } else if (Opc == AMDGPU::S_BARRIER_WAIT) { ScoreBrackets.tryClearSCCWriteEvent(&MI); } else { // FIXME: Should not be relying on memoperands. @@ -2022,7 +1999,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) continue; - RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op); + RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op); const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); if (IsVGPR) { @@ -2061,7 +2038,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); } - if (hasXcnt() && Op.isDef()) + if (ST->hasWaitXCnt() && Op.isDef()) ScoreBrackets.determineWait(X_CNT, Interval, Wait); } } @@ -2079,18 +2056,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // // In all other cases, ensure safety by ensuring that there are no outstanding // memory operations. - if (MI.getOpcode() == AMDGPU::S_BARRIER && - !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { + if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && + !ST->supportsBackOffBarrier()) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. 
- if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { - if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { - Wait.DsCnt = 0; - } + if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + Wait.DsCnt = 0; } // Verify that the wait is actually needed. @@ -2165,19 +2141,19 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, } // XCnt may be already consumed by a load wait. - if (Wait.KmCnt == 0 && Wait.XCnt != ~0u && - !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) - Wait.XCnt = ~0u; + if (Wait.XCnt != ~0u) { + if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) + Wait.XCnt = ~0u; - if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u && - !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) - Wait.XCnt = ~0u; + if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) + Wait.XCnt = ~0u; - // Since the translation for VMEM addresses occur in-order, we can skip the - // XCnt if the current instruction is of VMEM type and has a memory dependency - // with another VMEM instruction in flight. - if (Wait.XCnt != ~0u && isVmemAccess(*It)) - Wait.XCnt = ~0u; + // Since the translation for VMEM addresses occur in-order, we can skip the + // XCnt if the current instruction is of VMEM type and has a memory + // dependency with another VMEM instruction in flight. + if (isVmemAccess(*It)) + Wait.XCnt = ~0u; + } if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; @@ -2185,75 +2161,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, return Modified; } -// This is a flat memory operation. Check to see if it has memory tokens other -// than LDS. Other address spaces supported by flat memory operations involve -// global memory. -bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // All flat instructions use the VMEM counter except prefetch. - if (!TII->usesVM_CNT(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access VMEM. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves VMEM. - // Flat operations only supported FLAT, LOCAL (LDS), or address spaces - // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION - // (GDS) address space is not supported by flat operations. Therefore, simply - // return true unless only the LDS address space is found. - for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - assert(AS != AMDGPUAS::REGION_ADDRESS); - if (AS != AMDGPUAS::LOCAL_ADDRESS) - return true; - } - - return false; -} - -// This is a flat memory operation. Check to see if it has memory tokens for -// either LDS or FLAT. -bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. - if (!TII->usesLGKM_CNT(MI)) - return false; - - // If in tgsplit mode then there can be no use of LDS. - if (ST->isTgSplitEnabled()) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access LDS. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves LDS. 
- for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) - return true; - } - - return false; -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { - return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || + return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); } -static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) { - auto Opc = Inst.getOpcode(); - return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || - Opc == AMDGPU::GLOBAL_WBINV; -} - // Return true if the next instruction is S_ENDPGM, following fallthrough // blocks if necessary. bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It, @@ -2317,6 +2229,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. + // For architectures with X_CNT, mark the source address operands + // with the appropriate counter values. // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere. bool IsVMEMAccess = false; @@ -2324,16 +2238,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { if (TII->isAlwaysGDS(Inst.getOpcode()) || TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { - ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); - ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); + ScoreBrackets->updateByEvent(GDS_ACCESS, Inst); + ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst); ScoreBrackets->setPendingGDS(); } else { - ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); + ScoreBrackets->updateByEvent(LDS_ACCESS, Inst); } } else if (TII->isFLAT(Inst)) { - if (isGFX12CacheInvOrWBInst(Inst)) { - ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), - Inst); + if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) { + ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst); return; } @@ -2341,16 +2254,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, int FlatASCount = 0; - if (mayAccessVMEMThroughFlat(Inst)) { + if (TII->mayAccessVMEMThroughFlat(Inst)) { ++FlatASCount; IsVMEMAccess = true; - ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), - Inst); + ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst); } - if (mayAccessLDSThroughFlat(Inst)) { + if (TII->mayAccessLDSThroughFlat(Inst)) { ++FlatASCount; - ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); + ScoreBrackets->updateByEvent(LDS_ACCESS, Inst); } // This is a flat memory operation that access both VMEM and LDS, so note it @@ -2361,16 +2273,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } else if (SIInstrInfo::isVMEM(Inst) && !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { IsVMEMAccess = true; - ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), - Inst); + ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst); if (ST->vmemWriteNeedsExpWaitcnt() && (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) { - ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); + ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst); } } else if (TII->isSMRD(Inst)) { IsSMEMAccess = true; - 
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); + ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst); } else if (Inst.isCall()) { if (callWaitsOnFunctionReturn(Inst)) { // Act as a wait on everything @@ -2382,45 +2293,45 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); } } else if (SIInstrInfo::isLDSDIR(Inst)) { - ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst); + ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst); } else if (TII->isVINTERP(Inst)) { int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm(); ScoreBrackets->applyWaitcnt(EXP_CNT, Imm); } else if (SIInstrInfo::isEXP(Inst)) { unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31) - ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst); + ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst); else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST) - ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); + ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst); else - ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); - } else if (asynchronouslyWritesSCC(Inst.getOpcode())) { - ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst); + ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst); + } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { + ScoreBrackets->updateByEvent(SCC_WRITE, Inst); } else { switch (Inst.getOpcode()) { case AMDGPU::S_SENDMSG: case AMDGPU::S_SENDMSG_RTN_B32: case AMDGPU::S_SENDMSG_RTN_B64: case AMDGPU::S_SENDMSGHALT: - ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); + ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst); break; case AMDGPU::S_MEMTIME: case AMDGPU::S_MEMREALTIME: case AMDGPU::S_GET_BARRIER_STATE_M0: case AMDGPU::S_GET_BARRIER_STATE_IMM: - ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); + ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst); break; } } - if (!hasXcnt()) + if (!ST->hasWaitXCnt()) return; if (IsVMEMAccess) - ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst); + ScoreBrackets->updateByEvent(VMEM_GROUP, Inst); if (IsSMEMAccess) - ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst); + ScoreBrackets->updateByEvent(SMEM_GROUP, Inst); } bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, @@ -2478,9 +2389,8 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE); if (!OldEventsHasSCCWrite) { PendingSCCWrite = Other.PendingSCCWrite; - } else { - if (PendingSCCWrite != Other.PendingSCCWrite) - PendingSCCWrite = nullptr; + } else if (PendingSCCWrite != Other.PendingSCCWrite) { + PendingSCCWrite = nullptr; } } } @@ -2516,12 +2426,6 @@ static bool isWaitInstr(MachineInstr &Inst) { counterTypeForInstr(Opcode).has_value(); } -bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) { - return Opcode == AMDGPU::S_BARRIER_LEAVE || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; -} - // Generate s_waitcnt instructions where needed. bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, @@ -2578,7 +2482,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, OldWaitcntInstr = nullptr; // Restore vccz if it's not known to be correct already. 
- bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst); + bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst); // Don't examine operands unless we need to track vccz correctness. if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { @@ -2701,7 +2605,7 @@ bool SIInsertWaitcnts::isPreheaderToFlush( bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { if (SIInstrInfo::isFLAT(MI)) - return mayAccessVMEMThroughFlat(MI); + return TII->mayAccessVMEMThroughFlat(MI); return SIInstrInfo::isVMEM(MI); } @@ -2724,15 +2628,14 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { if (isVMEMOrFlatVMEM(MI)) { - if (MI.mayLoad()) - HasVMemLoad = true; - if (MI.mayStore()) - HasVMemStore = true; + HasVMemLoad |= MI.mayLoad(); + HasVMemStore |= MI.mayStore(); } + for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; - RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op); + RegInterval Interval = Brackets.getRegInterval(&MI, Op); // Vgpr use for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { // If we find a register that is loaded inside the loop, 1. and 2. @@ -2757,7 +2660,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, // VMem load vgpr def if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) { for (const MachineOperand &Op : MI.all_defs()) { - RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op); + RegInterval Interval = Brackets.getRegInterval(&MI, Op); for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { // If we find a register that is loaded inside the loop, 1. and 2. // are invalidated and we can exit. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 044ea86..56435a5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4344,6 +4344,59 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { }); } +bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { + assert(isFLAT(MI)); + + // All flat instructions use the VMEM counter except prefetch. + if (!usesVM_CNT(MI)) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access VMEM. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves VMEM. + // Flat operations only supported FLAT, LOCAL (LDS), or address spaces + // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION + // (GDS) address space is not supported by flat operations. Therefore, simply + // return true unless only the LDS address space is found. + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + assert(AS != AMDGPUAS::REGION_ADDRESS); + if (AS != AMDGPUAS::LOCAL_ADDRESS) + return true; + } + + return false; +} + +bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const { + assert(isFLAT(MI)); + + // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. + if (!usesLGKM_CNT(MI)) + return false; + + // If in tgsplit mode then there can be no use of LDS. + if (ST.isTgSplitEnabled()) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access LDS. 
+ if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves LDS. + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) + return true; + } + + return false; +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. There's only a handful of instructions that touch this, it's only an diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index c2252af..a21089f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -688,6 +688,12 @@ public: /// to not hit scratch. bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + /// \returns true for FLAT instructions that can access VMEM. + bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; + + /// \returns true for FLAT instructions that can access LDS. + bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: @@ -748,6 +754,18 @@ public: return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; } + static bool isSBarrierSCCWrite(unsigned Opcode) { + return Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; + } + + static bool isCBranchVCCZRead(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && + !MI.getOperand(1).isUndef(); + } + static bool isWQM(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::WQM; } @@ -1010,6 +1028,11 @@ public: Opcode == AMDGPU::DS_GWS_BARRIER; } + static bool isGFX12CacheInvOrWBInst(unsigned Opc) { + return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || + Opc == AMDGPU::GLOBAL_WBINV; + } + static bool isF16PseudoScalarTrans(unsigned Opcode) { return Opcode == AMDGPU::V_S_EXP_F16_e64 || Opcode == AMDGPU::V_S_LOG_F16_e64 || diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 8f1dd62..5630580 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1163,6 +1163,22 @@ def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, let HasSGPR = 1; let Size = 64; } + +def VS_128 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32, + (add VReg_128, SReg_128)> { + let isAllocatable = 0; + let HasVGPR = 1; + let HasSGPR = 1; + let Size = 128; +} + +def VS_128_Align2 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32, + (add VReg_128_Align2, SReg_128)> { + let isAllocatable = 0; + let HasVGPR = 1; + let HasSGPR = 1; + let Size = 128; +} } // End GeneratePressureSet = 0 // Define a register tuple class, along with one requiring an even diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index b3fd8c7..84287b6 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -352,10 +352,12 @@ def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">; } // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] +let Defs = [SCC] in { def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32", [(set i32:$sdst, (int_amdgcn_s_quadmask i32:$src0))]>; def S_QUADMASK_B64 : SOP1_64 
<"s_quadmask_b64", [(set i64:$sdst, (int_amdgcn_s_quadmask i64:$src0))]>; +} let Uses = [M0] in { def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 77df721..54f57e0 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -314,9 +314,10 @@ let SubtargetPredicate = HasGFX950Insts, OtherPredicates = [HasBF16ConversionIns defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>; } let SubtargetPredicate = isGFX1250Plus, OtherPredicates = [HasBF16ConversionInsts] in { - defm V_CVT_F32_BF16_gfx1250 : VOP1Inst_t16_with_profiles <"v_cvt_f32_bf16_gfx1250", VOP_F32_BF16, - VOPProfile_CVT_F32_BF16_gfx1250_t16, - VOPProfile_CVT_F32_BF16_gfx1250_fake16>; + let True16Predicate = UseRealTrue16Insts in + defm V_CVT_F32_BF16_gfx1250_t16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_t16", VOPProfile_CVT_F32_BF16_gfx1250_t16>; + let True16Predicate = UseFakeTrue16Insts in + defm V_CVT_F32_BF16_gfx1250_fake16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_fake16", VOPProfile_CVT_F32_BF16_gfx1250_fake16>; } let ReadsModeReg = 0, mayRaiseFPException = 0 in { @@ -899,6 +900,7 @@ class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = let DecoderNamespace = Gen.DecoderNamespace; let OtherPredicates = !listconcat(ps.OtherPredicates, !if(p.HasExt64BitDPP, [HasDPALU_DPP], [])); + let True16Predicate = ps.True16Predicate; } class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -921,6 +923,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf VOP1_DPP8<op, ps, p> { let AssemblerPredicate = Gen.AssemblerPredicate; let DecoderNamespace = Gen.DecoderNamespace; + let True16Predicate = ps.True16Predicate; } //===----------------------------------------------------------------------===// @@ -1149,7 +1152,7 @@ defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>; -defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; +defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">; defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>; defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; |