diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 508 |
1 files changed, 253 insertions, 255 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359..2af0a57 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { llvm_unreachable("event type has no associated counter"); } -// This objects maintains the current score brackets of each wait counter, and -// a per-register scoreboard for each wait counter. -// -// We also maintain the latest score for every event type that can change the -// waitcnt in order to know if there are multiple types of events within -// the brackets. When multiple types of event happen in the bracket, -// wait count may get decreased out of order, therefore we need to put in -// "s_waitcnt 0" before use. -class WaitcntBrackets { -public: - WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, - HardwareLimits Limits, const unsigned *WaitEventMaskForInst, - InstCounterType SmemAccessCounter) - : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), - WaitEventMaskForInst(WaitEventMaskForInst), - SmemAccessCounter(SmemAccessCounter) {} - - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - default: - break; - } - return 0; - } - - bool isSmemCounter(InstCounterType T) const { - return T == SmemAccessCounter || T == X_CNT; - } - - unsigned getSgprScoresIdx(InstCounterType T) const { - assert(isSmemCounter(T) && "Invalid SMEM counter"); - return T == X_CNT ? 1 : 0; - } - - unsigned getScoreLB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreLBs[T]; - } - - unsigned getScoreUB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreUBs[T]; - } - - unsigned getScoreRange(InstCounterType T) const { - return getScoreUB(T) - getScoreLB(T); - } - - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; - } - - bool merge(const WaitcntBrackets &Other); - - RegInterval getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, - const MachineOperand &Op) const; - - bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); - } - - void applyWaitcnt(const AMDGPU::Waitcnt &Wait); - void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); - void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, WaitEventType E, - MachineInstr &MI); - - unsigned hasPendingEvent() const { return PendingEvents; } - unsigned hasPendingEvent(WaitEventType E) const { - return PendingEvents & (1 << E); - } - unsigned hasPendingEvent(InstCounterType T) const { - unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; - assert((HasPending != 0) == (getScoreRange(T) != 0)); - return HasPending; - } - - bool hasMixedPendingEvents(InstCounterType T) const { - unsigned Events = hasPendingEvent(T); - // Return true if more than one bit is set in Events. - return Events & (Events - 1); - } - - bool hasPendingFlat() const { - return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && - LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || - (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && - LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); - } - - void setPendingFlat() { - LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; - LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; - } - - bool hasPendingGDS() const { - return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; - } - - unsigned getPendingGDSWait() const { - return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1); - } - - void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } - - // Return true if there might be pending writes to the vgpr-interval by VMEM - // instructions with types different from V. - bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) - return true; - } - return false; - } - - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; - } - } - - void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); - PendingEvents |= WaitEventMaskForInst[STORE_CNT]; - } - - ArrayRef<const MachineInstr *> getLDSDMAStores() const { - return LDSDMAStores; - } - - bool hasPointSampleAccel(const MachineInstr &MI) const; - bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; - - void print(raw_ostream &) const; - void dump() const { print(dbgs()); } - -private: - struct MergeInfo { - unsigned OldLB; - unsigned OtherLB; - unsigned MyShift; - unsigned OtherShift; - }; - static bool mergeScore(const MergeInfo &M, unsigned &Score, - unsigned OtherScore); - - void setScoreLB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreLBs[T] = Val; - } - - void setScoreUB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreUBs[T] = Val; - - if (T != EXP_CNT) - return; - - if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); - } - - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); - } - - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); - - void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - const MachineOperand &Op, InstCounterType CntTy, - unsigned Val); - - const GCNSubtarget *ST = nullptr; - InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; - HardwareLimits Limits = {}; - const unsigned *WaitEventMaskForInst; - InstCounterType SmemAccessCounter; - unsigned ScoreLBs[NUM_INST_CNTS] = {0}; - unsigned ScoreUBs[NUM_INST_CNTS] = {0}; - unsigned PendingEvents = 0; - // Remember the last flat memory operation. - unsigned LastFlat[NUM_INST_CNTS] = {0}; - // Remember the last GDS operation. - unsigned LastGDS = 0; - // wait_cnt scores for every vgpr. - // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int VgprUB = -1; - int SgprUB = -1; - unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. - // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the - // X_CNT score. - unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; - // Bitmask of the VmemTypes of VMEM instructions that might have a pending - // write to each vgpr. - unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; - // Store representative LDS DMA operations. The only useful info here is - // alias info. One store is kept per unique AAInfo. - SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; -}; +class WaitcntBrackets; // This abstracts the logic for generating and updating S_WAIT* instructions // away from the analysis that determines where they are needed. This was @@ -640,8 +407,13 @@ public: }; class SIInsertWaitcnts { +public: + const GCNSubtarget *ST; + InstCounterType SmemAccessCounter; + InstCounterType MaxCounter; + const unsigned *WaitEventMaskForInst; + private: - const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; @@ -657,8 +429,6 @@ private: bool Dirty = true; }; - InstCounterType SmemAccessCounter; - MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; bool ForceEmitWaitcnt[NUM_INST_CNTS]; @@ -675,7 +445,7 @@ private: // message. DenseSet<MachineInstr *> ReleaseVGPRInsts; - InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; + HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -686,6 +456,30 @@ public: (void)ForceVMCounter; } + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + default: + break; + } + return 0; + } + bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); bool isPreheaderToFlush(MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets); @@ -791,6 +585,211 @@ public: WaitcntBrackets &ScoreBrackets); }; +// This objects maintains the current score brackets of each wait counter, and +// a per-register scoreboard for each wait counter. +// +// We also maintain the latest score for every event type that can change the +// waitcnt in order to know if there are multiple types of events within +// the brackets. When multiple types of event happen in the bracket, +// wait count may get decreased out of order, therefore we need to put in +// "s_waitcnt 0" before use. +class WaitcntBrackets { +public: + WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {} + + bool isSmemCounter(InstCounterType T) const { + return T == Context->SmemAccessCounter || T == X_CNT; + } + + unsigned getSgprScoresIdx(InstCounterType T) const { + assert(isSmemCounter(T) && "Invalid SMEM counter"); + return T == X_CNT ? 1 : 0; + } + + unsigned getScoreLB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreLBs[T]; + } + + unsigned getScoreUB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreUBs[T]; + } + + unsigned getScoreRange(InstCounterType T) const { + return getScoreUB(T) - getScoreLB(T); + } + + unsigned getRegScore(int GprNo, InstCounterType T) const { + if (GprNo < NUM_ALL_VGPRS) + return VgprScores[T][GprNo]; + return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + } + + bool merge(const WaitcntBrackets &Other); + + RegInterval getRegInterval(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + const MachineOperand &Op) const; + + bool counterOutOfOrder(InstCounterType T) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + + void determineWait(InstCounterType T, RegInterval Interval, + AMDGPU::Waitcnt &Wait) const; + void determineWait(InstCounterType T, int RegNo, + AMDGPU::Waitcnt &Wait) const { + determineWait(T, {RegNo, RegNo + 1}, Wait); + } + + void applyWaitcnt(const AMDGPU::Waitcnt &Wait); + void applyWaitcnt(InstCounterType T, unsigned Count); + void applyXcnt(const AMDGPU::Waitcnt &Wait); + void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, WaitEventType E, + MachineInstr &MI); + + unsigned hasPendingEvent() const { return PendingEvents; } + unsigned hasPendingEvent(WaitEventType E) const { + return PendingEvents & (1 << E); + } + unsigned hasPendingEvent(InstCounterType T) const { + unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T]; + assert((HasPending != 0) == (getScoreRange(T) != 0)); + return HasPending; + } + + bool hasMixedPendingEvents(InstCounterType T) const { + unsigned Events = hasPendingEvent(T); + // Return true if more than one bit is set in Events. + return Events & (Events - 1); + } + + bool hasPendingFlat() const { + return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && + LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || + (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && + LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); + } + + void setPendingFlat() { + LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; + LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; + } + + bool hasPendingGDS() const { + return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; + } + + unsigned getPendingGDSWait() const { + return std::min(getScoreUB(DS_CNT) - LastGDS, + Context->getWaitCountMax(DS_CNT) - 1); + } + + void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } + + // Return true if there might be pending writes to the vgpr-interval by VMEM + // instructions with types different from V. + bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + if (VgprVmemTypes[RegNo] & ~(1 << V)) + return true; + } + return false; + } + + void clearVgprVmemTypes(RegInterval Interval) { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + VgprVmemTypes[RegNo] = 0; + } + } + + void setStateOnFunctionEntryOrReturn() { + setScoreUB(STORE_CNT, + getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); + PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; + } + + ArrayRef<const MachineInstr *> getLDSDMAStores() const { + return LDSDMAStores; + } + + bool hasPointSampleAccel(const MachineInstr &MI) const; + bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, + RegInterval Interval) const; + + void print(raw_ostream &) const; + void dump() const { print(dbgs()); } + +private: + struct MergeInfo { + unsigned OldLB; + unsigned OtherLB; + unsigned MyShift; + unsigned OtherShift; + }; + static bool mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore); + + void setScoreLB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreLBs[T] = Val; + } + + void setScoreUB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreUBs[T] = Val; + + if (T != EXP_CNT) + return; + + if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); + } + + void setRegScore(int GprNo, InstCounterType T, unsigned Val) { + setScoreByInterval({GprNo, GprNo + 1}, T, Val); + } + + void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, + unsigned Score); + + void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); + + const SIInsertWaitcnts *Context; + + unsigned ScoreLBs[NUM_INST_CNTS] = {0}; + unsigned ScoreUBs[NUM_INST_CNTS] = {0}; + unsigned PendingEvents = 0; + // Remember the last flat memory operation. + unsigned LastFlat[NUM_INST_CNTS] = {0}; + // Remember the last GDS operation. + unsigned LastGDS = 0; + // wait_cnt scores for every vgpr. + // Keep track of the VgprUB and SgprUB to make merge at join efficient. + int VgprUB = -1; + int SgprUB = -1; + unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the + // X_CNT score. + unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + // Bitmask of the VmemTypes of VMEM instructions that might have a pending + // write to each vgpr. + unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is + // alias info. One store is kept per unique AAInfo. + SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; +}; + class SIInsertWaitcntsLegacy : public MachineFunctionPass { public: static char ID; @@ -827,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, RegInterval Result; - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); + MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); unsigned RegIdx = TRI->getHWRegIndex(MCReg); assert(isUInt<8>(RegIdx)); @@ -885,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, // this at compile time, so we have to assume it might be applied if the // instruction supports it). bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { - if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) + if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) return false; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); @@ -911,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(WaitEventMaskForInst, E); + InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); unsigned UB = getScoreUB(T); unsigned CurrScore = UB + 1; @@ -1080,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } void WaitcntBrackets::print(raw_ostream &OS) const { + const GCNSubtarget *ST = Context->ST; + OS << '\n'; - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { unsigned SR = getScoreRange(T); switch (T) { @@ -1195,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // s_waitcnt instruction. if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !ST->hasFlatLgkmVMemCountInOrder()) { + !Context->ST->hasFlatLgkmVMemCountInOrder()) { // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need // to force a waitcnt 0. @@ -1209,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. unsigned NeededWait = - std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -1237,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - PendingEvents &= ~WaitEventMaskForInst[T]; + PendingEvents &= ~Context->WaitEventMaskForInst[T]; } } @@ -1262,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { // the decrement may go out of order. bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. - if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || + if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; return hasMixedPendingEvents(T); @@ -2386,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { VgprUB = std::max(VgprUB, Other.VgprUB); SgprUB = std::max(SgprUB, Other.SgprUB); - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter + const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst; const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) @@ -2746,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); + WaitEventMaskForInst = WCG->getWaitEventMask(); SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); - HardwareLimits Limits = {}; if (ST->hasExtendedWaitCounts()) { Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); Limits.DscntMax = AMDGPU::getDscntBitMask(IV); @@ -2807,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); } - auto NonKernelInitialState = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this); NonKernelInitialState->setStateOnFunctionEntryOrReturn(); BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); @@ -2839,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) { - Brackets = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + Brackets = std::make_unique<WaitcntBrackets>(this); } else { // Reinitialize in-place. N.B. do not do this by assigning from a // temporary because the WaitcntBrackets class is large and it could // cause this function to use an unreasonable amount of stack space. Brackets->~WaitcntBrackets(); - new (Brackets.get()) WaitcntBrackets( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + new (Brackets.get()) WaitcntBrackets(this); } } |