diff options
author | Lucas Ramirez <11032120+lucas-rami@users.noreply.github.com> | 2025-08-08 14:26:04 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-08-08 14:26:04 +0200 |
commit | 83c308f014da00cadbbe9ac7c8fe8a48ff777b76 (patch) | |
tree | 58ad7a7d4b6c027bff2bd596a31af184c23fe044 /llvm/lib | |
parent | ab7281d8969152a2cb0f302fe645e99e58e6d281 (diff) | |
download | llvm-83c308f014da00cadbbe9ac7c8fe8a48ff777b76.zip llvm-83c308f014da00cadbbe9ac7c8fe8a48ff777b76.tar.gz llvm-83c308f014da00cadbbe9ac7c8fe8a48ff777b76.tar.bz2 |
[AMDGPU][Scheduler] Consistent occupancy calculation during rematerialization (#149224)
The `RPTarget`'s way of determining whether VGPRs are beneficial to save
and whether the target has been reached w.r.t. VGPR usage currently
assumes, if `CombinedVGPRSavings` is true, that free slots in one VGPR
RC can always be used for the other. Implicitly, this makes the
rematerialization stage (only current user of `RPTarget`) follow a
different occupancy calculation than the "regular one" that the
scheduler uses, one that assumes that ArchVGPR/AGPR usage can be
balanced perfectly and at no cost, which is untrue in general. This
ultimately yields suboptimal rematerialization decisions that require
cross-VGPR-RC copies unnecessarily.
This fixes that, making the `RPTarget`'s internal model of occupancy
consistent with the regular one. The `CombinedVGPRSavings` flag is
removed, and a form of cross-VGPR-RC saving implemented only for unified
RFs, which is where it makes the most sense. Only when the amount of
free VGPRs in a given VGPR RC (ArchVPGR or AGPR) is lower than the
excess VGPR usage in the other VGPR RC does the `RPTarget` consider that
a pressure reduction in the former will be beneficial to the latter.
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 56 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNRegPressure.h | 43 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 145 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 9 |
4 files changed, 113 insertions, 140 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 334afd3..ef63acc 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, //////////////////////////////////////////////////////////////////////////////// // GCNRPTarget -GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { +GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF); + setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F)); } GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { - setRegLimits(NumSGPRs, NumVGPRs, MF); + const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { + setTarget(NumSGPRs, NumVGPRs); } GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { + const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned DynamicVGPRBlockSize = MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), - ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF); + setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), + ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize)); } -void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF) { +void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs); MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs); - MaxUnifiedVGPRs = - ST.hasGFX90AInsts() - ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs) - : 0; + if (UnifiedRF) { + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxUnifiedVGPRs = + std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs); + } else { + MaxUnifiedVGPRs = 0; + } } -bool GCNRPTarget::isSaveBeneficial(Register Reg, - const MachineRegisterInfo &MRI) const { +bool GCNRPTarget::isSaveBeneficial(Register Reg) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); @@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg, return RP.getSGPRNum() > MaxSGPRs; unsigned NumVGPRs = SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); - return isVGPRBankSaveBeneficial(NumVGPRs); + // The addressable limit must always be respected. + if (NumVGPRs > MaxVGPRs) + return true; + // For unified RFs, combined VGPR usage limit must be respected as well. + return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs; } bool GCNRPTarget::satisfied() const { - if (RP.getSGPRNum() > MaxSGPRs) + if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs) return false; - if (RP.getVGPRNum(false) > MaxVGPRs && - (!CombineVGPRSavings || !satisifiesVGPRBanksTarget())) + if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs) return false; - return satisfiesUnifiedTarget(); + return true; } /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ea33a22..a9c58bb 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -186,20 +186,22 @@ public: /// Sets up the target such that the register pressure starting at \p RP does /// not show register spilling on function \p MF (w.r.t. the function's /// mininum target occupancy). - GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings = false); + GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p /// MF. GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not prevent achieving an occupancy of at least \p Occupancy on function /// \p MF. GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); + + /// Changes the target (same semantics as constructor). + void setTarget(unsigned NumSGPRs, unsigned NumVGPRs); const GCNRegPressure &getCurrentRP() const { return RP; } @@ -207,7 +209,7 @@ public: /// Determines whether saving virtual register \p Reg will be beneficial /// towards achieving the RP target. - bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const; + bool isSaveBeneficial(Register Reg) const; /// Saves virtual register \p Reg with lanemask \p Mask. void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) { @@ -227,15 +229,15 @@ public: if (Target.MaxUnifiedVGPRs) { OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)"; - } else if (Target.CombineVGPRSavings) { - OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/' - << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; } return OS; } #endif private: + const MachineFunction &MF; + const bool UnifiedRF; + /// Current register pressure. GCNRegPressure RP; @@ -246,29 +248,10 @@ private: /// Target number of overall VGPRs for subtargets with unified RFs. Always 0 /// for subtargets with non-unified RFs. unsigned MaxUnifiedVGPRs; - /// Whether we consider that the register allocator will be able to swap - /// between ArchVGPRs and AGPRs by copying them to a super register class. - /// Concretely, this allows savings in one of the VGPR banks to help toward - /// savings in the other VGPR bank. - bool CombineVGPRSavings; - - inline bool satisifiesVGPRBanksTarget() const { - assert(CombineVGPRSavings && "only makes sense with combined savings"); - return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs; - } - - /// Always satisified when the subtarget doesn't have a unified RF. - inline bool satisfiesUnifiedTarget() const { - return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs; - } - - inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const { - return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() || - (CombineVGPRSavings && !satisifiesVGPRBanksTarget()); - } - void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs, - const MachineFunction &MF); + GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF) + : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()), + RP(RP) {} }; /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 96d5668..254b75b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() { } /// Allows to easily filter for this stage's debug output. -#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) +#define REMAT_PREFIX "[PreRARemat] " +#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) bool PreRARematStage::initGCNSchedStage() { // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for @@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() { rematerialize(); if (GCNTrackers) DAG.RegionLiveOuts.buildLiveRegMap(); - REMAT_DEBUG( - dbgs() << "Retrying function scheduling with new min. occupancy of " - << AchievedOcc << " from rematerializing (original was " - << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n"); + REMAT_DEBUG({ + dbgs() << "Retrying function scheduling with new min. occupancy of " + << AchievedOcc << " from rematerializing (original was " + << DAG.MinOccupancy; + if (TargetOcc) + dbgs() << ", target was " << *TargetOcc; + dbgs() << ")\n"; + }); + if (AchievedOcc > DAG.MinOccupancy) { DAG.MinOccupancy = AchievedOcc; SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { return GCNSchedStage::shouldRevertScheduling(WavesAfter) || - mayCauseSpilling(WavesAfter) || - (IncreaseOccupancy && WavesAfter < TargetOcc); + mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc); } bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { @@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, } bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { - REMAT_DEBUG({ - dbgs() << "Collecting rematerializable instructions in "; - MF.getFunction().printAsOperand(dbgs(), false); - dbgs() << '\n'; - }); + const Function &F = MF.getFunction(); // Maps optimizable regions (i.e., regions at minimum and register-limited // occupancy, or regions with spilling) to the target RP we would like to // reach. DenseMap<unsigned, GCNRPTarget> OptRegions; - const Function &F = MF.getFunction(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - - std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); - const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); - const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); - const unsigned MaxSGPRsIncOcc = - ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false); - const unsigned MaxVGPRsIncOcc = - ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize); - IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; - - // Collect optimizable regions. If there is spilling in any region we will - // just try to reduce spilling. Otherwise we will try to increase occupancy by - // one in the whole function. - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - GCNRegPressure &RP = DAG.Pressure[I]; - // We allow ArchVGPR or AGPR savings to count as savings of the other kind - // of VGPR only when trying to eliminate spilling. We cannot do this when - // trying to increase occupancy since VGPR class swaps only occur later in - // the register allocator i.e., the scheduler will not be able to reason - // about these savings and will not report an increase in the achievable - // occupancy, triggering rollbacks. - GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP, - /*CombineVGPRSavings=*/true); - if (!Target.satisfied() && IncreaseOccupancy) { - // There is spilling in the region and we were so far trying to increase - // occupancy. Strop trying that and focus on reducing spilling. - IncreaseOccupancy = false; - OptRegions.clear(); - } else if (IncreaseOccupancy) { - // There is no spilling in the region, try to increase occupancy. - Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP, - /*CombineVGPRSavings=*/false); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); + unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); + auto ResetTargetRegions = [&]() { + OptRegions.clear(); + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + const GCNRegPressure &RP = DAG.Pressure[I]; + GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP); + if (!Target.satisfied()) + OptRegions.insert({I, Target}); } - if (!Target.satisfied()) - OptRegions.insert({I, Target}); - } - if (OptRegions.empty()) - return false; + }; -#ifndef NDEBUG - if (IncreaseOccupancy) { - REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy - << ") in regions:\n"); + ResetTargetRegions(); + if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { + // In addition to register usage being above addressable limits, occupancy + // below the minimum is considered like "spilling" as well. + TargetOcc = std::nullopt; } else { - REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy (" - << WavesPerEU.first << ") in regions:\n"); - } - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) - REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n'); + // There is no spilling and room to improve occupancy; set up "increased + // occupancy targets" for all regions. + TargetOcc = DAG.MinOccupancy + 1; + unsigned VGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false); + MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize); + ResetTargetRegions(); } -#endif - - // When we are reducing spilling, the target is the minimum target number of - // waves/EU determined by the subtarget. In cases where either one of - // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current - // minimum region occupancy may be higher than the latter. - TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 - : std::max(DAG.MinOccupancy, WavesPerEU.first); + REMAT_DEBUG({ + dbgs() << "Analyzing "; + MF.getFunction().printAsOperand(dbgs(), false); + dbgs() << ": "; + if (OptRegions.empty()) { + dbgs() << "no objective to achieve, occupancy is maximal at " + << MFI.getMaxWavesPerEU(); + } else if (!TargetOcc) { + dbgs() << "reduce spilling (minimum target occupancy is " + << MFI.getMinWavesPerEU() << ')'; + } else { + dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to " + << TargetOcc; + } + dbgs() << '\n'; + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) { + dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond() + << '\n'; + } + } + }); + if (OptRegions.empty()) + return false; // Accounts for a reduction in RP in an optimizable region. Returns whether we // estimate that we have identified enough rematerialization opportunities to @@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask, bool &Progress) -> bool { GCNRPTarget &Target = OptIt->getSecond(); - if (!Target.isSaveBeneficial(Reg, DAG.MRI)) + if (!Target.isSaveBeneficial(Reg)) return false; Progress = true; Target.saveReg(Reg, Mask, DAG.MRI); @@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { } } - if (IncreaseOccupancy) { + if (TargetOcc) { // We were trying to increase occupancy but failed, abort the stage. REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n"); Rematerializations.clear(); @@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() { // All regions impacted by at least one rematerialization must be rescheduled. // Maximum pressure must also be recomputed for all regions where it changed // non-predictably and checked against the target occupancy. - AchievedOcc = TargetOcc; + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + AchievedOcc = MFI.getMaxWavesPerEU(); for (auto &[I, OriginalRP] : ImpactedRegions) { bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; RescheduleRegions[I] = !IsEmptyRegion; @@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() { } } DAG.Pressure[I] = RP; - AchievedOcc = std::min( - AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>() - ->getDynamicVGPRBlockSize())); + AchievedOcc = + std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); } REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); } @@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // which case we do not want to rollback either (the rescheduling was already // reverted in PreRARematStage::shouldRevertScheduling in such cases). unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); - if (!IncreaseOccupancy || MaxOcc >= TargetOcc) + if (!TargetOcc || MaxOcc >= *TargetOcc) return; REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 32139a9..790370f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -470,15 +470,12 @@ private: /// After successful stage initialization, indicates which regions should be /// rescheduled. BitVector RescheduleRegions; - /// Target occupancy the stage estimates is reachable through - /// rematerialization. Greater than or equal to the pre-stage min occupancy. - unsigned TargetOcc; + /// The target occupancy the stage is trying to achieve. Empty when the + /// objective is spilling reduction. + std::optional<unsigned> TargetOcc; /// Achieved occupancy *only* through rematerializations (pre-rescheduling). /// Smaller than or equal to the target occupancy. unsigned AchievedOcc; - /// Whether the stage is attempting to increase occupancy in the abscence of - /// spilling. - bool IncreaseOccupancy; /// Returns whether remat can reduce spilling or increase function occupancy /// by 1 through rematerialization. If it can do one, collects instructions in |