diff options
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 226
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  62
 2 files changed, 286 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 254b75b784e7..874dfc09ad4e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -23,9 +23,9 @@ /// //===----------------------------------------------------------------------===// -#include "GCNSchedStrategy.h" #include "AMDGPUIGroupLP.h" #include "GCNRegPressure.h" +#include "GCNSchedStrategy.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/STLExtras.h" @@ -33,6 +33,9 @@ #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/ErrorHandling.h" +#include <cmath> +#include <limits> + #define DEBUG_TYPE "machine-scheduler" using namespace llvm; @@ -70,6 +73,79 @@ static cl::opt<bool> GCNTrackers( const unsigned ScheduleMetrics::ScaleFactor = 100; +//===----------------------------------------------------------------------===// +// Optional cost-function mode: command line switches +//===----------------------------------------------------------------------===// + +static cl::opt<bool> UseSchedCostFunction( + "amdgpu-use-cost-function", cl::Hidden, + cl::desc("Enable cost-function-based evaluation to compare candidate" + " schedules in GCNSchedStrategy."), + cl::init(false)); + +static cl::opt<double> SchedCostWeightOccupancy( + "amdgpu-sched-cost-weight-occupancy", cl::Hidden, + cl::desc("Weight for occupancy term in AMDGPU scheduler cost function"), + cl::init(1.0)); +static cl::opt<double> SchedCostWeightLength( + "amdgpu-sched-cost-weight-length", cl::Hidden, + cl::desc("Weight for schedule length term (cycles) in cost function"), + cl::init(1.0)); +static cl::opt<double> SchedCostWeightSpill( + "amdgpu-sched-cost-weight-spill", cl::Hidden, + cl::desc("Weight for spill term; typically much larger than length"), + cl::init(100.0)); + +// Shape the occupancy term: reciprocal exponent and low-occupancy penalty. 
+static cl::opt<double> SchedCostOccExponent( + "amdgpu-sched-cost-occ-exponent", cl::Hidden, + cl::desc("Exponent for occupancy diminishing-returns curve (cost ~ 1/W^exp)"), + cl::init(1.0)); +static cl::opt<unsigned> SchedCostLowOccFloor( + "amdgpu-sched-cost-lowocc-floor", cl::Hidden, + cl::desc("Preferred minimum waves; waves below this get extra penalty"), + cl::init(2)); +static cl::opt<double> SchedCostLowOccPenalty( + "amdgpu-sched-cost-lowocc-penalty", cl::Hidden, + cl::desc("Penalty weight multiplied by (floor - waves) when below floor"), + cl::init(0.0)); + +static cl::opt<bool> UseStageCostDecision( + "amdgpu-use-stage-cost-decision", cl::Hidden, + cl::desc("Defer cost decisions to end of stage using block-frequency" + " weighted totals, instead of per-region immediate reverts"), + cl::init(false)); + +// Helper: concave occupancy utility. Map waves -> diminishing cost reduction. +static inline double occupancyCost(unsigned Waves, double Exp) { + if (Waves == 0) + return std::numeric_limits<double>::infinity(); + // Use reciprocal to get a simple concave utility: higher waves -> smaller cost. + // We scale by a constant so typical ranges produce reasonable magnitudes. + return 1.0 / std::pow(static_cast<double>(Waves), Exp); +} + +double AMDGPUSchedCostFunction::score(unsigned Waves, unsigned LengthCycles, + unsigned SpillUnits, + double BlockFreq) const { + // BlockFreq defaults to 1.0 if unknown; scale length proportionally. + double Freq = BlockFreq > 0.0 ? BlockFreq : 1.0; + + // Occupancy cost: lower if more waves. Concave via reciprocal. + double OccTerm = OccW * occupancyCost(Waves, OccExp); + if (LowOccPenalty > 0.0 && Waves < LowOccFloor) + OccTerm += LowOccPenalty * static_cast<double>(LowOccFloor - Waves); + + // Length cost: cycles weighted by block frequency. + double LenTerm = LenW * (static_cast<double>(LengthCycles) * Freq); + + // Spill cost: heavy penalty. 
Interpret SpillUnits as an estimated count; + // could be number of excess registers or estimated bytes; we keep generic. + double SpillTerm = SpillW * static_cast<double>(SpillUnits); + + return OccTerm + LenTerm + SpillTerm; +} + GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { @@ -1032,6 +1108,12 @@ bool GCNSchedStage::initGCNSchedStage() { return false; LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n"); + if (UseSchedCostFunction && UseStageCostDecision) { + StageCostBefore = 0.0; + StageCostAfter = 0.0; + StageSavedOrder.clear(); + StageSavedPressure.clear(); + } return true; } @@ -1136,6 +1218,21 @@ bool PreRARematStage::initGCNSchedStage() { void GCNSchedStage::finalizeGCNSchedStage() { DAG.finishBlock(); LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); + + if (UseSchedCostFunction && UseStageCostDecision) { + if (StageCostAfter > StageCostBefore) { + LLVM_DEBUG(dbgs() << "[CostFunction] Reverting entire stage: cost before=" + << StageCostBefore << ", cost after=" << StageCostAfter + << "\n"); + for (const auto &It : StageSavedOrder) { + unsigned R = It.getFirst(); + const std::vector<MachineInstr *> &Order = It.getSecond(); + revertRegionToOrder(R, Order); + if (auto P = StageSavedPressure.find(R); P != StageSavedPressure.end()) + DAG.Pressure[R] = P->second; + } + } + } } void UnclusteredHighRPStage::finalizeGCNSchedStage() { @@ -1266,6 +1363,54 @@ void GCNSchedStage::finalizeGCNRegion() { // reason that the original schedule is better. checkScheduling(); + // If deferring cost decision to the end of stage, accumulate per-region + // costs now. Functional reverts still happen in checkScheduling(). 
+ if (UseSchedCostFunction && UseStageCostDecision) { + if (!StageSavedOrder.contains(RegionIdx)) + StageSavedOrder[RegionIdx] = Unsched; + if (!StageSavedPressure.contains(RegionIdx)) + StageSavedPressure[RegionIdx] = PressureBefore; + + ScheduleMetrics MAfter = getScheduleMetrics(DAG); + ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits); + unsigned LengthAfter = MAfter.getLength(); + unsigned LengthBefore = MBefore.getLength(); + + auto EstimateSpill = [&](const GCNRegPressure &P) -> unsigned { + unsigned Spill = 0; + unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs()); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); + unsigned VG = P.getVGPRNum(ST.hasGFX90AInsts()); + unsigned AG = P.getAGPRNum(); + unsigned AV = P.getArchVGPRNum(); + unsigned SG = P.getSGPRNum(); + if (VG > MaxVGPRs) Spill += VG - MaxVGPRs; + if (AV > MaxArchVGPRs) Spill += AV - MaxArchVGPRs; + if (AG > MaxArchVGPRs) Spill += AG - MaxArchVGPRs; + if (SG > MaxSGPRs) Spill += SG - MaxSGPRs; + return Spill; + }; + + unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); + unsigned TargetOcc = std::min( + S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); + unsigned WavesBefore = std::min( + TargetOcc, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); + unsigned WavesAfter = std::min( + TargetOcc, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); + unsigned SpillBefore = EstimateSpill(PressureBefore); + unsigned SpillAfter = EstimateSpill(PressureAfter); + + double BlockFreq = 1.0; // TODO: wire MBFI when available + AMDGPUSchedCostFunction CF(SchedCostWeightOccupancy, + SchedCostWeightLength, SchedCostWeightSpill, + SchedCostOccExponent, SchedCostLowOccFloor, + SchedCostLowOccPenalty); + StageCostBefore += CF.score(WavesBefore, LengthBefore, SpillBefore, BlockFreq); + StageCostAfter += CF.score(WavesAfter, LengthAfter, SpillAfter, BlockFreq); + } + if 
(DAG.RegionsWithIGLPInstrs[RegionIdx] && StageID != GCNSchedStageID::UnclusteredHighRPReschedule) SavedMutations.swap(DAG.Mutations); @@ -1340,7 +1485,70 @@ void GCNSchedStage::checkScheduling() { // Revert if this region's schedule would cause a drop in occupancy or // spilling. - if (shouldRevertScheduling(WavesAfter)) + bool Revert = shouldRevertScheduling(WavesAfter); + + // Optional: cost-function evaluation. Compare previous vs new schedule + // and retain whichever has lower total cost. This only triggers if + // enabled by flag; default behavior remains unchanged. + if (!Revert && UseSchedCostFunction) { + // Compute a simple schedule length estimate using existing metric helper. + // We reuse the bubble/length estimator for the current DAG versus the + // previously saved unscheduled order (DAG.SUnits captures pre-schedule). + ScheduleMetrics MAfter = getScheduleMetrics(DAG); + ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits); + + unsigned LengthAfter = MAfter.getLength(); + unsigned LengthBefore = MBefore.getLength(); + + // Estimate spill cost as excess over hardware maxima as a proxy. + // When in doubt, use the RegionsWithExcessRP bit as an indicator. + auto EstimateSpill = [&](const GCNRegPressure &P) -> unsigned { + unsigned Spill = 0; + // Excess over addressable limits captures risk of spills. 
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs()); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); + unsigned VG = P.getVGPRNum(ST.hasGFX90AInsts()); + unsigned AG = P.getAGPRNum(); + unsigned AV = P.getArchVGPRNum(); + unsigned SG = P.getSGPRNum(); + if (VG > MaxVGPRs) + Spill += VG - MaxVGPRs; + if (AV > MaxArchVGPRs) + Spill += AV - MaxArchVGPRs; + if (AG > MaxArchVGPRs) + Spill += AG - MaxArchVGPRs; + if (SG > MaxSGPRs) + Spill += SG - MaxSGPRs; + return Spill; + }; + + unsigned SpillAfter = EstimateSpill(PressureAfter); + unsigned SpillBefore = EstimateSpill(PressureBefore); + + // Occupancy for before/after. + unsigned WavesBeforeOcc = std::min( + TargetOccupancy, + PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); + unsigned WavesAfterOcc = std::min( + TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); + + // Block frequency weighting: use MBFI if available, else 1.0. The + // MachineSchedContext does not expose MBFI here, so default to 1.0. 
+ double BlockFreq = 1.0; + + AMDGPUSchedCostFunction CF(SchedCostWeightOccupancy, + SchedCostWeightLength, SchedCostWeightSpill, + SchedCostOccExponent, SchedCostLowOccFloor, + SchedCostLowOccPenalty); + double CostBefore = CF.score(WavesBeforeOcc, LengthBefore, SpillBefore, BlockFreq); + double CostAfter = CF.score(WavesAfterOcc, LengthAfter, SpillAfter, BlockFreq); + + if (CostAfter > CostBefore) + Revert = true; + } + + if (Revert) revertScheduling(); else DAG.Pressure[RegionIdx] = PressureAfter; @@ -1633,6 +1841,20 @@ void GCNSchedStage::revertScheduling() { DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); } +void GCNSchedStage::revertRegionToOrder( + unsigned RIdx, const std::vector<MachineInstr *> &SavedOrder) { + auto &Bounds = DAG.Regions[RIdx]; + RegionIdx = RIdx; + DAG.RegionBegin = Bounds.first; + DAG.RegionEnd = Bounds.second; + + std::vector<MachineInstr *> OrigUnsched; + OrigUnsched.swap(Unsched); + Unsched = SavedOrder; + revertScheduling(); + Unsched.swap(OrigUnsched); +} + bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, SlotIndex OriginalIdx, SlotIndex RematIdx) const { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 790370ff8ab4..0e7b6d02a588 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -133,6 +133,58 @@ public: GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; } }; +/// Cost-function based schedule evaluation (optional, off by default). +/// +/// Purpose +/// - Provide a scalar score to compare candidate schedules using +/// tunable weights rather than purely greedy heuristics. 
+/// +/// Inputs +/// - Occupancy in waves (unsigned) +/// - Estimated schedule length in cycles (unsigned) +/// - Estimated spill cost (unsigned units; 0 when no spill is expected) +/// - Block execution frequency (double; typically normalized, defaults to 1) +/// +/// Behavior +/// - Occupancy has diminishing returns: the cost contribution decreases +/// concavely with more waves, so increasing 1→2 waves is more valuable +/// than 8→9. +/// - Spills dominate the cost: each unit of spill is penalized very heavily +/// relative to a single cycle of schedule length. +/// - Schedule length is weighted by basic block execution frequency, so hot +/// blocks count more. +/// +/// Configuration +/// - Weights are controllable via command-line flags: +/// - `-amdgpu-sched-cost-weight-occupancy` +/// - `-amdgpu-sched-cost-weight-length` +/// - `-amdgpu-sched-cost-weight-spill` +/// - Lower scores are better. +class AMDGPUSchedCostFunction { + double OccW = 1.0; + double LenW = 1.0; + double SpillW = 100.0; + // Shape of diminishing returns for occupancy: cost ~ 1 / Waves^OccExp. + double OccExp = 1.0; + // Extra penalty applied when Waves < LowOccFloor to emphasize that + // very low occupancy is particularly harmful (linear deficit penalty). + unsigned LowOccFloor = 2; // e.g. heavily prefer 2+ waves over 1 + double LowOccPenalty = 0.0; + +public: + AMDGPUSchedCostFunction() = default; + AMDGPUSchedCostFunction(double OccWeight, double LenWeight, double SpillWeight, + double OccExponent, unsigned LowOccPrefFloor, + double LowOccPenaltyWeight) + : OccW(OccWeight), LenW(LenWeight), SpillW(SpillWeight), + OccExp(OccExponent), LowOccFloor(LowOccPrefFloor), + LowOccPenalty(LowOccPenaltyWeight) {} + + /// Compute the total schedule score. Lower is better. + double score(unsigned Waves, unsigned LengthCycles, unsigned SpillUnits, + double BlockFreq) const; +}; + /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e. /// maximum number of waves per simd). 
class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy { @@ -338,6 +390,12 @@ protected: std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations; + // Stage-level cost aggregation and saved state for potential rollback. + double StageCostBefore = 0.0; + double StageCostAfter = 0.0; + DenseMap<unsigned, std::vector<MachineInstr *>> StageSavedOrder; + DenseMap<unsigned, GCNRegPressure> StageSavedPressure; + GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG); public: @@ -385,6 +443,10 @@ public: // Attempt to revert scheduling for this region. void revertScheduling(); + // Revert region at index RegionIdx to a previously saved instruction order. + void revertRegionToOrder(unsigned RegionIdx, + const std::vector<MachineInstr *> &SavedOrder); + void advanceRegion() { RegionIdx++; } virtual ~GCNSchedStage() = default; |
