diff options
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 226
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  62
 2 files changed, 286 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 254b75b784e7..874dfc09ad4e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -23,9 +23,9 @@ /// //===----------------------------------------------------------------------===// -#include "GCNSchedStrategy.h" #include "AMDGPUIGroupLP.h" #include "GCNRegPressure.h" +#include "GCNSchedStrategy.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/STLExtras.h" @@ -33,6 +33,9 @@ #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/ErrorHandling.h" +#include <cmath> +#include <limits> + #define DEBUG_TYPE "machine-scheduler" using namespace llvm; @@ -70,6 +73,79 @@ static cl::opt<bool> GCNTrackers( const unsigned ScheduleMetrics::ScaleFactor = 100; +//===----------------------------------------------------------------------===// +// Optional cost-function mode: command line switches +//===----------------------------------------------------------------------===// + +static cl::opt<bool> UseSchedCostFunction( + "amdgpu-use-cost-function", cl::Hidden, + cl::desc("Enable cost-function-based evaluation to compare candidate" + " schedules in GCNSchedStrategy."), + cl::init(false)); + +static cl::opt<double> SchedCostWeightOccupancy( + "amdgpu-sched-cost-weight-occupancy", cl::Hidden, + cl::desc("Weight for occupancy term in AMDGPU scheduler cost function"), + cl::init(1.0)); +static cl::opt<double> SchedCostWeightLength( + "amdgpu-sched-cost-weight-length", cl::Hidden, + cl::desc("Weight for schedule length term (cycles) in cost function"), + cl::init(1.0)); +static cl::opt<double> SchedCostWeightSpill( + "amdgpu-sched-cost-weight-spill", cl::Hidden, + cl::desc("Weight for spill term; typically much larger than length"), + cl::init(100.0)); + +// Shape the occupancy term: reciprocal exponent and low-occupancy penalty. 
+static cl::opt<double> SchedCostOccExponent( + "amdgpu-sched-cost-occ-exponent", cl::Hidden, + cl::desc("Exponent for occupancy diminishing-returns curve (cost ~ 1/W^exp)"), + cl::init(1.0)); +static cl::opt<unsigned> SchedCostLowOccFloor( + "amdgpu-sched-cost-lowocc-floor", cl::Hidden, + cl::desc("Preferred minimum waves; waves below this get extra penalty"), + cl::init(2)); +static cl::opt<double> SchedCostLowOccPenalty( + "amdgpu-sched-cost-lowocc-penalty", cl::Hidden, + cl::desc("Penalty weight multiplied by (floor - waves) when below floor"), + cl::init(0.0)); + +static cl::opt<bool> UseStageCostDecision( + "amdgpu-use-stage-cost-decision", cl::Hidden, + cl::desc("Defer cost decisions to end of stage using block-frequency" + " weighted totals, instead of per-region immediate reverts"), + cl::init(false)); + +// Helper: concave occupancy utility. Map waves -> diminishing cost reduction. +static inline double occupancyCost(unsigned Waves, double Exp) { + if (Waves == 0) + return std::numeric_limits<double>::infinity(); + // Use reciprocal to get a simple concave utility: higher waves -> smaller cost. + // We scale by a constant so typical ranges produce reasonable magnitudes. + return 1.0 / std::pow(static_cast<double>(Waves), Exp); +} + +double AMDGPUSchedCostFunction::score(unsigned Waves, unsigned LengthCycles, + unsigned SpillUnits, + double BlockFreq) const { + // BlockFreq defaults to 1.0 if unknown; scale length proportionally. + double Freq = BlockFreq > 0.0 ? BlockFreq : 1.0; + + // Occupancy cost: lower if more waves. Concave via reciprocal. + double OccTerm = OccW * occupancyCost(Waves, OccExp); + if (LowOccPenalty > 0.0 && Waves < LowOccFloor) + OccTerm += LowOccPenalty * static_cast<double>(LowOccFloor - Waves); + + // Length cost: cycles weighted by block frequency. + double LenTerm = LenW * (static_cast<double>(LengthCycles) * Freq); + + // Spill cost: heavy penalty. 
Interpret SpillUnits as an estimated count; + // could be number of excess registers or estimated bytes; we keep generic. + double SpillTerm = SpillW * static_cast<double>(SpillUnits); + + return OccTerm + LenTerm + SpillTerm; +} + GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { @@ -1032,6 +1108,12 @@ bool GCNSchedStage::initGCNSchedStage() { return false; LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n"); + if (UseSchedCostFunction && UseStageCostDecision) { + StageCostBefore = 0.0; + StageCostAfter = 0.0; + StageSavedOrder.clear(); + StageSavedPressure.clear(); + } return true; } @@ -1136,6 +1218,21 @@ bool PreRARematStage::initGCNSchedStage() { void GCNSchedStage::finalizeGCNSchedStage() { DAG.finishBlock(); LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); + + if (UseSchedCostFunction && UseStageCostDecision) { + if (StageCostAfter > StageCostBefore) { + LLVM_DEBUG(dbgs() << "[CostFunction] Reverting entire stage: cost before=" + << StageCostBefore << ", cost after=" << StageCostAfter + << "\n"); + for (const auto &It : StageSavedOrder) { + unsigned R = It.getFirst(); + const std::vector<MachineInstr *> &Order = It.getSecond(); + revertRegionToOrder(R, Order); + if (auto P = StageSavedPressure.find(R); P != StageSavedPressure.end()) + DAG.Pressure[R] = P->second; + } + } + } } void UnclusteredHighRPStage::finalizeGCNSchedStage() { @@ -1266,6 +1363,54 @@ void GCNSchedStage::finalizeGCNRegion() { // reason that the original schedule is better. checkScheduling(); + // If deferring cost decision to the end of stage, accumulate per-region + // costs now. Functional reverts still happen in checkScheduling(). 
+ if (UseSchedCostFunction && UseStageCostDecision) { + if (!StageSavedOrder.contains(RegionIdx)) + StageSavedOrder[RegionIdx] = Unsched; + if (!StageSavedPressure.contains(RegionIdx)) + StageSavedPressure[RegionIdx] = PressureBefore; + + ScheduleMetrics MAfter = getScheduleMetrics(DAG); + ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits); + unsigned LengthAfter = MAfter.getLength(); + unsigned LengthBefore = MBefore.getLength(); + + auto EstimateSpill = [&](const GCNRegPressure &P) -> unsigned { + unsigned Spill = 0; + unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs()); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); + unsigned VG = P.getVGPRNum(ST.hasGFX90AInsts()); + unsigned AG = P.getAGPRNum(); + unsigned AV = P.getArchVGPRNum(); + unsigned SG = P.getSGPRNum(); + if (VG > MaxVGPRs) Spill += VG - MaxVGPRs; + if (AV > MaxArchVGPRs) Spill += AV - MaxArchVGPRs; + if (AG > MaxArchVGPRs) Spill += AG - MaxArchVGPRs; + if (SG > MaxSGPRs) Spill += SG - MaxSGPRs; + return Spill; + }; + + unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); + unsigned TargetOcc = std::min( + S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); + unsigned WavesBefore = std::min( + TargetOcc, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); + unsigned WavesAfter = std::min( + TargetOcc, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); + unsigned SpillBefore = EstimateSpill(PressureBefore); + unsigned SpillAfter = EstimateSpill(PressureAfter); + + double BlockFreq = 1.0; // TODO: wire MBFI when available + AMDGPUSchedCostFunction CF(SchedCostWeightOccupancy, + SchedCostWeightLength, SchedCostWeightSpill, + SchedCostOccExponent, SchedCostLowOccFloor, + SchedCostLowOccPenalty); + StageCostBefore += CF.score(WavesBefore, LengthBefore, SpillBefore, BlockFreq); + StageCostAfter += CF.score(WavesAfter, LengthAfter, SpillAfter, BlockFreq); + } + if 
(DAG.RegionsWithIGLPInstrs[RegionIdx] && StageID != GCNSchedStageID::UnclusteredHighRPReschedule) SavedMutations.swap(DAG.Mutations); @@ -1340,7 +1485,70 @@ void GCNSchedStage::checkScheduling() { // Revert if this region's schedule would cause a drop in occupancy or // spilling. - if (shouldRevertScheduling(WavesAfter)) + bool Revert = shouldRevertScheduling(WavesAfter); + + // Optional: cost-function evaluation. Compare previous vs new schedule + // and retain whichever has lower total cost. This only triggers if + // enabled by flag; default behavior remains unchanged. + if (!Revert && UseSchedCostFunction) { + // Compute a simple schedule length estimate using existing metric helper. + // We reuse the bubble/length estimator for the current DAG versus the + // previously saved unscheduled order (DAG.SUnits captures pre-schedule). + ScheduleMetrics MAfter = getScheduleMetrics(DAG); + ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits); + + unsigned LengthAfter = MAfter.getLength(); + unsigned LengthBefore = MBefore.getLength(); + + // Estimate spill cost as excess over hardware maxima as a proxy. + // When in doubt, use the RegionsWithExcessRP bit as an indicator. + auto EstimateSpill = [&](const GCNRegPressure &P) -> unsigned { + unsigned Spill = 0; + // Excess over addressable limits captures risk of spills. 
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs()); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); + unsigned VG = P.getVGPRNum(ST.hasGFX90AInsts()); + unsigned AG = P.getAGPRNum(); + unsigned AV = P.getArchVGPRNum(); + unsigned SG = P.getSGPRNum(); + if (VG > MaxVGPRs) + Spill += VG - MaxVGPRs; + if (AV > MaxArchVGPRs) + Spill += AV - MaxArchVGPRs; + if (AG > MaxArchVGPRs) + Spill += AG - MaxArchVGPRs; + if (SG > MaxSGPRs) + Spill += SG - MaxSGPRs; + return Spill; + }; + + unsigned SpillAfter = EstimateSpill(PressureAfter); + unsigned SpillBefore = EstimateSpill(PressureBefore); + + // Occupancy for before/after. + unsigned WavesBeforeOcc = std::min( + TargetOccupancy, + PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); + unsigned WavesAfterOcc = std::min( + TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); + + // Block frequency weighting: use MBFI if available, else 1.0. The + // MachineSchedContext does not expose MBFI here, so default to 1.0. 
+ double BlockFreq = 1.0; + + AMDGPUSchedCostFunction CF(SchedCostWeightOccupancy, + SchedCostWeightLength, SchedCostWeightSpill, + SchedCostOccExponent, SchedCostLowOccFloor, + SchedCostLowOccPenalty); + double CostBefore = CF.score(WavesBeforeOcc, LengthBefore, SpillBefore, BlockFreq); + double CostAfter = CF.score(WavesAfterOcc, LengthAfter, SpillAfter, BlockFreq); + + if (CostAfter > CostBefore) + Revert = true; + } + + if (Revert) revertScheduling(); else DAG.Pressure[RegionIdx] = PressureAfter; @@ -1633,6 +1841,20 @@ void GCNSchedStage::revertScheduling() { DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); } +void GCNSchedStage::revertRegionToOrder( + unsigned RIdx, const std::vector<MachineInstr *> &SavedOrder) { + auto &Bounds = DAG.Regions[RIdx]; + RegionIdx = RIdx; + DAG.RegionBegin = Bounds.first; + DAG.RegionEnd = Bounds.second; + + std::vector<MachineInstr *> OrigUnsched; + OrigUnsched.swap(Unsched); + Unsched = SavedOrder; + revertScheduling(); + Unsched.swap(OrigUnsched); +} + bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, SlotIndex OriginalIdx, SlotIndex RematIdx) const { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 790370ff8ab4..0e7b6d02a588 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -133,6 +133,58 @@ public: GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; } }; +/// Cost-function based schedule evaluation (optional, off by default). +/// +/// Purpose +/// - Provide a scalar score to compare candidate schedules using +/// tunable weights rather than purely greedy heuristics. 
+/// +/// Inputs +/// - Occupancy in waves (unsigned) +/// - Estimated schedule length in cycles (unsigned) +/// - Estimated spill cost (unsigned units; 0 when no spill is expected) +/// - Block execution frequency (double; typically normalized, defaults to 1) +/// +/// Behavior +/// - Occupancy has diminishing returns: the cost contribution decreases +/// concavely with more waves, so increasing 1→2 waves is more valuable +/// than 8→9. +/// - Spills dominate the cost: each unit of spill is penalized very heavily +/// relative to a single cycle of schedule length. +/// - Schedule length is weighted by basic block execution frequency, so hot +/// blocks count more. +/// +/// Configuration +/// - Weights are controllable via command-line flags: +/// - `-amdgpu-sched-cost-weight-occupancy` +/// - `-amdgpu-sched-cost-weight-length` +/// - `-amdgpu-sched-cost-weight-spill` +/// - Lower scores are better. +class AMDGPUSchedCostFunction { + double OccW = 1.0; + double LenW = 1.0; + double SpillW = 100.0; + // Shape of diminishing returns for occupancy: cost ~ 1 / Waves^OccExp. + double OccExp = 1.0; + // Extra penalty applied when Waves < LowOccFloor to emphasize that + // very low occupancy is particularly harmful (linear deficit penalty). + unsigned LowOccFloor = 2; // e.g. heavily prefer 2+ waves over 1 + double LowOccPenalty = 0.0; + +public: + AMDGPUSchedCostFunction() = default; + AMDGPUSchedCostFunction(double OccWeight, double LenWeight, double SpillWeight, + double OccExponent, unsigned LowOccPrefFloor, + double LowOccPenaltyWeight) + : OccW(OccWeight), LenW(LenWeight), SpillW(SpillWeight), + OccExp(OccExponent), LowOccFloor(LowOccPrefFloor), + LowOccPenalty(LowOccPenaltyWeight) {} + + /// Compute the total schedule score. Lower is better. + double score(unsigned Waves, unsigned LengthCycles, unsigned SpillUnits, + double BlockFreq) const; +}; + /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e. /// maximum number of waves per simd). 
class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy { @@ -338,6 +390,12 @@ protected: std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations; + // Stage-level cost aggregation and saved state for potential rollback. + double StageCostBefore = 0.0; + double StageCostAfter = 0.0; + DenseMap<unsigned, std::vector<MachineInstr *>> StageSavedOrder; + DenseMap<unsigned, GCNRegPressure> StageSavedPressure; + GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG); public: @@ -385,6 +443,10 @@ public: // Attempt to revert scheduling for this region. void revertScheduling(); + // Revert region at index RegionIdx to a previously saved instruction order. + void revertRegionToOrder(unsigned RegionIdx, + const std::vector<MachineInstr *> &SavedOrder); + void advanceRegion() { RegionIdx++; } virtual ~GCNSchedStage() = default; |
