author    Jeffrey Byrnes <jeffrey.byrnes@amd.com>  2025-01-23 17:08:23 -0800
committer GitHub <noreply@github.com>  2025-01-23 17:08:23 -0800
commit    acb7859f075f91b1105c04c37c6aa85db27a898a (patch)
tree      3649d5c5dee305add91d3727d6d9337319f32cb8 /llvm/lib/CodeGen/MachineSink.cpp
parent    24f177df61f673804a612dc48279c517bdecd696 (diff)
[MachineSink] Extend loop sinking capability (#117247)
The current MIR cycle sinking capabilities are rather limited: they only support sinking copies into a single successor block, subject to limits. This patch adds a more aggressive, opt-in option that is not bound by those constraints. The feature tries to "sink" by duplicating any top-level preheader instruction (that we are sure is safe to sink) into every user block, then performs dead-code cleanup. In particular, this is useful in high register pressure (RP) situations when loop bodies have control flow.
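
Conceptually (a made-up C++ illustration, not code from the patch), the transformation trades one long live range for a re-executed cheap instruction:

  extern bool cond(int);
  extern void use1(int), use2(int);

  // Before: v is computed once in the preheader and stays live across the
  // whole loop body, holding a register through both branches.
  void beforeSinking(int a, int b, int n) {
    int v = a * b;
    for (int i = 0; i < n; ++i) {
      if (cond(i))
        use1(v);
      else
        use2(v);
    }
  }

  // After: the computation is duplicated into each user block and the dead
  // preheader instruction is erased, shortening the live range at the cost
  // of redoing the multiply on every iteration.
  void afterSinking(int a, int b, int n) {
    for (int i = 0; i < n; ++i) {
      if (cond(i))
        use1(a * b);
      else
        use2(a * b);
    }
  }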
Diffstat (limited to 'llvm/lib/CodeGen/MachineSink.cpp')
-rw-r--r--  llvm/lib/CodeGen/MachineSink.cpp | 267
1 file changed, 176 insertions(+), 91 deletions(-)
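
The behavior stays opt-in. Assuming the existing cl::opt spellings in this file ("sink-insts-to-avoid-spills" for SinkInstsIntoCycle and "machine-sink-cycle-limit" for SinkIntoCycleLimit; verify against the cl::opt declarations near the top of MachineSink.cpp), a hypothetical invocation looks like:

  # Enable cycle sinking; flag names assumed, not taken from this diff.
  llc -sink-insts-to-avoid-spills -machine-sink-cycle-limit=50 input.ll -o out.s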
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 03d93ce..5cb5c0b 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -45,6 +45,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -113,6 +114,8 @@ STATISTIC(NumSplit, "Number of critical edges split");
STATISTIC(NumCoalesces, "Number of copies coalesced");
STATISTIC(NumPostRACopySink, "Number of copies sunk after RA");
+using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
+
namespace {
class MachineSinking : public MachineFunctionPass {
@@ -128,6 +131,7 @@ class MachineSinking : public MachineFunctionPass {
const MachineBranchProbabilityInfo *MBPI = nullptr;
AliasAnalysis *AA = nullptr;
RegisterClassInfo RegClassInfo;
+ TargetSchedModel SchedModel;
// Remember which edges have been considered for breaking.
SmallSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>, 8>
@@ -161,6 +165,8 @@ class MachineSinking : public MachineFunctionPass {
/// would re-order assignments.
using SeenDbgUser = PointerIntPair<MachineInstr *, 1>;
+ using SinkItem = std::pair<MachineInstr *, MachineBasicBlock *>;
+
/// Record of DBG_VALUE uses of vregs in a block, so that we can identify
/// debug instructions to sink.
SmallDenseMap<unsigned, TinyPtrVector<SeenDbgUser>> SeenDbgUsers;
@@ -255,7 +261,10 @@ private:
void FindCycleSinkCandidates(MachineCycle *Cycle, MachineBasicBlock *BB,
SmallVectorImpl<MachineInstr *> &Candidates);
- bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);
+
+ bool
+ aggressivelySinkIntoCycle(MachineCycle *Cycle, MachineInstr &I,
+ DenseMap<SinkItem, MachineInstr *> &SunkInstrs);
bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
@@ -271,11 +280,14 @@ private:
GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
AllSuccsCache &AllSuccessors) const;
- std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB);
+ std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB,
+ bool UseCache = true);
bool registerPressureSetExceedsLimit(unsigned NRegs,
const TargetRegisterClass *RC,
const MachineBasicBlock &MBB);
+
+ bool registerPressureExceedsLimit(const MachineBasicBlock &MBB);
};
} // end anonymous namespace
@@ -680,6 +692,10 @@ void MachineSinking::FindCycleSinkCandidates(
SmallVectorImpl<MachineInstr *> &Candidates) {
for (auto &MI : *BB) {
LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
+ if (MI.isMetaInstruction()) {
+ LLVM_DEBUG(dbgs() << "CycleSink: not sinking meta instruction\n");
+ continue;
+ }
if (!TII->shouldSink(MI)) {
LLVM_DEBUG(dbgs() << "CycleSink: Instruction not a candidate for this "
"target\n");
@@ -775,31 +791,62 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
if (SinkInstsIntoCycle) {
SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
- for (auto *Cycle : Cycles) {
- MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
- if (!Preheader) {
- LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
- continue;
- }
- SmallVector<MachineInstr *, 8> Candidates;
- FindCycleSinkCandidates(Cycle, Preheader, Candidates);
-
- // Walk the candidates in reverse order so that we start with the use
- // of a def-use chain, if there is any.
- // TODO: Sort the candidates using a cost-model.
- unsigned i = 0;
- for (MachineInstr *I : llvm::reverse(Candidates)) {
- if (i++ == SinkIntoCycleLimit) {
- LLVM_DEBUG(dbgs() << "CycleSink: Limit reached of instructions to "
- "be analysed.");
- break;
+ SchedModel.init(STI);
+ bool HasHighPressure;
+
+ DenseMap<SinkItem, MachineInstr *> SunkInstrs;
+
+ enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
+ for (unsigned Stage = CycleSinkStage::COPY; Stage != CycleSinkStage::END;
+ ++Stage, SunkInstrs.clear()) {
+ HasHighPressure = false;
+
+ for (auto *Cycle : Cycles) {
+ MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+ if (!Preheader) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
+ continue;
}
+ SmallVector<MachineInstr *, 8> Candidates;
+ FindCycleSinkCandidates(Cycle, Preheader, Candidates);
+
+ unsigned i = 0;
+
+ // Walk the candidates in reverse order so that we start with the use
+ // of a def-use chain, if there is any.
+ // TODO: Sort the candidates using a cost-model.
+ for (MachineInstr *I : llvm::reverse(Candidates)) {
+ // CycleSinkStage::COPY: Sink a limited number of copies
+ if (Stage == CycleSinkStage::COPY) {
+ if (i++ == SinkIntoCycleLimit) {
+ LLVM_DEBUG(dbgs()
+ << "CycleSink: Limit reached of instructions to "
+ "be analyzed.");
+ break;
+ }
+
+ if (!I->isCopy())
+ continue;
+ }
- if (!SinkIntoCycle(Cycle, *I))
- break;
- EverMadeChange = true;
- ++NumCycleSunk;
+ // CycleSinkStage::LOW_LATENCY: sink unlimited number of instructions
+ // which the target specifies as low-latency
+ if (Stage == CycleSinkStage::LOW_LATENCY &&
+ !TII->hasLowDefLatency(SchedModel, *I, 0))
+ continue;
+
+ if (!aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs))
+ break;
+ EverMadeChange = true;
+ ++NumCycleSunk;
+ }
+
+ // Recalculate the pressure after sinking
+ if (!HasHighPressure)
+ HasHighPressure = registerPressureExceedsLimit(*Preheader);
}
+ if (!HasHighPressure)
+ break;
}
}
@@ -1055,13 +1102,15 @@ bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI,
}
std::vector<unsigned> &
-MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
+MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB,
+ bool UseCache) {
// Currently to save compiling time, MBB's register pressure will not change
// in one ProcessBlock iteration because of CachedRegisterPressure. but MBB's
// register pressure is changed after sinking any instructions into it.
// FIXME: need a accurate and cheap register pressure estiminate model here.
+
auto RP = CachedRegisterPressure.find(&MBB);
- if (RP != CachedRegisterPressure.end())
+ if (UseCache && RP != CachedRegisterPressure.end())
return RP->second;
RegionPressure Pressure;
@@ -1085,6 +1134,12 @@ MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
}
RPTracker.closeRegion();
+
+ if (RP != CachedRegisterPressure.end()) {
+ CachedRegisterPressure[&MBB] = RPTracker.getPressure().MaxSetPressure;
+ return CachedRegisterPressure[&MBB];
+ }
+
auto It = CachedRegisterPressure.insert(
std::make_pair(&MBB, RPTracker.getPressure().MaxSetPressure));
return It.first->second;
@@ -1103,6 +1158,21 @@ bool MachineSinking::registerPressureSetExceedsLimit(
return false;
}
+// Recalculate RP and check if any pressure set exceeds the set limit.
+bool MachineSinking::registerPressureExceedsLimit(
+ const MachineBasicBlock &MBB) {
+ std::vector<unsigned> BBRegisterPressure = getBBRegisterPressure(MBB, false);
+
+ for (unsigned PS = 0; PS < BBRegisterPressure.size(); ++PS) {
+ if (BBRegisterPressure[PS] >=
+ TRI->getRegPressureSetLimit(*MBB.getParent(), PS)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// isProfitableToSinkTo - Return true if it is profitable to sink MI.
bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
@@ -1581,83 +1651,98 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
return HasAliasedStore;
}
-/// Sink instructions into cycles if profitable. This especially tries to
-/// prevent register spills caused by register pressure if there is little to no
-/// overhead moving instructions into cycles.
-bool MachineSinking::SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I) {
- LLVM_DEBUG(dbgs() << "CycleSink: Finding sink block for: " << I);
+/// Aggressively sink instructions into cycles. This will aggressively try to
+/// sink all instructions in the top-most preheaders in an attempt to reduce RP.
+/// In particular, it will sink into multiple successor blocks without limits
+/// based on the amount of sinking, or the type of ops being sunk (so long as
+/// they are safe to sink).
+bool MachineSinking::aggressivelySinkIntoCycle(
+ MachineCycle *Cycle, MachineInstr &I,
+ DenseMap<SinkItem, MachineInstr *> &SunkInstrs) {
+ // TODO: support instructions with multiple defs
+ if (I.getNumDefs() > 1)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
assert(Preheader && "Cycle sink needs a preheader block");
- MachineBasicBlock *SinkBlock = nullptr;
- bool CanSink = true;
- const MachineOperand &MO = I.getOperand(0);
-
- for (MachineInstr &MI : MRI->use_instructions(MO.getReg())) {
- LLVM_DEBUG(dbgs() << "CycleSink: Analysing use: " << MI);
- if (!Cycle->contains(MI.getParent())) {
- LLVM_DEBUG(dbgs() << "CycleSink: Use not in cycle, can't sink.\n");
- CanSink = false;
- break;
- }
+ SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;
- // FIXME: Come up with a proper cost model that estimates whether sinking
- // the instruction (and thus possibly executing it on every cycle
- // iteration) is more expensive than a register.
- // For now assumes that copies are cheap and thus almost always worth it.
- if (!MI.isCopy()) {
- LLVM_DEBUG(dbgs() << "CycleSink: Use is not a copy\n");
- CanSink = false;
- break;
+ MachineOperand &DefMO = I.getOperand(0);
+ for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
+ Uses.push_back({{DefMO.getReg(), DefMO.getSubReg()}, &MI});
+ }
+
+ for (std::pair<RegSubRegPair, MachineInstr *> Entry : Uses) {
+ MachineInstr *MI = Entry.second;
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Analysing use: " << MI);
+ if (MI->isPHI()) {
+ LLVM_DEBUG(
+ dbgs() << "AggressiveCycleSink: Not attempting to sink for PHI.\n");
+ continue;
}
- if (!SinkBlock) {
- SinkBlock = MI.getParent();
- LLVM_DEBUG(dbgs() << "CycleSink: Setting sink block to: "
- << printMBBReference(*SinkBlock) << "\n");
+ // We cannot sink before the prologue
+ if (MI->isPosition() || TII->isBasicBlockPrologue(*MI)) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Use is BasicBlock prologue, "
+ "can't sink.\n");
continue;
}
- SinkBlock = DT->findNearestCommonDominator(SinkBlock, MI.getParent());
- if (!SinkBlock) {
- LLVM_DEBUG(dbgs() << "CycleSink: Can't find nearest dominator\n");
- CanSink = false;
- break;
+ if (!Cycle->contains(MI->getParent())) {
+ LLVM_DEBUG(
+ dbgs() << "AggressiveCycleSink: Use not in cycle, can't sink.\n");
+ continue;
}
- LLVM_DEBUG(dbgs() << "CycleSink: Setting nearest common dom block: "
- << printMBBReference(*SinkBlock) << "\n");
- }
- if (!CanSink) {
- LLVM_DEBUG(dbgs() << "CycleSink: Can't sink instruction.\n");
- return false;
- }
- if (!SinkBlock) {
- LLVM_DEBUG(dbgs() << "CycleSink: Not sinking, can't find sink block.\n");
- return false;
- }
- if (SinkBlock == Preheader) {
- LLVM_DEBUG(
- dbgs() << "CycleSink: Not sinking, sink block is the preheader\n");
- return false;
- }
- if (SinkBlock->sizeWithoutDebugLargerThan(SinkLoadInstsPerBlockThreshold)) {
- LLVM_DEBUG(
- dbgs() << "CycleSink: Not Sinking, block too large to analyse.\n");
- return false;
- }
+ MachineBasicBlock *SinkBlock = MI->getParent();
+ MachineInstr *NewMI = nullptr;
+ SinkItem MapEntry(&I, SinkBlock);
+
+ auto SI = SunkInstrs.find(MapEntry);
+
+ // Check for the case in which we have already sunk a copy of this
+ // instruction into the user block.
+ if (SI != SunkInstrs.end()) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Already sunk to block: "
+ << printMBBReference(*SinkBlock) << "\n");
+ NewMI = SI->second;
+ }
- LLVM_DEBUG(dbgs() << "CycleSink: Sinking instruction!\n");
- SinkBlock->splice(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()), Preheader,
- I);
+ // Create a copy of the instruction in the use block.
+ if (!NewMI) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Sinking instruction to block: "
+ << printMBBReference(*SinkBlock) << "\n");
+
+ NewMI = I.getMF()->CloneMachineInstr(&I);
+ if (DefMO.getReg().isVirtual()) {
+ const TargetRegisterClass *TRC = MRI->getRegClass(DefMO.getReg());
+ Register DestReg = MRI->createVirtualRegister(TRC);
+ NewMI->substituteRegister(DefMO.getReg(), DestReg, DefMO.getSubReg(),
+ *TRI);
+ }
+ SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
+ NewMI);
+ SunkInstrs.insert({MapEntry, NewMI});
+ }
- // Conservatively clear any kill flags on uses of sunk instruction
- for (MachineOperand &MO : I.operands()) {
- if (MO.isReg() && MO.readsReg())
+ // Conservatively clear any kill flags on uses of sunk instruction
+ for (MachineOperand &MO : NewMI->all_uses()) {
+ assert(MO.isReg() && MO.isUse());
RegsToClearKillFlags.insert(MO.getReg());
- }
+ }
- // The instruction is moved from its basic block, so do not retain the
- // debug information.
- assert(!I.isDebugInstr() && "Should not sink debug inst");
- I.setDebugLoc(DebugLoc());
+ // The instruction is moved from its basic block, so do not retain the
+ // debug information.
+ assert(!NewMI->isDebugInstr() && "Should not sink debug inst");
+ NewMI->setDebugLoc(DebugLoc());
+
+ // Replace the use with the newly created virtual register.
+ RegSubRegPair &UseReg = Entry.first;
+ MI->substituteRegister(UseReg.Reg, NewMI->getOperand(0).getReg(),
+ UseReg.SubReg, *TRI);
+ }
+ // If we have replaced all uses, then delete the dead instruction
+ if (I.isDead(*MRI))
+ I.eraseFromParent();
return true;
}
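
A consolidated view of the staged driver added to runOnMachineFunction above may help when reading the interleaved +/- lines. This is a paraphrase of the patch, lightly simplified (the stage-1 copy budget enforced via SinkIntoCycleLimit, EverMadeChange, and the NumCycleSunk statistic are elided), not a drop-in replacement:

  enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
  for (unsigned Stage = CycleSinkStage::COPY; Stage != CycleSinkStage::END;
       ++Stage, SunkInstrs.clear()) {
    bool HasHighPressure = false;
    for (auto *Cycle : Cycles) {
      MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
      if (!Preheader)
        continue;
      SmallVector<MachineInstr *, 8> Candidates;
      FindCycleSinkCandidates(Cycle, Preheader, Candidates);
      for (MachineInstr *I : llvm::reverse(Candidates)) {
        // Stage 1: sink only (a bounded number of) copies.
        if (Stage == CycleSinkStage::COPY && !I->isCopy())
          continue;
        // Stage 2: additionally sink anything the target reports as
        // low-latency, with no count limit.
        if (Stage == CycleSinkStage::LOW_LATENCY &&
            !TII->hasLowDefLatency(SchedModel, *I, 0))
          continue;
        // Stage 3 (AGGRESSIVE): every candidate deemed safe to sink.
        if (!aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs))
          break;
      }
      // Re-measure pressure after sinking into this cycle's blocks.
      if (!HasHighPressure)
        HasHighPressure = registerPressureExceedsLimit(*Preheader);
    }
    // Later, more aggressive stages run only while some preheader still
    // exceeds a register pressure set limit.
    if (!HasHighPressure)
      break;
  }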