| author | Christudasan Devadasan <Christudasan.Devadasan@amd.com> | 2026-01-08 07:53:18 +0000 |
|---|---|---|
| committer | Christudasan Devadasan <Christudasan.Devadasan@amd.com> | 2026-01-13 03:50:24 +0000 |
| commit | 070b3e99ac751c9e6cb38201e0eec68d72a55542 | |
| tree | b798e5d2717b69aa89cccd322f5518d37f9e47c0 | |
| parent | 12d1aa0c8430c9d8015bfb285aae7d5e260db8ad | |
[InlineSpiller][AMDGPU] Implement subreg reload during RA spill
Currently, when a virtual register is partially used, the
entire tuple is restored from the spilled location, even if
only a subset of its sub-registers is needed. This patch
introduces support for partial reloads by analyzing actual
register usage and restoring only the required sub-registers.
This improves register allocation efficiency, particularly
for cases involving tuple virtual registers. For AMDGPU,
the change brings considerable gains in workloads that
involve matrix operations, large vectors, and complex
control flow.
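
The sketch below is an editorial illustration of the covering-lane analysis this patch adds to InlineSpiller::spillAroundUses (see the diff further down); it is not the LLVM code. It assumes the 2-lane-bits-per-32-bit-sub-register encoding described in the patch's comments, and the tuple size and sub-register uses are made-up inputs.

```cpp
// Standalone C++20 sketch, no LLVM dependency. Each 32-bit sub-register of a
// spilled tuple is modelled as two lane-mask bits, per the patch's comments;
// the helper laneMask and all inputs below are hypothetical.
#include <bit>
#include <cstdint>
#include <cstdio>

// Lane mask for the 32-bit sub-registers [FirstSub, FirstSub + NumSubs).
constexpr uint64_t laneMask(unsigned FirstSub, unsigned NumSubs) {
  return ((1ull << (2 * NumSubs)) - 1) << (2 * FirstSub);
}

int main() {
  // An 8 x 32-bit tuple (sub0..sub7) was spilled, but a copy bundle only
  // reads sub1 and sub2.
  uint64_t Covering = laneMask(1, 1) | laneMask(2, 1); // 0x3C

  // Mirror LaneBitmask(bit_ceil(mask) - 1) from the patch: widen to a
  // contiguous mask starting at lane 0. Gaps below the highest used lane are
  // filled in; everything above it is dropped.
  uint64_t Widened = std::bit_ceil(Covering) - 1; // 0x3F -> sub0..sub2

  unsigned LastSub = (63u - std::countl_zero(Widened)) / 2;
  std::printf("reload sub0..sub%u (%u of 8 registers) instead of the full "
              "tuple\n", LastSub, LastSub + 1u);
}
```

Because the widening always extends down to lane 0, the saving comes from skipping the tuple components above the highest sub-register the bundle actually touches; a single non-bundled subreg use keeps its own index and needs no widening.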
Diffstat (limited to 'llvm/lib')
| File | Lines changed |
|---|---|
| llvm/lib/CodeGen/InlineSpiller.cpp | 57 |
| llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 24 |
| llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 14 |
| llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 5 |
4 files changed, 89 insertions, 11 deletions
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 6837030..c567b88f 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -217,7 +217,8 @@ private:
   bool coalesceStackAccess(MachineInstr *MI, Register Reg);
   bool foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>>,
                          MachineInstr *LoadMI = nullptr);
-  void insertReload(Register VReg, SlotIndex, MachineBasicBlock::iterator MI);
+  void insertReload(Register VReg, unsigned SubReg, SlotIndex,
+                    MachineBasicBlock::iterator MI);
   void insertSpill(Register VReg, bool isKill, MachineBasicBlock::iterator MI);
 
   void spillAroundUses(Register Reg);
@@ -1112,14 +1113,14 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
   return true;
 }
 
-void InlineSpiller::insertReload(Register NewVReg,
+void InlineSpiller::insertReload(Register NewVReg, unsigned SubReg,
                                  SlotIndex Idx,
                                  MachineBasicBlock::iterator MI) {
   MachineBasicBlock &MBB = *MI->getParent();
   MachineInstrSpan MIS(MI, &MBB);
 
   TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot,
-                           MRI.getRegClass(NewVReg), Register());
+                           MRI.getRegClass(NewVReg), Register(), SubReg);
 
   LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
@@ -1248,10 +1249,51 @@ void InlineSpiller::spillAroundUses(Register Reg) {
     // Create a new virtual register for spill/fill.
     // FIXME: Infer regclass from instruction alone.
-    Register NewVReg = Edit->createFrom(Reg);
+
+    unsigned SubReg = 0;
+    LaneBitmask CoveringLanes = LaneBitmask::getNone();
+    // If subreg liveness is enabled, identify the subreg use(s) to try a
+    // subreg reload. Skip if the instruction also defines the register.
+    // For copy bundles, get the covering lane masks.
+    if (MRI.subRegLivenessEnabled() && !RI.Writes) {
+      for (auto [MI, OpIdx] : Ops) {
+        const MachineOperand &MO = MI->getOperand(OpIdx);
+        assert(MO.isReg() && MO.getReg() == Reg);
+        if (MO.isUse()) {
+          SubReg = MO.getSubReg();
+          if (SubReg)
+            CoveringLanes |= TRI.getSubRegIndexLaneMask(SubReg);
+        }
+      }
+    }
+
+    if (MI.isBundled() && CoveringLanes.any()) {
+      CoveringLanes = LaneBitmask(bit_ceil(CoveringLanes.getAsInteger()) - 1);
+      // Obtain the covering subregister index, including any missing indices
+      // within the identified range. Although this may be suboptimal due to
+      // gaps in the subregisters that are not part of the copy bundle, it is
+      // beneficial when components outside this range of the original tuple
+      // can be skipped from the reload entirely.
+      SubReg = TRI.getSubRegIdxFromLaneMask(CoveringLanes);
+    }
+
+    // If the target doesn't support subreg reload, fall back to restoring
+    // the full tuple.
+    if (SubReg && !TRI.shouldEnableSubRegReload(SubReg))
+      SubReg = 0;
+
+    const TargetRegisterClass *OrigRC = MRI.getRegClass(Reg);
+    const TargetRegisterClass *NewRC =
+        SubReg ? TRI.getSubRegisterClass(OrigRC, SubReg) : nullptr;
+
+    // Check if the target needs to constrain the RC further.
+    if (NewRC)
+      NewRC = TRI.getConstrainedRegClass(NewRC);
+
+    Register NewVReg = Edit->createFrom(Reg, NewRC);
     if (RI.Reads)
-      insertReload(NewVReg, Idx, &MI);
+      insertReload(NewVReg, SubReg, Idx, &MI);
 
     // Rewrite instruction operands.
     bool hasLiveDef = false;
@@ -1259,7 +1301,10 @@ void InlineSpiller::spillAroundUses(Register Reg) {
       MachineOperand &MO = OpPair.first->getOperand(OpPair.second);
      MO.setReg(NewVReg);
       if (MO.isUse()) {
-        if (!OpPair.first->isRegTiedToDefOperand(OpPair.second))
+        if (SubReg && !MI.isBundled())
+          MO.setSubReg(0);
+        if (!OpPair.first->isRegTiedToDefOperand(OpPair.second) ||
+            (SubReg && !MI.isBundled()))
           MO.setIsKill();
       } else {
         if (!MO.isDead())
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7dee976..0b9f56c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1886,12 +1886,23 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   const DebugLoc &DL = MBB.findDebugLoc(MI);
   unsigned SpillSize = RI.getSpillSize(*RC);
 
+  unsigned SubRegIdx = 0;
+  if (SubReg) {
+    uint64_t Mask = RI.getSubRegIndexLaneMask(SubReg).getAsInteger();
+    assert(llvm::popcount(Mask) % 2 == 0 &&
+           "expected only 32-bit subreg access");
+
+    // For a subreg reload, identify the start offset. Each 32-bit register
+    // consists of two regunits and hence takes two bits in the lane mask.
+    SubRegIdx = llvm::countr_zero(Mask) / 2;
+  }
+
   MachinePointerInfo PtrInfo
     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
 
-  MachineMemOperand *MMO = MF->getMachineMemOperand(
-      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
-      FrameInfo.getObjectAlign(FrameIndex));
+  MachineMemOperand *MMO =
+      MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, SpillSize,
+                               FrameInfo.getObjectAlign(FrameIndex));
 
   if (RI.isSGPRClass(RC)) {
     MFI->setHasSpilledSGPRs();
@@ -1911,19 +1922,22 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
     BuildMI(MBB, MI, DL, OpDesc, DestReg)
         .addFrameIndex(FrameIndex) // addr
-        .addImm(0)                 // offset
+        .addImm(SubRegIdx)         // offset
         .addMemOperand(MMO)
         .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
 
     return;
   }
 
+  // Convert the subreg index to a byte offset into the stack slot.
+  SubRegIdx *= 4;
+
   unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
                                                    SpillSize, *MFI);
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
       .addFrameIndex(FrameIndex)           // vaddr
       .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
-      .addImm(0)                           // offset
+      .addImm(SubRegIdx)                   // offset
       .addMemOperand(MMO);
 }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4777e06..d7d5648 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3634,6 +3634,15 @@ bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
   return RC && isAGPRClass(RC);
 }
 
+bool SIRegisterInfo::shouldEnableSubRegReload(unsigned SubReg) const {
+  // Disable lo16 and hi16 (16-bit) accesses as they are subreg views of the
+  // same 32-bit register and don't represent independent storage. An odd
+  // number of set bits in the lane mask indicates a 16-bit access, since
+  // each 32-bit register consists of two regunits and therefore takes two
+  // bits in the lane mask.
+  return getSubRegIndexLaneMask(SubReg).getNumLanes() % 2 == 0;
+}
+
 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                              MachineFunction &MF) const {
   unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
@@ -3768,6 +3777,11 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
 }
 
 const TargetRegisterClass *
+SIRegisterInfo::getConstrainedRegClass(const TargetRegisterClass *RC) const {
+  return getProperlyAlignedRC(RC);
+}
+
+const TargetRegisterClass *
 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                                  const MachineRegisterInfo &MRI) const {
   const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 4c8e217..60cc776 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -346,6 +346,8 @@ public:
   ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
                                      unsigned EltSize) const;
 
+  bool shouldEnableSubRegReload(unsigned SubReg) const override;
+
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                                MachineFunction &MF) const override;
 
@@ -370,6 +372,9 @@ public:
   }
 
   const TargetRegisterClass *
+  getConstrainedRegClass(const TargetRegisterClass *RC) const override;
+
+  const TargetRegisterClass *
   getConstrainedRegClassForOperand(const MachineOperand &MO,
                                    const MachineRegisterInfo &MRI) const override;
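
To make the new offset arithmetic concrete, here is a small worked example (again an editorial sketch, not LLVM code; the mask value is hypothetical) of how a sub-register lane mask becomes a register index and then a byte offset in the vector-spill path of SIInstrInfo::loadRegFromStackSlot, together with the odd-popcount test that SIRegisterInfo::shouldEnableSubRegReload uses to reject 16-bit lo16/hi16 accesses:

```cpp
// Standalone C++20 sketch, no LLVM dependency. Follows the 2-lane-bits-per-
// 32-bit-register encoding the patch relies on; the mask is an assumed input.
#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  // Lane mask for a sub2_sub3 access into a spilled tuple:
  // sub0 -> bits 0-1, sub1 -> bits 2-3, sub2 -> bits 4-5, sub3 -> bits 6-7.
  uint64_t Mask = 0xF0;

  // shouldEnableSubRegReload: an odd popcount would mean a 16-bit lo16/hi16
  // view of a 32-bit register, which has no independent spill-slot storage.
  if (std::popcount(Mask) % 2 != 0) {
    std::puts("16-bit subreg access: fall back to a full reload");
    return 0;
  }

  // loadRegFromStackSlot: two lane bits per 32-bit register, so the first
  // needed register index is countr_zero(Mask) / 2 ...
  unsigned SubRegIdx = std::countr_zero(Mask) / 2; // 4 / 2 = 2
  // ... and each 32-bit register occupies 4 bytes of the spill slot.
  unsigned ByteOffset = SubRegIdx * 4;             // 8

  std::printf("reload starts at register index %u, byte offset %u into the "
              "spill slot\n", SubRegIdx, ByteOffset);
}
```

In the SGPR path the patch passes the raw register index as the offset operand, while the vector path multiplies it by 4 first; the example prints both values.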
