diff options
| author | Christudasan Devadasan <Christudasan.Devadasan@amd.com> | 2026-01-08 07:53:18 +0000 |
|---|---|---|
| committer | Christudasan Devadasan <Christudasan.Devadasan@amd.com> | 2026-01-13 03:50:24 +0000 |
| commit | 070b3e99ac751c9e6cb38201e0eec68d72a55542 (patch) | |
| tree | b798e5d2717b69aa89cccd322f5518d37f9e47c0 /llvm/lib/Target | |
| parent | 12d1aa0c8430c9d8015bfb285aae7d5e260db8ad (diff) | |
| download | llvm-users/cdevadas/subreg-reload.zip llvm-users/cdevadas/subreg-reload.tar.gz llvm-users/cdevadas/subreg-reload.tar.bz2 | |
[InlineSpiller][AMDGPU] Implement subreg reload during RA spill (branch: users/cdevadas/subreg-reload)
Currently, when a virtual register is partially used, the
entire tuple is restored from the spilled location, even if
only a subset of its sub-registers is needed. This patch
introduces support for partial reloads by analyzing actual
register usage and restoring only the required sub-registers.
This improvement enhances register allocation efficiency,
particularly for cases involving tuple virtual registers.
For AMDGPU, this change brings considerable improvements
in workloads that involve matrix operations, large vectors,
and complex control flows.
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 24 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 14 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 5 |
3 files changed, 38 insertions, 5 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7dee976..0b9f56c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1886,12 +1886,23 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const DebugLoc &DL = MBB.findDebugLoc(MI); unsigned SpillSize = RI.getSpillSize(*RC); + unsigned SubRegIdx = 0; + if (SubReg) { + uint64_t Mask = RI.getSubRegIndexLaneMask(SubReg).getAsInteger(); + assert(llvm::popcount(Mask) % 2 == 0 && + "expected only 32-bit subreg access"); + + // For subreg reload, identify the start offset. Each 32-bit register + // consists of two regunits and eventually two bits in the Lanemask. + SubRegIdx = llvm::countr_zero(Mask) / 2; + } + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); - MachineMemOperand *MMO = MF->getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), - FrameInfo.getObjectAlign(FrameIndex)); + MachineMemOperand *MMO = + MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, SpillSize, + FrameInfo.getObjectAlign(FrameIndex)); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); @@ -1911,19 +1922,22 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); BuildMI(MBB, MI, DL, OpDesc, DestReg) .addFrameIndex(FrameIndex) // addr - .addImm(0) // offset + .addImm(SubRegIdx) // offset .addMemOperand(MMO) .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); return; } + // Convert the subreg index to stack offset. + SubRegIdx *= 4; + unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? 
VReg : DestReg, RC, SpillSize, *MFI); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset - .addImm(0) // offset + .addImm(SubRegIdx) // offset .addMemOperand(MMO); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 4777e06..d7d5648 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3634,6 +3634,15 @@ bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, return RC && isAGPRClass(RC); } +bool SIRegisterInfo::shouldEnableSubRegReload(unsigned SubReg) const { + // Disable lo16 and hi16 (16-bit) accesses as they are subreg views of the + // same 32-bit register and don't represent independent storage. If the number + // of bits set in the mask is odd, it indicates the presence of a 16-bit + // access as each 32-bit register consists of two Regunits and they take two + // bits in the regmask. + return getSubRegIndexLaneMask(SubReg).getNumLanes() % 2 == 0; +} + unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; @@ -3768,6 +3777,11 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, } const TargetRegisterClass * +SIRegisterInfo::getConstrainedRegClass(const TargetRegisterClass *RC) const { + return getProperlyAlignedRC(RC); +} + +const TargetRegisterClass * SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const { const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 4c8e217..60cc776 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -346,6 +346,8 @@ public: ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, 
unsigned EltSize) const; + bool shouldEnableSubRegReload(unsigned SubReg) const override; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; @@ -370,6 +372,9 @@ public: } const TargetRegisterClass * + getConstrainedRegClass(const TargetRegisterClass *RC) const override; + + const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override; |
