author     Christudasan Devadasan <Christudasan.Devadasan@amd.com>  2026-01-08 07:53:18 +0000
committer  Christudasan Devadasan <Christudasan.Devadasan@amd.com>  2026-01-13 03:50:24 +0000
commit     070b3e99ac751c9e6cb38201e0eec68d72a55542 (patch)
tree       b798e5d2717b69aa89cccd322f5518d37f9e47c0 /llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
parent     12d1aa0c8430c9d8015bfb285aae7d5e260db8ad (diff)
[InlineSpiller][AMDGPU] Implement subreg reload during RA spill (users/cdevadas/subreg-reload)
Currently, when a virtual register is only partially used, the entire tuple is restored from the spill slot even if only a subset of its sub-registers is needed. This patch adds support for partial reloads by analyzing the actual register usage and restoring only the required sub-registers, which improves register allocation efficiency, particularly for tuple virtual registers. For AMDGPU, this change brings considerable improvements in workloads involving matrix operations, large vectors, and complex control flow.
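
The core of the change is mapping a sub-register's lane mask to the index of the first 32-bit lane it covers and, for the vector restore path, to a byte offset within the spill slot. Below is a minimal standalone sketch of that arithmetic, not the LLVM code itself: it uses the standard <bit> facilities instead of llvm::popcount/llvm::countr_zero, and the computeOffsets helper and SubRegReloadOffsets struct are illustrative names only. It assumes the convention stated in the patch that each 32-bit register contributes two register units, i.e. two adjacent lane-mask bits.

// Standalone sketch of the subreg-reload offset math (illustrative only).
#include <bit>
#include <cassert>
#include <cstdint>
#include <cstdio>

struct SubRegReloadOffsets {
  unsigned LaneIndex;  // offset operand for the SGPR spill pseudo
  unsigned ByteOffset; // offset operand for the vector restore (LaneIndex * 4)
};

static SubRegReloadOffsets computeOffsets(uint64_t LaneMask) {
  // Only whole 32-bit sub-registers are expected, so the mask must have an
  // even number of set bits (two lane-mask bits per 32-bit register).
  assert(std::popcount(LaneMask) % 2 == 0 && "expected only 32-bit subreg access");
  unsigned LaneIndex = std::countr_zero(LaneMask) / 2;
  return {LaneIndex, LaneIndex * 4};
}

int main() {
  // Example: sub3 of a 128-bit tuple covers lane-mask bits 6 and 7
  // (0b11000000), so it starts at 32-bit lane 3, i.e. byte offset 12.
  SubRegReloadOffsets O = computeOffsets(0b11000000);
  std::printf("lane index = %u, byte offset = %u\n", O.LaneIndex, O.ByteOffset);
  return 0;
}

This mirrors the two uses in the diff below: the SGPR spill pseudo takes the lane index directly as its offset operand, while the vector restore path multiplies it by 4 to obtain a byte offset into the slot.
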
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 24
1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7dee976..0b9f56c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1886,12 +1886,23 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const DebugLoc &DL = MBB.findDebugLoc(MI);
unsigned SpillSize = RI.getSpillSize(*RC);
+ unsigned SubRegIdx = 0;
+ if (SubReg) {
+ uint64_t Mask = RI.getSubRegIndexLaneMask(SubReg).getAsInteger();
+ assert(llvm::popcount(Mask) % 2 == 0 &&
+ "expected only 32-bit subreg access");
+
+ // For subreg reload, identify the start offset. Each 32-bit register
+ // consists of two regunits and eventually two bits in the Lanemask.
+ SubRegIdx = llvm::countr_zero(Mask) / 2;
+ }
+
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
- FrameInfo.getObjectAlign(FrameIndex));
+ MachineMemOperand *MMO =
+ MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, SpillSize,
+ FrameInfo.getObjectAlign(FrameIndex));
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
@@ -1911,19 +1922,22 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
- .addImm(0) // offset
+ .addImm(SubRegIdx) // offset
.addMemOperand(MMO)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
return;
}
+ // Convert the subreg index to stack offset.
+ SubRegIdx *= 4;
+
unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
SpillSize, *MFI);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
- .addImm(0) // offset
+ .addImm(SubRegIdx) // offset
.addMemOperand(MMO);
}