aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp85
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.cpp56
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h43
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp145
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.h9
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h15
10 files changed, 172 insertions, 219 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 2a324e5..626734a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -997,89 +997,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const Function &F = MF.getFunction();
// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
- // dispatch registers are function args.
- unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
-
- if (isShader(F.getCallingConv())) {
- bool IsPixelShader =
- F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
-
- // Calculate the number of VGPR registers based on the SPI input registers
- uint32_t InputEna = 0;
- uint32_t InputAddr = 0;
- unsigned LastEna = 0;
-
- if (IsPixelShader) {
- // Note for IsPixelShader:
- // By this stage, all enabled inputs are tagged in InputAddr as well.
- // We will use InputAddr to determine whether the input counts against the
- // vgpr total and only use the InputEnable to determine the last input
- // that is relevant - if extra arguments are used, then we have to honour
- // the InputAddr for any intermediate non-enabled inputs.
- InputEna = MFI->getPSInputEnable();
- InputAddr = MFI->getPSInputAddr();
-
- // We only need to consider input args up to the last used arg.
- assert((InputEna || InputAddr) &&
- "PSInputAddr and PSInputEnable should "
- "never both be 0 for AMDGPU_PS shaders");
- // There are some rare circumstances where InputAddr is non-zero and
- // InputEna can be set to 0. In this case we default to setting LastEna
- // to 1.
- LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
- }
+ // dispatch registers as function args.
+ unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
+ WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
- // FIXME: We should be using the number of registers determined during
- // calling convention lowering to legalize the types.
- const DataLayout &DL = F.getDataLayout();
- unsigned PSArgCount = 0;
- unsigned IntermediateVGPR = 0;
- for (auto &Arg : F.args()) {
- unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
- if (Arg.hasAttribute(Attribute::InReg)) {
- WaveDispatchNumSGPR += NumRegs;
- } else {
- // If this is a PS shader and we're processing the PS Input args (first
- // 16 VGPR), use the InputEna and InputAddr bits to define how many
- // VGPRs are actually used.
- // Any extra VGPR arguments are handled as normal arguments (and
- // contribute to the VGPR count whether they're used or not).
- if (IsPixelShader && PSArgCount < 16) {
- if ((1 << PSArgCount) & InputAddr) {
- if (PSArgCount < LastEna)
- WaveDispatchNumVGPR += NumRegs;
- else
- IntermediateVGPR += NumRegs;
- }
- PSArgCount++;
- } else {
- // If there are extra arguments we have to include the allocation for
- // the non-used (but enabled with InputAddr) input arguments
- if (IntermediateVGPR) {
- WaveDispatchNumVGPR += IntermediateVGPR;
- IntermediateVGPR = 0;
- }
- WaveDispatchNumVGPR += NumRegs;
- }
- }
- }
+ if (WaveDispatchNumSGPR) {
ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
- {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
+ {ProgInfo.NumSGPR,
+ MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
+ Ctx)},
+ Ctx);
+ }
+ if (WaveDispatchNumVGPR) {
ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
- } else if (isKernel(F.getCallingConv()) &&
- MFI->getNumKernargPreloadedSGPRs()) {
- // Consider cases where the total number of UserSGPRs with trailing
- // allocated preload SGPRs, is greater than the number of explicitly
- // referenced SGPRs.
- const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
- CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
- ProgInfo.NumSGPR =
- AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 3d8d274..64a9bde 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
++i;
}
+ if (Info->getNumKernargPreloadedSGPRs())
+ Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
+
TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
return true;
@@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!determineAssignments(Assigner, SplitArgs, CCInfo))
return false;
+ if (IsEntryFunc) {
+ // This assumes the registers are allocated by CCInfo in ascending order
+ // with no gaps.
+ Info->setNumWaveDispatchSGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+ Info->setNumWaveDispatchVGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+ }
+
FormalArgHandler Handler(B, MRI);
if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index f580f43..c21a9a1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -109,12 +109,17 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// Find AV_* registers assigned to AGPRs.
const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg);
- if (!TRI.isVectorSuperClass(VirtRegRC))
+ if (!TRI.hasAGPRs(VirtRegRC))
continue;
- const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
- if (!TRI.isAGPRClass(AssignedRC))
- continue;
+ const TargetRegisterClass *AssignedRC = VirtRegRC;
+ if (TRI.hasVGPRs(VirtRegRC)) {
+ // If this is an AV register, we have to check if the actual assignment is
+ // to an AGPR
+ AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
+ if (!TRI.isAGPRClass(AssignedRC))
+ continue;
+ }
LiveInterval &LI = LIS.getInterval(VReg);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 334afd3..ef63acc 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
////////////////////////////////////////////////////////////////////////////////
// GCNRPTarget
-GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF);
+ setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F));
}
GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
- setRegLimits(NumSGPRs, NumVGPRs, MF);
+ const MachineFunction &MF, const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
+ setTarget(NumSGPRs, NumVGPRs);
}
GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+ const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
- setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
- ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF);
+ setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
+ ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize));
}
-void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF) {
+void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs);
MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs);
- MaxUnifiedVGPRs =
- ST.hasGFX90AInsts()
- ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs)
- : 0;
+ if (UnifiedRF) {
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ MaxUnifiedVGPRs =
+ std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs);
+ } else {
+ MaxUnifiedVGPRs = 0;
+ }
}
-bool GCNRPTarget::isSaveBeneficial(Register Reg,
- const MachineRegisterInfo &MRI) const {
+bool GCNRPTarget::isSaveBeneficial(Register Reg) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
@@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg,
return RP.getSGPRNum() > MaxSGPRs;
unsigned NumVGPRs =
SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();
- return isVGPRBankSaveBeneficial(NumVGPRs);
+ // The addressable limit must always be respected.
+ if (NumVGPRs > MaxVGPRs)
+ return true;
+ // For unified RFs, combined VGPR usage limit must be respected as well.
+ return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs;
}
bool GCNRPTarget::satisfied() const {
- if (RP.getSGPRNum() > MaxSGPRs)
+ if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs)
return false;
- if (RP.getVGPRNum(false) > MaxVGPRs &&
- (!CombineVGPRSavings || !satisifiesVGPRBanksTarget()))
+ if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs)
return false;
- return satisfiesUnifiedTarget();
+ return true;
}
///////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ea33a22..a9c58bb 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -186,20 +186,22 @@ public:
/// Sets up the target such that the register pressure starting at \p RP does
/// not show register spilling on function \p MF (w.r.t. the function's
/// mininum target occupancy).
- GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings = false);
+ GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP);
/// Sets up the target such that the register pressure starting at \p RP does
/// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p
/// MF.
GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ const GCNRegPressure &RP);
/// Sets up the target such that the register pressure starting at \p RP does
/// not prevent achieving an occupancy of at least \p Occupancy on function
/// \p MF.
GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ const GCNRegPressure &RP);
+
+ /// Changes the target (same semantics as constructor).
+ void setTarget(unsigned NumSGPRs, unsigned NumVGPRs);
const GCNRegPressure &getCurrentRP() const { return RP; }
@@ -207,7 +209,7 @@ public:
/// Determines whether saving virtual register \p Reg will be beneficial
/// towards achieving the RP target.
- bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const;
+ bool isSaveBeneficial(Register Reg) const;
/// Saves virtual register \p Reg with lanemask \p Mask.
void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) {
@@ -227,15 +229,15 @@ public:
if (Target.MaxUnifiedVGPRs) {
OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs
<< " VGPRs (unified)";
- } else if (Target.CombineVGPRSavings) {
- OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/'
- << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
}
return OS;
}
#endif
private:
+ const MachineFunction &MF;
+ const bool UnifiedRF;
+
/// Current register pressure.
GCNRegPressure RP;
@@ -246,29 +248,10 @@ private:
/// Target number of overall VGPRs for subtargets with unified RFs. Always 0
/// for subtargets with non-unified RFs.
unsigned MaxUnifiedVGPRs;
- /// Whether we consider that the register allocator will be able to swap
- /// between ArchVGPRs and AGPRs by copying them to a super register class.
- /// Concretely, this allows savings in one of the VGPR banks to help toward
- /// savings in the other VGPR bank.
- bool CombineVGPRSavings;
-
- inline bool satisifiesVGPRBanksTarget() const {
- assert(CombineVGPRSavings && "only makes sense with combined savings");
- return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs;
- }
-
- /// Always satisified when the subtarget doesn't have a unified RF.
- inline bool satisfiesUnifiedTarget() const {
- return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs;
- }
-
- inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const {
- return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() ||
- (CombineVGPRSavings && !satisifiesVGPRBanksTarget());
- }
- void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs,
- const MachineFunction &MF);
+ GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF)
+ : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()),
+ RP(RP) {}
};
///////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 96d5668..254b75b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
}
/// Allows to easily filter for this stage's debug output.
-#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
+#define REMAT_PREFIX "[PreRARemat] "
+#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
@@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() {
rematerialize();
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
- REMAT_DEBUG(
- dbgs() << "Retrying function scheduling with new min. occupancy of "
- << AchievedOcc << " from rematerializing (original was "
- << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
+ REMAT_DEBUG({
+ dbgs() << "Retrying function scheduling with new min. occupancy of "
+ << AchievedOcc << " from rematerializing (original was "
+ << DAG.MinOccupancy;
+ if (TargetOcc)
+ dbgs() << ", target was " << *TargetOcc;
+ dbgs() << ")\n";
+ });
+
if (AchievedOcc > DAG.MinOccupancy) {
DAG.MinOccupancy = AchievedOcc;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
- mayCauseSpilling(WavesAfter) ||
- (IncreaseOccupancy && WavesAfter < TargetOcc);
+ mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
}
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
- REMAT_DEBUG({
- dbgs() << "Collecting rematerializable instructions in ";
- MF.getFunction().printAsOperand(dbgs(), false);
- dbgs() << '\n';
- });
+ const Function &F = MF.getFunction();
// Maps optimizable regions (i.e., regions at minimum and register-limited
// occupancy, or regions with spilling) to the target RP we would like to
// reach.
DenseMap<unsigned, GCNRPTarget> OptRegions;
- const Function &F = MF.getFunction();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
-
- std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
- const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
- const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
- const unsigned MaxSGPRsIncOcc =
- ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
- const unsigned MaxVGPRsIncOcc =
- ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
- IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
-
- // Collect optimizable regions. If there is spilling in any region we will
- // just try to reduce spilling. Otherwise we will try to increase occupancy by
- // one in the whole function.
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- GCNRegPressure &RP = DAG.Pressure[I];
- // We allow ArchVGPR or AGPR savings to count as savings of the other kind
- // of VGPR only when trying to eliminate spilling. We cannot do this when
- // trying to increase occupancy since VGPR class swaps only occur later in
- // the register allocator i.e., the scheduler will not be able to reason
- // about these savings and will not report an increase in the achievable
- // occupancy, triggering rollbacks.
- GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP,
- /*CombineVGPRSavings=*/true);
- if (!Target.satisfied() && IncreaseOccupancy) {
- // There is spilling in the region and we were so far trying to increase
- // occupancy. Strop trying that and focus on reducing spilling.
- IncreaseOccupancy = false;
- OptRegions.clear();
- } else if (IncreaseOccupancy) {
- // There is no spilling in the region, try to increase occupancy.
- Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
- /*CombineVGPRSavings=*/false);
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
+ auto ResetTargetRegions = [&]() {
+ OptRegions.clear();
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ const GCNRegPressure &RP = DAG.Pressure[I];
+ GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
+ if (!Target.satisfied())
+ OptRegions.insert({I, Target});
}
- if (!Target.satisfied())
- OptRegions.insert({I, Target});
- }
- if (OptRegions.empty())
- return false;
+ };
-#ifndef NDEBUG
- if (IncreaseOccupancy) {
- REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy
- << ") in regions:\n");
+ ResetTargetRegions();
+ if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
+ // In addition to register usage being above addressable limits, occupancy
+ // below the minimum is considered like "spilling" as well.
+ TargetOcc = std::nullopt;
} else {
- REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy ("
- << WavesPerEU.first << ") in regions:\n");
- }
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
- REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n');
+ // There is no spilling and room to improve occupancy; set up "increased
+ // occupancy targets" for all regions.
+ TargetOcc = DAG.MinOccupancy + 1;
+ unsigned VGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
+ MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
+ ResetTargetRegions();
}
-#endif
-
- // When we are reducing spilling, the target is the minimum target number of
- // waves/EU determined by the subtarget. In cases where either one of
- // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current
- // minimum region occupancy may be higher than the latter.
- TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
- : std::max(DAG.MinOccupancy, WavesPerEU.first);
+ REMAT_DEBUG({
+ dbgs() << "Analyzing ";
+ MF.getFunction().printAsOperand(dbgs(), false);
+ dbgs() << ": ";
+ if (OptRegions.empty()) {
+ dbgs() << "no objective to achieve, occupancy is maximal at "
+ << MFI.getMaxWavesPerEU();
+ } else if (!TargetOcc) {
+ dbgs() << "reduce spilling (minimum target occupancy is "
+ << MFI.getMinWavesPerEU() << ')';
+ } else {
+ dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
+ << TargetOcc;
+ }
+ dbgs() << '\n';
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
+ dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
+ << '\n';
+ }
+ }
+ });
+ if (OptRegions.empty())
+ return false;
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
@@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
bool &Progress) -> bool {
GCNRPTarget &Target = OptIt->getSecond();
- if (!Target.isSaveBeneficial(Reg, DAG.MRI))
+ if (!Target.isSaveBeneficial(Reg))
return false;
Progress = true;
Target.saveReg(Reg, Mask, DAG.MRI);
@@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
}
}
- if (IncreaseOccupancy) {
+ if (TargetOcc) {
// We were trying to increase occupancy but failed, abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
@@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() {
// All regions impacted by at least one rematerialization must be rescheduled.
// Maximum pressure must also be recomputed for all regions where it changed
// non-predictably and checked against the target occupancy.
- AchievedOcc = TargetOcc;
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ AchievedOcc = MFI.getMaxWavesPerEU();
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
RescheduleRegions[I] = !IsEmptyRegion;
@@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() {
}
}
DAG.Pressure[I] = RP;
- AchievedOcc = std::min(
- AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
- ->getDynamicVGPRBlockSize()));
+ AchievedOcc =
+ std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}
@@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
// which case we do not want to rollback either (the rescheduling was already
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
- if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
+ if (!TargetOcc || MaxOcc >= *TargetOcc)
return;
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 32139a9..790370f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -470,15 +470,12 @@ private:
/// After successful stage initialization, indicates which regions should be
/// rescheduled.
BitVector RescheduleRegions;
- /// Target occupancy the stage estimates is reachable through
- /// rematerialization. Greater than or equal to the pre-stage min occupancy.
- unsigned TargetOcc;
+ /// The target occupancy the stage is trying to achieve. Empty when the
+ /// objective is spilling reduction.
+ std::optional<unsigned> TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
/// Smaller than or equal to the target occupancy.
unsigned AchievedOcc;
- /// Whether the stage is attempting to increase occupancy in the abscence of
- /// spilling.
- bool IncreaseOccupancy;
/// Returns whether remat can reduce spilling or increase function occupancy
/// by 1 through rematerialization. If it can do one, collects instructions in
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b327fb..1b7d65a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!IsKernel) {
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+
+ // This assumes the registers are allocated by CCInfo in ascending order
+ // with no gaps.
+ Info->setNumWaveDispatchSGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+ Info->setNumWaveDispatchVGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+ } else if (Info->getNumKernargPreloadedSGPRs()) {
+ Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
}
SmallVector<SDValue, 16> Chains;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 9a1448f..49425d5 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
+ NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
+ NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
Occupancy(MFI.getOccupancy()),
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
@@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
WaveLimiter = YamlMFI.WaveLimiter;
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
+ NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
+ NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 08b0206..ca8f803 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool WaveLimiter = false;
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
+ uint16_t NumWaveDispatchSGPRs = 0;
+ uint16_t NumWaveDispatchVGPRs = 0;
uint32_t HighBitsOf32BitAddress = 0;
// TODO: 10 may be a better default since it's the maximum.
@@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false);
YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false);
+ YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false);
+ YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false);
YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
StringValue("$private_rsrc_reg"));
YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
@@ -465,6 +469,9 @@ private:
unsigned NumUserSGPRs = 0;
unsigned NumSystemSGPRs = 0;
+ unsigned NumWaveDispatchSGPRs = 0;
+ unsigned NumWaveDispatchVGPRs = 0;
+
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
bool HasNonSpillStackObjects = false;
@@ -991,6 +998,14 @@ public:
return UserSGPRInfo.getNumKernargPreloadSGPRs();
}
+ unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; }
+
+ void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; }
+
+ unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; }
+
+ void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; }
+
Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}