aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp31
1 files changed, 25 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 75ce67c..9a1448f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,6 +29,16 @@ enum { MAX_LANES = 64 };
using namespace llvm;
+// TODO -- delete this flag once we have more robust mechanisms to allocate the
+// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
+// where it is better to produce the VGPR form (e.g. if there are VGPR users
+// of the MFMA result).
+cl::opt<bool> MFMAVGPRForm(
+ "amdgpu-mfma-vgpr-form", cl::Hidden,
+ cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
+ "unspecified, default to compiler heuristics"),
+ cl::init(false));
+
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
const SITargetLowering *TLI = STI->getTargetLowering();
return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
@@ -41,7 +51,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
- GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
+ GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
+ IsWholeWaveFunction(F.getCallingConv() ==
+ CallingConv::AMDGPU_Gfx_WholeWave) {
const GCNSubtarget &ST = *STI;
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
@@ -70,10 +82,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
}
MayNeedAGPRs = ST.hasMAIInsts();
- if (ST.hasGFX90AInsts() &&
- ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
- !mayUseAGPRs(F))
- MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ if (ST.hasGFX90AInsts()) {
+ // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
+ // should be separated from availability of AGPRs
+ if (MFMAVGPRForm ||
+ (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
+ !mayUseAGPRs(F)))
+ MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ }
if (AMDGPU::isChainCC(CC)) {
// Chain functions don't receive an SP from their caller, but are free to
@@ -89,7 +105,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
ImplicitArgPtr = false;
} else if (!isEntryFunction()) {
- if (CC != CallingConv::AMDGPU_Gfx)
+ if (CC != CallingConv::AMDGPU_Gfx &&
+ CC != CallingConv::AMDGPU_Gfx_WholeWave)
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
FrameOffsetReg = AMDGPU::SGPR33;
@@ -722,6 +739,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
+ IsWholeWaveFunction(MFI.isWholeWaveFunction()),
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
@@ -768,6 +786,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
+ IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
if (YamlMFI.ScavengeFI) {
auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());