Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 104
1 file changed, 93 insertions, 11 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index cb49936..ef58004 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1211,16 +1211,81 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
   llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
 }
 
-static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
-  for (const auto &CI : IA->ParseConstraints()) {
+/// Compute the minimum number of AGPRs required to allocate the inline asm.
+static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
+                                             const CallBase &Call) {
+  unsigned ArgNo = 0;
+  unsigned ResNo = 0;
+  unsigned AGPRDefCount = 0;
+  unsigned AGPRUseCount = 0;
+  unsigned MaxPhysReg = 0;
+  const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();
+
+  // TODO: Overestimates due to not accounting for tied operands
+  for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+    Type *Ty = nullptr;
+    switch (CI.Type) {
+    case InlineAsm::isOutput: {
+      Ty = Call.getType();
+      if (auto *STy = dyn_cast<StructType>(Ty))
+        Ty = STy->getElementType(ResNo);
+      ++ResNo;
+      break;
+    }
+    case InlineAsm::isInput: {
+      Ty = Call.getArgOperand(ArgNo++)->getType();
+      break;
+    }
+    case InlineAsm::isLabel:
+      continue;
+    case InlineAsm::isClobber:
+      // Parse the physical register reference.
+      break;
+    }
+
     for (StringRef Code : CI.Codes) {
-      Code.consume_front("{");
-      if (Code.starts_with("a"))
-        return true;
+      unsigned RegCount = 0;
+      if (Code.starts_with("a")) {
+        // Virtual register, compute number of registers based on the type.
+        //
+        // We ought to be going through TargetLowering to get the number of
+        // registers, but we should avoid the dependence on CodeGen here.
+        RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
+      } else {
+        // Physical register reference
+        auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
+        if (Kind == 'a') {
+          RegCount = NumRegs;
+          MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
+        }
+
+        continue;
+      }
+
+      if (CI.Type == InlineAsm::isOutput) {
+        // Apply tuple alignment requirement
+        //
+        // TODO: This is more conservative than necessary.
+        AGPRDefCount = alignTo(AGPRDefCount, RegCount);
+
+        AGPRDefCount += RegCount;
+        if (CI.isEarlyClobber) {
+          AGPRUseCount = alignTo(AGPRUseCount, RegCount);
+          AGPRUseCount += RegCount;
+        }
+      } else {
+        AGPRUseCount = alignTo(AGPRUseCount, RegCount);
+        AGPRUseCount += RegCount;
+      }
     }
   }
 
-  return false;
+  unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
+
+  // TODO: This is overly conservative. If there are any physical registers,
+  // allocate any virtual registers after them so we don't have to solve optimal
+  // packing.
+  return std::min(MaxVirtReg + MaxPhysReg, 256u);
 }
 
 // TODO: Migrate to range merge of amdgpu-agpr-alloc.
@@ -1259,14 +1324,29 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
       const Function *Callee = dyn_cast<Function>(CalleeOp);
       if (!Callee) {
         if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
-          return !inlineAsmUsesAGPRs(IA);
+          return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0;
         return false;
       }
 
-      // Some intrinsics may use AGPRs, but if we have a choice, we are not
-      // required to use AGPRs.
-      if (Callee->isIntrinsic())
+      switch (Callee->getIntrinsicID()) {
+      case Intrinsic::not_intrinsic:
+        break;
+      case Intrinsic::write_register:
+      case Intrinsic::read_register:
+      case Intrinsic::read_volatile_register: {
+        const MDString *RegName = cast<MDString>(
+            cast<MDNode>(
+                cast<MetadataAsValue>(CB.getArgOperand(0))->getMetadata())
+                ->getOperand(0));
+        auto [Kind, RegIdx, NumRegs] =
+            AMDGPU::parseAsmPhysRegName(RegName->getString());
+        return Kind != 'a';
+      }
+      default:
+        // Some intrinsics may use AGPRs, but if we have a choice, we are not
+        // required to use AGPRs.
         return true;
+      }
 
       // TODO: Handle callsite attributes
       const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
@@ -1504,7 +1584,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
-    A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
     CallingConv::ID CC = F->getCallingConv();
     if (!AMDGPU::isEntryFunctionCC(CC)) {
       A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
@@ -1515,6 +1594,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     if (!F->isDeclaration() && ST.hasClusters())
       A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));
 
+    if (ST.hasGFX90AInsts())
+      A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
+
     for (auto &I : instructions(F)) {
      Value *Ptr = nullptr;
      if (auto *LI = dyn_cast<LoadInst>(&I))
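For reference, below is a minimal standalone sketch (not part of the patch) of the counting arithmetic that the new inlineAsmGetNumRequiredAGPRs applies to virtual "a" constraints: each operand needs ceil(bit-width / 32) AGPRs, the running def and use counts are aligned to that tuple size before being bumped, and the larger of the two totals is the requirement. The AsmOperand struct, the countVirtualAGPRs helper, and the operand widths are invented for illustration; divideCeil and alignTo are local stand-ins for the LLVM helpers of the same name, and physical-register references, clobbers, labels, and early-clobber outputs are ignored here.

// Simplified model of the virtual-AGPR counting in the patch above. Each
// "a"-constrained operand consumes ceil(BitWidth / 32) registers; the running
// def/use counts are aligned to the tuple size first (mirroring the alignTo
// calls in inlineAsmGetNumRequiredAGPRs), and defs and uses may overlap, so
// the result is the larger of the two totals.
#include <algorithm>
#include <cstdio>
#include <vector>

static unsigned divideCeil(unsigned N, unsigned D) { return (N + D - 1) / D; }
static unsigned alignTo(unsigned V, unsigned A) { return divideCeil(V, A) * A; }

struct AsmOperand {
  bool IsOutput;     // "=a" output constraint vs. "a" input constraint
  unsigned BitWidth; // size in bits of the IR value bound to the constraint
};

static unsigned countVirtualAGPRs(const std::vector<AsmOperand> &Ops) {
  unsigned DefCount = 0, UseCount = 0;
  for (const AsmOperand &Op : Ops) {
    unsigned RegCount = divideCeil(Op.BitWidth, 32); // 32-bit AGPR lanes
    unsigned &Count = Op.IsOutput ? DefCount : UseCount;
    Count = alignTo(Count, RegCount) + RegCount; // align to tuple, then bump
  }
  return std::max(DefCount, UseCount);
}

int main() {
  // "=a"(<4 x i32>), "a"(i32), "a"(<2 x i32>): defs = 4; uses = 1, then
  // aligned to 2 and bumped to 4; max(4, 4) = 4 AGPRs.
  std::vector<AsmOperand> Ops = {{true, 128}, {false, 32}, {false, 64}};
  std::printf("min AGPRs required: %u\n", countVirtualAGPRs(Ops));
  return 0;
}

The real implementation additionally tracks a MaxPhysReg high-water mark for explicit physical AGPR references and clamps the final total to 256.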