diff options
Diffstat (limited to 'llvm/lib')
53 files changed, 730 insertions, 1038 deletions
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index d9d41f1..dc81843 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -445,7 +445,6 @@ private: bool optimizeSwitchInst(SwitchInst *SI); bool optimizeExtractElementInst(Instruction *Inst); bool dupRetToEnableTailCallOpts(BasicBlock *BB, ModifyDT &ModifiedDT); - bool fixupDbgValue(Instruction *I); bool fixupDbgVariableRecord(DbgVariableRecord &I); bool fixupDbgVariableRecordsOnInst(Instruction &I); bool placeDbgValues(Function &F); @@ -2762,9 +2761,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { case Intrinsic::fshl: case Intrinsic::fshr: return optimizeFunnelShift(II); - case Intrinsic::dbg_assign: - case Intrinsic::dbg_value: - return fixupDbgValue(II); case Intrinsic::masked_gather: return optimizeGatherScatterInst(II, II->getArgOperand(0)); case Intrinsic::masked_scatter: @@ -3554,8 +3550,6 @@ class TypePromotionTransaction { /// Keep track of the original uses (pair Instruction, Index). SmallVector<InstructionAndIdx, 4> OriginalUses; /// Keep track of the debug users. - SmallVector<DbgValueInst *, 1> DbgValues; - /// And non-instruction debug-users too. SmallVector<DbgVariableRecord *, 1> DbgVariableRecords; /// Keep track of the new value so that we can undo it by replacing @@ -3577,7 +3571,9 @@ class TypePromotionTransaction { } // Record the debug uses separately. They are not in the instruction's // use list, but they are replaced by RAUW. + SmallVector<DbgValueInst *> DbgValues; findDbgValues(DbgValues, Inst, &DbgVariableRecords); + assert(DbgValues.empty()); // Now, we can replace the uses. Inst->replaceAllUsesWith(New); @@ -3591,11 +3587,7 @@ class TypePromotionTransaction { // RAUW has replaced all original uses with references to the new value, // including the debug uses. Since we are undoing the replacements, // the original debug uses must also be reinstated to maintain the - // correctness and utility of debug value instructions. - for (auto *DVI : DbgValues) - DVI->replaceVariableLocationOp(New, Inst); - // Similar story with DbgVariableRecords, the non-instruction - // representation of dbg.values. + // correctness and utility of debug value records. for (DbgVariableRecord *DVR : DbgVariableRecords) DVR->replaceVariableLocationOp(New, Inst); } @@ -8933,32 +8925,6 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT) { return MadeChange; } -// Some CGP optimizations may move or alter what's computed in a block. Check -// whether a dbg.value intrinsic could be pointed at a more appropriate operand. -bool CodeGenPrepare::fixupDbgValue(Instruction *I) { - assert(isa<DbgValueInst>(I)); - DbgValueInst &DVI = *cast<DbgValueInst>(I); - - // Does this dbg.value refer to a sunk address calculation? - bool AnyChange = false; - SmallDenseSet<Value *> LocationOps(DVI.location_ops().begin(), - DVI.location_ops().end()); - for (Value *Location : LocationOps) { - WeakTrackingVH SunkAddrVH = SunkAddrs[Location]; - Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr; - if (SunkAddr) { - // Point dbg.value at locally computed address, which should give the best - // opportunity to be accurately lowered. This update may change the type - // of pointer being referred to; however this makes no difference to - // debugging information, and we can't generate bitcasts that may affect - // codegen. - DVI.replaceVariableLocationOp(Location, SunkAddr); - AnyChange = true; - } - } - return AnyChange; -} - bool CodeGenPrepare::fixupDbgVariableRecordsOnInst(Instruction &I) { bool AnyChange = false; for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) @@ -8993,14 +8959,6 @@ bool CodeGenPrepare::fixupDbgVariableRecord(DbgVariableRecord &DVR) { return AnyChange; } -static void DbgInserterHelper(DbgValueInst *DVI, BasicBlock::iterator VI) { - DVI->removeFromParent(); - if (isa<PHINode>(VI)) - DVI->insertBefore(VI->getParent()->getFirstInsertionPt()); - else - DVI->insertAfter(VI); -} - static void DbgInserterHelper(DbgVariableRecord *DVR, BasicBlock::iterator VI) { DVR->removeFromParent(); BasicBlock *VIBB = VI->getParent(); @@ -9065,15 +9023,8 @@ bool CodeGenPrepare::placeDbgValues(Function &F) { for (BasicBlock &BB : F) { for (Instruction &Insn : llvm::make_early_inc_range(BB)) { - // Process dbg.value intrinsics. - DbgValueInst *DVI = dyn_cast<DbgValueInst>(&Insn); - if (DVI) { - DbgProcessor(DVI, DVI); - continue; - } - - // If this isn't a dbg.value, process any attached DbgVariableRecord - // records attached to this instruction. + // Process any DbgVariableRecord records attached to this + // instruction. for (DbgVariableRecord &DVR : llvm::make_early_inc_range( filterDbgVars(Insn.getDbgRecordRange()))) { if (DVR.Type != DbgVariableRecord::LocationType::Value) diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 9daacfd..e7fa082 100644 --- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -202,8 +202,8 @@ MachineBlockFrequencyInfo::MachineBlockFrequencyInfo( MachineBlockFrequencyInfo &&) = default; MachineBlockFrequencyInfo::MachineBlockFrequencyInfo( - MachineFunction &F, MachineBranchProbabilityInfo &MBPI, - MachineLoopInfo &MLI) { + const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI, + const MachineLoopInfo &MLI) { calculate(F, MBPI, MLI); } diff --git a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp index 9b9cebc..1a20fe5 100644 --- a/llvm/lib/CodeGen/MachineDebugify.cpp +++ b/llvm/lib/CodeGen/MachineDebugify.cpp @@ -63,24 +63,9 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, // which cover a wide range of lines can help stress the debug info passes: // if we can't do that, fall back to using the local variable which precedes // all the others. - Function *DbgValF = M.getFunction("llvm.dbg.value"); - DbgValueInst *EarliestDVI = nullptr; DbgVariableRecord *EarliestDVR = nullptr; DenseMap<unsigned, DILocalVariable *> Line2Var; DIExpression *Expr = nullptr; - if (DbgValF) { - for (const Use &U : DbgValF->uses()) { - auto *DVI = dyn_cast<DbgValueInst>(U.getUser()); - if (!DVI || DVI->getFunction() != &F) - continue; - unsigned Line = DVI->getDebugLoc().getLine(); - assert(Line != 0 && "debugify should not insert line 0 locations"); - Line2Var[Line] = DVI->getVariable(); - if (!EarliestDVI || Line < EarliestDVI->getDebugLoc().getLine()) - EarliestDVI = DVI; - Expr = DVI->getExpression(); - } - } for (BasicBlock &BB : F) { for (Instruction &I : BB) { for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { @@ -125,8 +110,7 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, unsigned Line = MI.getDebugLoc().getLine(); auto It = Line2Var.find(Line); if (It == Line2Var.end()) { - Line = EarliestDVI ? EarliestDVI->getDebugLoc().getLine() - : EarliestDVR->getDebugLoc().getLine(); + Line = EarliestDVR->getDebugLoc().getLine(); It = Line2Var.find(Line); assert(It != Line2Var.end()); } diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp index e7a4d6d6..116a919 100644 --- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp @@ -45,3 +45,9 @@ MachineFunctionAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { return Result(std::move(MF)); } + +PreservedAnalyses FreeMachineFunctionPass::run(Function &F, + FunctionAnalysisManager &FAM) { + FAM.clearAnalysis<MachineFunctionAnalysis>(F); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 682d93d..5453828 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5569,6 +5569,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::BUILD_VECTOR: case ISD::BUILD_PAIR: case ISD::SPLAT_VECTOR: + case ISD::FABS: return false; case ISD::ABS: @@ -6750,7 +6751,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, return SDValue(); int64_t Offset = C2->getSExtValue(); switch (Opcode) { - case ISD::ADD: break; + case ISD::ADD: + case ISD::PTRADD: + break; case ISD::SUB: Offset = -uint64_t(Offset); break; default: return SDValue(); } diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 84a56058..8fb33c3 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2288,39 +2288,36 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { // Collect a map of {backing storage : dbg.declares} (currently "backing // storage" is limited to Allocas). We'll use this to find dbg.declares to // delete after running `trackAssignments`. - DenseMap<const AllocaInst *, SmallPtrSet<DbgDeclareInst *, 2>> DbgDeclares; DenseMap<const AllocaInst *, SmallPtrSet<DbgVariableRecord *, 2>> DVRDeclares; // Create another similar map of {storage : variables} that we'll pass to // trackAssignments. StorageToVarsMap Vars; - auto ProcessDeclare = [&](auto *Declare, auto &DeclareList) { + auto ProcessDeclare = [&](DbgVariableRecord &Declare) { // FIXME: trackAssignments doesn't let you specify any modifiers to the // variable (e.g. fragment) or location (e.g. offset), so we have to // leave dbg.declares with non-empty expressions in place. - if (Declare->getExpression()->getNumElements() != 0) + if (Declare.getExpression()->getNumElements() != 0) return; - if (!Declare->getAddress()) + if (!Declare.getAddress()) return; if (AllocaInst *Alloca = - dyn_cast<AllocaInst>(Declare->getAddress()->stripPointerCasts())) { + dyn_cast<AllocaInst>(Declare.getAddress()->stripPointerCasts())) { // FIXME: Skip VLAs for now (let these variables use dbg.declares). if (!Alloca->isStaticAlloca()) return; // Similarly, skip scalable vectors (use dbg.declares instead). if (auto Sz = Alloca->getAllocationSize(*DL); Sz && Sz->isScalable()) return; - DeclareList[Alloca].insert(Declare); - Vars[Alloca].insert(VarRecord(Declare)); + DVRDeclares[Alloca].insert(&Declare); + Vars[Alloca].insert(VarRecord(&Declare)); } }; for (auto &BB : F) { for (auto &I : BB) { for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { if (DVR.isDbgDeclare()) - ProcessDeclare(&DVR, DVRDeclares); + ProcessDeclare(DVR); } - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(&I)) - ProcessDeclare(DDI, DbgDeclares); } } @@ -2336,8 +2333,8 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { trackAssignments(F.begin(), F.end(), Vars, *DL); // Delete dbg.declares for variables now tracked with assignment tracking. - auto DeleteSubsumedDeclare = [&](const auto &Markers, auto &Declares) { - (void)Markers; + for (auto &[Insts, Declares] : DVRDeclares) { + auto Markers = at::getDVRAssignmentMarkers(Insts); for (auto *Declare : Declares) { // Assert that the alloca that Declare uses is now linked to a dbg.assign // describing the same variable (i.e. check that this dbg.declare has @@ -2356,10 +2353,6 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { Changed = true; } }; - for (auto &P : DbgDeclares) - DeleteSubsumedDeclare(at::getAssignmentMarkers(P.first), P.second); - for (auto &P : DVRDeclares) - DeleteSubsumedDeclare(at::getDVRAssignmentMarkers(P.first), P.second); return Changed; } diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index 2270923..f16963d 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -49,20 +49,11 @@ uint32_t DIType::getAlignInBits() const { const DIExpression::FragmentInfo DebugVariable::DefaultFragment = { std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::min()}; -DebugVariable::DebugVariable(const DbgVariableIntrinsic *DII) - : Variable(DII->getVariable()), - Fragment(DII->getExpression()->getFragmentInfo()), - InlinedAt(DII->getDebugLoc().getInlinedAt()) {} - DebugVariable::DebugVariable(const DbgVariableRecord *DVR) : Variable(DVR->getVariable()), Fragment(DVR->getExpression()->getFragmentInfo()), InlinedAt(DVR->getDebugLoc().getInlinedAt()) {} -DebugVariableAggregate::DebugVariableAggregate(const DbgVariableIntrinsic *DVI) - : DebugVariable(DVI->getVariable(), std::nullopt, - DVI->getDebugLoc()->getInlinedAt()) {} - DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line, unsigned Column, uint64_t AtomGroup, uint8_t AtomRank, ArrayRef<Metadata *> MDs, bool ImplicitCode) diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp index 79eeb08..2579fa3 100644 --- a/llvm/lib/Object/IRSymtab.cpp +++ b/llvm/lib/Object/IRSymtab.cpp @@ -54,11 +54,6 @@ static const char *PreservedSymbols[] = { "__stack_chk_guard", }; -static bool isPreservedGlobalVarName(StringRef Name) { - return StringRef(PreservedSymbols[0]) == Name || - StringRef(PreservedSymbols[1]) == Name; -} - namespace { const char *getExpectedProducerName() { @@ -86,16 +81,12 @@ struct Builder { // The StringTableBuilder does not create a copy of any strings added to it, // so this provides somewhere to store any strings that we create. Builder(SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder, - BumpPtrAllocator &Alloc, const Triple &TT) - : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc), TT(TT), - Libcalls(TT) {} + BumpPtrAllocator &Alloc) + : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {} DenseMap<const Comdat *, int> ComdatMap; Mangler Mang; - const Triple &TT; - - // FIXME: This shouldn't be here. - RTLIB::RuntimeLibcallsInfo Libcalls; + Triple TT; std::vector<storage::Comdat> Comdats; std::vector<storage::Module> Mods; @@ -107,10 +98,6 @@ struct Builder { std::vector<storage::Str> DependentLibraries; - bool isPreservedLibFuncName(StringRef Name) { - return Libcalls.getSupportedLibcallImpl(Name) != RTLIB::Unsupported; - } - void setStr(storage::Str &S, StringRef Value) { S.Offset = StrtabBuilder.add(Value); S.Size = Value.size(); @@ -226,6 +213,18 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) { return P.first->second; } +static DenseSet<StringRef> buildPreservedSymbolsSet(const Triple &TT) { + DenseSet<StringRef> PreservedSymbolSet(std::begin(PreservedSymbols), + std::end(PreservedSymbols)); + // FIXME: Do we need to pass in ABI fields from TargetOptions? + RTLIB::RuntimeLibcallsInfo Libcalls(TT); + for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) { + if (Impl != RTLIB::Unsupported) + PreservedSymbolSet.insert(Libcalls.getLibcallImplName(Impl)); + } + return PreservedSymbolSet; +} + Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, const SmallPtrSet<GlobalValue *, 4> &Used, ModuleSymbolTable::Symbol Msym) { @@ -279,11 +278,13 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, return Error::success(); } - StringRef GVName = GV->getName(); - setStr(Sym.IRName, GVName); + setStr(Sym.IRName, GV->getName()); + + static const DenseSet<StringRef> PreservedSymbolsSet = + buildPreservedSymbolsSet(GV->getParent()->getTargetTriple()); + bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName()); - if (Used.count(GV) || isPreservedLibFuncName(GVName) || - isPreservedGlobalVarName(GVName)) + if (Used.count(GV) || IsPreservedSymbol) Sym.Flags |= 1 << storage::Symbol::FB_used; if (GV->isThreadLocal()) Sym.Flags |= 1 << storage::Symbol::FB_tls; @@ -350,6 +351,7 @@ Error Builder::build(ArrayRef<Module *> IRMods) { setStr(Hdr.Producer, kExpectedProducerName); setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple().str()); setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName()); + TT = IRMods[0]->getTargetTriple(); for (auto *M : IRMods) if (Error Err = addModule(M)) @@ -375,8 +377,7 @@ Error Builder::build(ArrayRef<Module *> IRMods) { Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder, BumpPtrAllocator &Alloc) { - const Triple &TT = Mods[0]->getTargetTriple(); - return Builder(Symtab, StrtabBuilder, Alloc, TT).build(Mods); + return Builder(Symtab, StrtabBuilder, Alloc).build(Mods); } // Upgrade a vector of bitcode modules created by an old version of LLVM by diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 9a94315..caa78b6 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -434,6 +434,7 @@ FUNCTION_PASS("extra-vector-passes", FUNCTION_PASS("fix-irreducible", FixIrreduciblePass()) FUNCTION_PASS("flatten-cfg", FlattenCFGPass()) FUNCTION_PASS("float2int", Float2IntPass()) +FUNCTION_PASS("free-machine-function", FreeMachineFunctionPass()) FUNCTION_PASS("gc-lowering", GCLoweringPass()) FUNCTION_PASS("guard-widening", GuardWideningPass()) FUNCTION_PASS("gvn-hoist", GVNHoistPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 23f106a..007b481 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -153,6 +153,9 @@ private: const TargetMachine &TM; }; +void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &); +extern char &AMDGPUPrepareAGPRAllocLegacyID; + void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &); extern char &AMDGPUReserveWWMRegsLegacyID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 250547a..b6c6d92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -114,6 +114,7 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass()) +MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass()) MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass()) MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp new file mode 100644 index 0000000..3b06e9b --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -0,0 +1,108 @@ +//===-- AMDGPUPrepareAGPRAlloc.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Make simple transformations to relax register constraints for cases which can +// allocate to AGPRs or VGPRs. Replace materialize of inline immediates into +// AGPR or VGPR with a pseudo with an AV_* class register constraint. This +// allows later passes to inflate the register class if necessary. The register +// allocator does not know to replace instructions to relax constraints. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUPrepareAGPRAlloc.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-prepare-agpr-alloc" + +namespace { + +class AMDGPUPrepareAGPRAllocImpl { +private: + const SIInstrInfo &TII; + MachineRegisterInfo &MRI; + +public: + AMDGPUPrepareAGPRAllocImpl(const GCNSubtarget &ST, MachineRegisterInfo &MRI) + : TII(*ST.getInstrInfo()), MRI(MRI) {} + bool run(MachineFunction &MF); +}; + +class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) { + initializeAMDGPUPrepareAGPRAllocLegacyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "AMDGPU Prepare AGPR Alloc"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE, + "AMDGPU Prepare AGPR Alloc", false, false) +INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE, + "AMDGPU Prepare AGPR Alloc", false, false) + +char AMDGPUPrepareAGPRAllocLegacy::ID = 0; + +char &llvm::AMDGPUPrepareAGPRAllocLegacyID = AMDGPUPrepareAGPRAllocLegacy::ID; + +bool AMDGPUPrepareAGPRAllocLegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + return AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF); +} + +PreservedAnalyses +AMDGPUPrepareAGPRAllocPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF); + return PreservedAnalyses::all(); +} + +bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { + if (MRI.isReserved(AMDGPU::AGPR0)) + return false; + + const MCInstrDesc &AVImmPseudo = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if ((MI.getOpcode() == AMDGPU::V_MOV_B32_e32 && + TII.isInlineConstant(MI, 1)) || + (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOperand(1).isImm())) { + MI.setDesc(AVImmPseudo); + Changed = true; + } + } + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h new file mode 100644 index 0000000..dc598c9 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h @@ -0,0 +1,23 @@ +//===- AMDGPUPrepareAGPRAlloc.h ---------------------------------*- C++- *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class AMDGPUPrepareAGPRAllocPass + : public PassInfoMixin<AMDGPUPrepareAGPRAllocPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 46027b8..8101c68 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -167,77 +167,39 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI); + Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass, + /*IncludeCalls=*/false); + if (ST.hasMAIInsts()) + Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass, + /*IncludeCalls=*/false); // If there are no calls, MachineRegisterInfo can tell us the used register // count easily. // A tail call isn't considered a call for MachineFrameInfo's purposes. if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { - Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass); - Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass); - if (ST.hasMAIInsts()) - Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass); + Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass, + /*IncludeCalls=*/false); return Info; } int32_t MaxVGPR = -1; - int32_t MaxAGPR = -1; - int32_t MaxSGPR = -1; Info.CalleeSegmentSize = 0; for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { - // TODO: Check regmasks? Do they occur anywhere except calls? - for (const MachineOperand &MO : MI.operands()) { - unsigned Width = 0; - bool IsSGPR = false; - bool IsAGPR = false; + for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + const MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; Register Reg = MO.getReg(); switch (Reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - case AMDGPU::M0_LO16: - case AMDGPU::M0_HI16: - case AMDGPU::SRC_SHARED_BASE_LO: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT_LO: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE_LO: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT_LO: - case AMDGPU::SRC_PRIVATE_LIMIT: - case AMDGPU::SRC_POPS_EXITING_WAVE_ID: - case AMDGPU::SGPR_NULL: - case AMDGPU::SGPR_NULL64: - case AMDGPU::MODE: - continue; - case AMDGPU::NoRegister: assert(MI.isDebugInstr() && "Instruction uses invalid noreg register"); continue; - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - case AMDGPU::VCC_LO_LO16: - case AMDGPU::VCC_LO_HI16: - case AMDGPU::VCC_HI_LO16: - case AMDGPU::VCC_HI_HI16: - Info.UsesVCC = true; - continue; - - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - continue; - case AMDGPU::XNACK_MASK: case AMDGPU::XNACK_MASK_LO: case AMDGPU::XNACK_MASK_HI: @@ -267,170 +229,22 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( break; } - if (AMDGPU::SGPR_32RegClass.contains(Reg) || - AMDGPU::SGPR_LO16RegClass.contains(Reg) || - AMDGPU::SGPR_HI16RegClass.contains(Reg)) { - IsSGPR = true; - Width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || - AMDGPU::VGPR_16RegClass.contains(Reg)) { - IsSGPR = false; - Width = 1; - } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || - AMDGPU::AGPR_LO16RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 1; - } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) { - IsSGPR = true; - Width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { - IsSGPR = false; - Width = 2; - } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { - IsSGPR = false; - Width = 3; - } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { - IsSGPR = true; - Width = 3; - } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 3; - } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) { - IsSGPR = true; - Width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { - IsSGPR = false; - Width = 4; - } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 4; - } else if (AMDGPU::VReg_160RegClass.contains(Reg)) { - IsSGPR = false; - Width = 5; - } else if (AMDGPU::SReg_160RegClass.contains(Reg)) { - IsSGPR = true; - Width = 5; - } else if (AMDGPU::AReg_160RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 5; - } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { - IsSGPR = false; - Width = 6; - } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { - IsSGPR = true; - Width = 6; - } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 6; - } else if (AMDGPU::VReg_224RegClass.contains(Reg)) { - IsSGPR = false; - Width = 7; - } else if (AMDGPU::SReg_224RegClass.contains(Reg)) { - IsSGPR = true; - Width = 7; - } else if (AMDGPU::AReg_224RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 7; - } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { - IsSGPR = true; - Width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { - IsSGPR = false; - Width = 8; - } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 8; - } else if (AMDGPU::VReg_288RegClass.contains(Reg)) { - IsSGPR = false; - Width = 9; - } else if (AMDGPU::SReg_288RegClass.contains(Reg)) { - IsSGPR = true; - Width = 9; - } else if (AMDGPU::AReg_288RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 9; - } else if (AMDGPU::VReg_320RegClass.contains(Reg)) { - IsSGPR = false; - Width = 10; - } else if (AMDGPU::SReg_320RegClass.contains(Reg)) { - IsSGPR = true; - Width = 10; - } else if (AMDGPU::AReg_320RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 10; - } else if (AMDGPU::VReg_352RegClass.contains(Reg)) { - IsSGPR = false; - Width = 11; - } else if (AMDGPU::SReg_352RegClass.contains(Reg)) { - IsSGPR = true; - Width = 11; - } else if (AMDGPU::AReg_352RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 11; - } else if (AMDGPU::VReg_384RegClass.contains(Reg)) { - IsSGPR = false; - Width = 12; - } else if (AMDGPU::SReg_384RegClass.contains(Reg)) { - IsSGPR = true; - Width = 12; - } else if (AMDGPU::AReg_384RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 12; - } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { - IsSGPR = true; - Width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { - IsSGPR = false; - Width = 16; - } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 16; - } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { - IsSGPR = true; - Width = 32; - } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - Width = 32; - } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 32; - } else { - // We only expect TTMP registers or registers that do not belong to - // any RC. - assert((AMDGPU::TTMP_32RegClass.contains(Reg) || - AMDGPU::TTMP_64RegClass.contains(Reg) || - AMDGPU::TTMP_128RegClass.contains(Reg) || - AMDGPU::TTMP_256RegClass.contains(Reg) || - AMDGPU::TTMP_512RegClass.contains(Reg) || - !TRI.getPhysRegBaseClass(Reg)) && - "Unknown register class"); - } + const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg); + assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) || + TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) || + AMDGPU::TTMP_64RegClass.contains(Reg) || + AMDGPU::TTMP_128RegClass.contains(Reg) || + AMDGPU::TTMP_256RegClass.contains(Reg) || + AMDGPU::TTMP_512RegClass.contains(Reg)) && + "Unknown register class"); + + if (!RC || !TRI.isVGPRClass(RC)) + continue; + + unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32); unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; - if (IsSGPR) { - MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; - } else if (IsAGPR) { - MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; - } else { - MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR; - } + MaxVGPR = std::max(MaxUsed, MaxVGPR); } if (MI.isCall()) { @@ -492,9 +306,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( } } - Info.NumExplicitSGPR = MaxSGPR + 1; Info.NumVGPR = MaxVGPR + 1; - Info.NumAGPR = MaxAGPR + 1; return Info; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 31a80e0..c865082 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -25,6 +25,7 @@ #include "AMDGPUMacroFusion.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUPreloadKernArgProlog.h" +#include "AMDGPUPrepareAGPRAlloc.h" #include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUReserveWWMRegs.h" #include "AMDGPUResourceUsageAnalysis.h" @@ -499,6 +500,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGlobalISel(*PR); initializeAMDGPUAsmPrinterPass(*PR); initializeAMDGPUDAGToDAGISelLegacyPass(*PR); + initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR); initializeGCNDPPCombineLegacyPass(*PR); initializeSILowerI1CopiesLegacyPass(*PR); initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR); @@ -1196,6 +1198,7 @@ public: bool addRegBankSelect() override; void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; + void addPreRegAlloc() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; @@ -1539,6 +1542,11 @@ void GCNPassConfig::addFastRegAlloc() { TargetPassConfig::addFastRegAlloc(); } +void GCNPassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOptLevel::None) + addPass(&AMDGPUPrepareAGPRAllocLegacyID); +} + void GCNPassConfig::addOptimizedRegAlloc() { if (EnableDCEInRA) insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); @@ -2235,6 +2243,11 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( Base::addOptimizedRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const { + if (getOptLevel() != CodeGenOptLevel::None) + addPass(AMDGPUPrepareAGPRAllocPass()); +} + Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized( AddMachinePass &addPass) const { // TODO: Check --regalloc-npm option diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 3b2f39c..e0f1296 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -181,7 +181,9 @@ public: void addMachineSSAOptimization(AddMachinePass &) const; void addPostRegAlloc(AddMachinePass &) const; void addPreEmitPass(AddMachinePass &) const; + void addPreEmitRegAlloc(AddMachinePass &) const; Error addRegAssignmentOptimized(AddMachinePass &) const; + void addPreRegAlloc(AddMachinePass &) const; void addOptimizedRegAlloc(AddMachinePass &) const; void addPreSched2(AddMachinePass &) const; diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index e3519f1..42edec0 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -74,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp + AMDGPUPrepareAGPRAlloc.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 44d9ef5..f018f77 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -947,13 +947,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { // Copies and REG_SEQUENCE do not contribute to the final assembly // So, skip them but take care of the SGPR to VGPR copies bookkeeping. - if (Inst->isCopy() || Inst->isRegSequence()) { - if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { - if (!Inst->isCopy() || - !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { - Info.NumSVCopies++; - continue; - } + if (Inst->isRegSequence() && + TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { + Info.NumSVCopies++; + continue; + } + if (Inst->isCopy()) { + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI); + if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) && + !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { + Info.NumSVCopies++; + continue; } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 27212fda..0c76ff2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include <optional> using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -11131,7 +11133,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { assert(VT.getSizeInBits() == 64); SDLoc DL(Op); - SDValue Cond = Op.getOperand(0); + SDValue Cond = DAG.getFreeze(Op.getOperand(0)); SDValue Zero = DAG.getConstant(0, DL, MVT::i32); SDValue One = DAG.getConstant(1, DL, MVT::i32); @@ -14561,7 +14563,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14594,7 +14596,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14706,8 +14708,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = - DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -15181,42 +15186,123 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, - // y is not, and (add y, z) is used only once. - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, - // z is not, and (add y, z) is used only once. - // The goal is to move constant offsets to the outermost ptradd, to create - // more opportunities to fold offsets into memory instructions. - // Together with the generic combines in DAGCombiner.cpp, this also - // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). - // - // This transform is here instead of in the general DAGCombiner as it can - // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for - // AArch64's CPA. - SDValue X = N0; - SDValue Y = N1.getOperand(0); - SDValue Z = N1.getOperand(1); - if (N1.hasOneUse()) { - bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); - bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); - if (ZIsConstant != YIsConstant) { - // If both additions in the original were NUW, the new ones are as well. - SDNodeFlags Flags = - (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; - if (YIsConstant) - std::swap(Y, Z); + // The following folds transform PTRADDs into regular arithmetic in cases + // where the PTRADD wouldn't be folded as an immediate offset into memory + // instructions anyway. They are target-specific in that other targets might + // prefer to not lose information about the pointer arithmetic. + + // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)). + // Adapted from DAGCombiner::visitADDLikeCommutative. + SDValue V, K; + if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) { + SDNodeFlags ShlFlags = N1->getFlags(); + // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0, + // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be + // preserved. + SDNodeFlags NewShlFlags = + ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap() + ? SDNodeFlags::NoSignedWrap + : SDNodeFlags(); + SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, N0, Inner); + } + + // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in + // performAddCombine. + if (N1.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; + } + } - SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags); + // If the 32 low bits of the constant are all zero, there is nothing to fold + // into an immediate offset, so it's better to eliminate the unnecessary + // addition for the lower 32 bits than to preserve the PTRADD. + // Analogous to a fold in performAddCombine. + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + + if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) { + // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with + // global address GA and constant c, such that c can be folded into GA. + SDValue GAValue = N0.getOperand(0); + if (const GlobalAddressSDNode *GA = + dyn_cast<GlobalAddressSDNode>(GAValue)) { + if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) { + // If both additions in the original were NUW, reassociation preserves + // that. + SDNodeFlags Flags = + (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; + SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); DCI.AddToWorklist(Inner.getNode()); - return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags); + return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); } } } + if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse()) + return SDValue(); + + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, + // y is not, and (add y, z) is used only once. + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, + // z is not, and (add y, z) is used only once. + // The goal is to move constant offsets to the outermost ptradd, to create + // more opportunities to fold offsets into memory instructions. + // Together with the generic combines in DAGCombiner.cpp, this also + // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). + // + // This transform is here instead of in the general DAGCombiner as it can + // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for + // AArch64's CPA. + SDValue X = N0; + SDValue Y = N1.getOperand(0); + SDValue Z = N1.getOperand(1); + bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); + bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + + // If both additions in the original were NUW, reassociation preserves that. + SDNodeFlags ReassocFlags = + (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; + + if (ZIsConstant != YIsConstant) { + if (YIsConstant) + std::swap(Y, Z); + SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags); + } + + // If one of Y and Z is constant, they have been handled above. If both were + // constant, the addition would have been folded in SelectionDAG::getNode + // already. This ensures that the generic DAG combines won't undo the + // following reassociation. + assert(!YIsConstant && !ZIsConstant); + + if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) { + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and + // y are uniform and z isn't. + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and + // z are uniform and y isn't. + // The goal is to push uniform operands up in the computation, so that they + // can be handled with scalar operations. We can't use reassociateScalarOps + // for this since it requires two identical commutative operations to + // reassociate. + if (Y->isDivergent()) + std::swap(Y, Z); + SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(UniformInner.getNode()); + return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags); + } + return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9da8a1c..c8935f0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -687,7 +687,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, if (!SafeToPropagate) break; - DefOp.setIsKill(false); + for (auto I = Def; I != MI; ++I) + I->clearRegisterKills(DefOp.getReg(), &RI); } MachineInstrBuilder Builder = @@ -1625,41 +1626,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { } } -static unsigned getAGPRSpillSaveOpcode(unsigned Size) { - switch (Size) { - case 4: - return AMDGPU::SI_SPILL_A32_SAVE; - case 8: - return AMDGPU::SI_SPILL_A64_SAVE; - case 12: - return AMDGPU::SI_SPILL_A96_SAVE; - case 16: - return AMDGPU::SI_SPILL_A128_SAVE; - case 20: - return AMDGPU::SI_SPILL_A160_SAVE; - case 24: - return AMDGPU::SI_SPILL_A192_SAVE; - case 28: - return AMDGPU::SI_SPILL_A224_SAVE; - case 32: - return AMDGPU::SI_SPILL_A256_SAVE; - case 36: - return AMDGPU::SI_SPILL_A288_SAVE; - case 40: - return AMDGPU::SI_SPILL_A320_SAVE; - case 44: - return AMDGPU::SI_SPILL_A352_SAVE; - case 48: - return AMDGPU::SI_SPILL_A384_SAVE; - case 64: - return AMDGPU::SI_SPILL_A512_SAVE; - case 128: - return AMDGPU::SI_SPILL_A1024_SAVE; - default: - llvm_unreachable("unknown register size"); - } -} - static unsigned getAVSpillSaveOpcode(unsigned Size) { switch (Size) { case 4: @@ -1707,22 +1673,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size, return AMDGPU::SI_SPILL_WWM_V32_SAVE; } -static unsigned getVectorRegSpillSaveOpcode(Register Reg, - const TargetRegisterClass *RC, - unsigned Size, - const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); +unsigned SIInstrInfo::getVectorRegSpillSaveOpcode( + Register Reg, const TargetRegisterClass *RC, unsigned Size, + const SIMachineFunctionInfo &MFI) const { + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); // Choose the right opcode if spilling a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass); - if (IsVectorSuperClass) + // TODO: Check if AGPRs are available + if (ST.hasMAIInsts()) return getAVSpillSaveOpcode(Size); - return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size) - : getVGPRSpillSaveOpcode(Size); + return getVGPRSpillSaveOpcode(Size); } void SIInstrInfo::storeRegToStackSlot( @@ -1770,8 +1734,8 @@ void SIInstrInfo::storeRegToStackSlot( return; } - unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, - SpillSize, RI, *MFI); + unsigned Opcode = + getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) @@ -1854,41 +1818,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { } } -static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { - switch (Size) { - case 4: - return AMDGPU::SI_SPILL_A32_RESTORE; - case 8: - return AMDGPU::SI_SPILL_A64_RESTORE; - case 12: - return AMDGPU::SI_SPILL_A96_RESTORE; - case 16: - return AMDGPU::SI_SPILL_A128_RESTORE; - case 20: - return AMDGPU::SI_SPILL_A160_RESTORE; - case 24: - return AMDGPU::SI_SPILL_A192_RESTORE; - case 28: - return AMDGPU::SI_SPILL_A224_RESTORE; - case 32: - return AMDGPU::SI_SPILL_A256_RESTORE; - case 36: - return AMDGPU::SI_SPILL_A288_RESTORE; - case 40: - return AMDGPU::SI_SPILL_A320_RESTORE; - case 44: - return AMDGPU::SI_SPILL_A352_RESTORE; - case 48: - return AMDGPU::SI_SPILL_A384_RESTORE; - case 64: - return AMDGPU::SI_SPILL_A512_RESTORE; - case 128: - return AMDGPU::SI_SPILL_A1024_RESTORE; - default: - llvm_unreachable("unknown register size"); - } -} - static unsigned getAVSpillRestoreOpcode(unsigned Size) { switch (Size) { case 4: @@ -1930,27 +1859,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, if (Size != 4) llvm_unreachable("unknown wwm register spill size"); - if (IsVectorSuperClass) + if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs return AMDGPU::SI_SPILL_WWM_AV32_RESTORE; return AMDGPU::SI_SPILL_WWM_V32_RESTORE; } -static unsigned -getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, - unsigned Size, const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); +unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode( + Register Reg, const TargetRegisterClass *RC, unsigned Size, + const SIMachineFunctionInfo &MFI) const { + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); // Choose the right opcode if restoring a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass); - if (IsVectorSuperClass) + // TODO: Check if AGPRs are available + if (ST.hasMAIInsts()) return getAVSpillRestoreOpcode(Size); - return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) - : getVGPRSpillRestoreOpcode(Size); + assert(!RI.isAGPRClass(RC)); + return getVGPRSpillRestoreOpcode(Size); } void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, @@ -1998,7 +1927,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, } unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC, - SpillSize, RI, *MFI); + SpillSize, *MFI); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 3a48e65..5e92921 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -33,6 +33,7 @@ class LiveVariables; class MachineDominatorTree; class MachineRegisterInfo; class RegScavenger; +class SIMachineFunctionInfo; class TargetRegisterClass; class ScheduleHazardRecognizer; @@ -287,6 +288,15 @@ public: bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override; + unsigned getVectorRegSpillSaveOpcode(Register Reg, + const TargetRegisterClass *RC, + unsigned Size, + const SIMachineFunctionInfo &MFI) const; + unsigned + getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, + unsigned Size, + const SIMachineFunctionInfo &MFI) const; + void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, @@ -1103,7 +1113,6 @@ public: // that will not require an additional 4-bytes; this function assumes that it // will. bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { - assert(!MO.isReg() && "isInlineConstant called on register operand!"); if (!MO.isImm()) return false; return isInlineConstant(MO.getImm(), OperandType); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 9173041..fa2b8db 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -4052,11 +4052,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, return 0; } -unsigned -SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, - const TargetRegisterClass &RC) const { +unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, + const TargetRegisterClass &RC, + bool IncludeCalls) const { for (MCPhysReg Reg : reverse(RC.getRegisters())) - if (MRI.isPhysRegUsed(Reg)) + if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls)) return getHWRegIndex(Reg) + 1; return 0; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 06a7a17..0008e5f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -486,9 +486,11 @@ public: unsigned SubReg) const; // \returns a number of registers of a given \p RC used in a function. - // Does not go inside function calls. + // Does not go inside function calls. If \p IncludeCalls is true, it will + // include registers that may be clobbered by calls. unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, - const TargetRegisterClass &RC) const; + const TargetRegisterClass &RC, + bool IncludeCalls = true) const; std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override { return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 8c35fea..1bbbb61 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -530,6 +530,10 @@ defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>; defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; +let SubtargetPredicate = HasTanhInsts in { +defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; +} + let SubtargetPredicate = HasBF16TransInsts in { defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; @@ -1142,6 +1146,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>; defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>; +defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index ac5e7f3..1493bf4 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -158,7 +158,12 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized( // estimateStackSize has been observed to under-estimate the final stack // size, so give ourselves wiggle-room by checking for stack size // representable an 11-bit signed field rather than 12-bits. - if (!isInt<11>(MFI.estimateStackSize(MF))) + // For [x]vstelm.{b/h/w/d} memory instructions with 8 imm offset, 7-bit + // signed field is fine. + unsigned EstimateStackSize = MFI.estimateStackSize(MF); + if (!isInt<11>(EstimateStackSize) || + (MF.getSubtarget<LoongArchSubtarget>().hasExtLSX() && + !isInt<7>(EstimateStackSize))) ScavSlotsNum = std::max(ScavSlotsNum, 1u); // For CFR spill. diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 6c8e3da..23b4554 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -95,6 +95,11 @@ static const std::pair<MCPhysReg, int8_t> FixedCSRFIQCIInterruptMap[] = { /* -21, -22, -23, -24 are reserved */ }; +/// Returns true if DWARF CFI instructions ("frame moves") should be emitted. +static bool needsDwarfCFI(const MachineFunction &MF) { + return MF.needsFrameMoves(); +} + // For now we use x3, a.k.a gp, as pointer to shadow call stack. // User should not use x3 in their asm. static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, @@ -141,6 +146,9 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, .addImm(-SlotSize) .setMIFlag(MachineInstr::FrameSetup); + if (!needsDwarfCFI(MF)) + return; + // Emit a CFI instruction that causes SlotSize to be subtracted from the value // of the shadow stack pointer when unwinding past this frame. char DwarfSCSReg = TRI->getDwarfRegNum(SCSPReg, /*IsEH*/ true); @@ -199,8 +207,10 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB, .addReg(SCSPReg) .addImm(-SlotSize) .setMIFlag(MachineInstr::FrameDestroy); - // Restore the SCS pointer - CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg); + if (needsDwarfCFI(MF)) { + // Restore the SCS pointer + CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg); + } } // Insert instruction to swap mscratchsw with sp @@ -935,6 +945,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MBBI = std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() + getUnmanagedCSI(MF, CSI).size()); CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); + bool NeedsDwarfCFI = needsDwarfCFI(MF); // If libcalls are used to spill and restore callee-saved registers, the frame // has two sections; the opaque section managed by the libcalls, and the @@ -962,10 +973,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, alignTo((STI.getXLen() / 8) * LibCallRegs, getStackAlign()); RVFI->setLibCallStackSize(LibCallFrameSize); - CFIBuilder.buildDefCFAOffset(LibCallFrameSize); - for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) - CFIBuilder.buildOffset(CS.getReg(), - MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) { + CFIBuilder.buildDefCFAOffset(LibCallFrameSize); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); + } } // FIXME (note copied from Lanai): This appears to be overallocating. Needs @@ -996,14 +1009,17 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // could only be the next instruction. ++PossiblePush; - // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)` - // could be. The PUSH will also get its own CFI metadata for its own - // modifications, which should come after the PUSH. - CFIInstBuilder PushCFIBuilder(MBB, PossiblePush, MachineInstr::FrameSetup); - PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount); - for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI)) - PushCFIBuilder.buildOffset(CS.getReg(), - MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) { + // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)` + // could be. The PUSH will also get its own CFI metadata for its own + // modifications, which should come after the PUSH. + CFIInstBuilder PushCFIBuilder(MBB, PossiblePush, + MachineInstr::FrameSetup); + PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount); + for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI)) + PushCFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); + } } if (RVFI->isPushable(MF) && PossiblePush != MBB.end() && @@ -1017,10 +1033,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, PossiblePush->getOperand(1).setImm(StackAdj); StackSize -= StackAdj; - CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize); - for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) - CFIBuilder.buildOffset(CS.getReg(), - MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) { + CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); + } } // Allocate space on the stack if necessary. @@ -1031,7 +1049,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, bool DynAllocation = MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation(); if (StackSize != 0) - allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true, + allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, NeedsDwarfCFI, NeedProbe, ProbeSize, DynAllocation, MachineInstr::FrameSetup); @@ -1049,8 +1067,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // Iterate over list of callee-saved registers and emit .cfi_offset // directives. - for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) - CFIBuilder.buildOffset(CS.getReg(), MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) + for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); // Generate new FP. if (hasFP(MF)) { @@ -1069,7 +1089,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup, getStackAlign()); } - CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize()); + if (NeedsDwarfCFI) + CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize()); } uint64_t SecondSPAdjustAmount = 0; @@ -1080,15 +1101,16 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, "SecondSPAdjustAmount should be greater than zero"); allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount, - getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe, - ProbeSize, DynAllocation, MachineInstr::FrameSetup); + getStackSizeWithRVVPadding(MF), NeedsDwarfCFI && !hasFP(MF), + NeedProbe, ProbeSize, DynAllocation, + MachineInstr::FrameSetup); } if (RVVStackSize) { if (NeedProbe) { allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize, - MachineInstr::FrameSetup, !hasFP(MF), - DynAllocation); + MachineInstr::FrameSetup, + NeedsDwarfCFI && !hasFP(MF), DynAllocation); } else { // We must keep the stack pointer aligned through any intermediate // updates. @@ -1097,14 +1119,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup, getStackAlign()); } - if (!hasFP(MF)) { + if (NeedsDwarfCFI && !hasFP(MF)) { // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb". CFIBuilder.insertCFIInst(createDefCFAExpression( *RI, SPReg, getStackSizeWithRVVPadding(MF), RVVStackSize / 8)); } std::advance(MBBI, getRVVCalleeSavedInfo(MF, CSI).size()); - emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF)); + if (NeedsDwarfCFI) + emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF)); } if (hasFP(MF)) { @@ -1171,8 +1194,9 @@ void RISCVFrameLowering::deallocateStack(MachineFunction &MF, MachineInstr::FrameDestroy, getStackAlign()); StackSize = 0; - CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy) - .buildDefCFAOffset(CFAOffset); + if (needsDwarfCFI(MF)) + CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy) + .buildDefCFAOffset(CFAOffset); } void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, @@ -1212,6 +1236,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, std::next(MBBI, getRVVCalleeSavedInfo(MF, CSI).size()); CFIInstBuilder CFIBuilder(MBB, FirstScalarCSRRestoreInsn, MachineInstr::FrameDestroy); + bool NeedsDwarfCFI = needsDwarfCFI(MF); uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); uint64_t RealStackSize = FirstSPAdjustAmount ? FirstSPAdjustAmount @@ -1232,10 +1257,11 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, StackOffset::getScalable(RVVStackSize), MachineInstr::FrameDestroy, getStackAlign()); - if (!hasFP(MF)) - CFIBuilder.buildDefCFA(SPReg, RealStackSize); - - emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn); + if (NeedsDwarfCFI) { + if (!hasFP(MF)) + CFIBuilder.buildDefCFA(SPReg, RealStackSize); + emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn); + } } if (FirstSPAdjustAmount) { @@ -1251,7 +1277,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, StackOffset::getFixed(SecondSPAdjustAmount), MachineInstr::FrameDestroy, getStackAlign()); - if (!hasFP(MF)) + if (NeedsDwarfCFI && !hasFP(MF)) CFIBuilder.buildDefCFAOffset(FirstSPAdjustAmount); } @@ -1272,7 +1298,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, getStackAlign()); } - if (hasFP(MF)) + if (NeedsDwarfCFI && hasFP(MF)) CFIBuilder.buildDefCFA(SPReg, RealStackSize); // Skip to after the restores of scalar callee-saved registers @@ -1295,8 +1321,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, } // Recover callee-saved registers. - for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) - CFIBuilder.buildRestore(CS.getReg()); + if (NeedsDwarfCFI) + for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) + CFIBuilder.buildRestore(CS.getReg()); if (RVFI->isPushable(MF) && MBBI != MBB.end() && isPop(MBBI->getOpcode())) { // Use available stack adjustment in pop instruction to deallocate stack @@ -1315,15 +1342,17 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, auto NextI = next_nodbg(MBBI, MBB.end()); if (NextI == MBB.end() || NextI->getOpcode() != RISCV::PseudoRET) { ++MBBI; - CFIBuilder.setInsertPoint(MBBI); + if (NeedsDwarfCFI) { + CFIBuilder.setInsertPoint(MBBI); - for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) - CFIBuilder.buildRestore(CS.getReg()); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildRestore(CS.getReg()); - // Update CFA Offset. If this is a QCI interrupt function, there will be a - // leftover offset which is deallocated by `QC.C.MILEAVERET`, otherwise - // getQCIInterruptStackSize() will be 0. - CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize()); + // Update CFA Offset. If this is a QCI interrupt function, there will + // be a leftover offset which is deallocated by `QC.C.MILEAVERET`, + // otherwise getQCIInterruptStackSize() will be 0. + CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize()); + } } } @@ -1812,7 +1841,8 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( // allocateStack. bool DynAllocation = MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation(); - allocateStack(MBB, MI, MF, -Amount, -Amount, !hasFP(MF), + allocateStack(MBB, MI, MF, -Amount, -Amount, + needsDwarfCFI(MF) && !hasFP(MF), /*NeedProbe=*/true, ProbeSize, DynAllocation, MachineInstr::NoFlags); } else { diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp index c1f4d19..3bd2705 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp @@ -10,6 +10,10 @@ // instructions and masked instructions, so that we can reduce the live range // overlaps of mask registers. // +// If there are multiple masks producers followed by multiple masked +// instructions, then at each masked instructions add dependency edges between +// every producer and masked instruction. +// // The reason why we need to do this: // 1. When tracking register pressure, we don't track physical registers. // 2. We have a RegisterClass for mask register (which is `VMV0`), but we don't @@ -67,11 +71,27 @@ public: void apply(ScheduleDAGInstrs *DAG) override { SUnit *NearestUseV0SU = nullptr; + SmallVector<SUnit *, 2> DefMask; for (SUnit &SU : DAG->SUnits) { const MachineInstr *MI = SU.getInstr(); - if (MI->findRegisterUseOperand(RISCV::V0, TRI)) + bool UseV0 = MI->findRegisterUseOperand(RISCV::V0, TRI); + if (isSoleUseCopyToV0(SU) && !UseV0) + DefMask.push_back(&SU); + + if (UseV0) { NearestUseV0SU = &SU; + // Copy may not be a real use, so skip it here. + if (DefMask.size() > 1 && !MI->isCopy()) { + for (SUnit *Def : DefMask) + if (DAG->canAddEdge(Def, &SU)) + DAG->addEdge(Def, SDep(&SU, SDep::Artificial)); + } + + if (!DefMask.empty()) + DefMask.erase(DefMask.begin()); + } + if (NearestUseV0SU && NearestUseV0SU != &SU && isSoleUseCopyToV0(SU) && // For LMUL=8 cases, there will be more possibilities to spill. // FIXME: We should use RegPressureTracker to do fine-grained diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index 82e8ce4e..5d5a705 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -389,7 +389,7 @@ static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, if (!ArgFlags.isInConsecutiveRegsLast()) return true; - assert(PendingMembers.size() == 4 && "Should have two parts"); + assert(PendingMembers.size() == 4 && "Should have four parts"); int64_t Offset = State.AllocateStack(16, Align(16)); PendingMembers[0].convertToMem(Offset); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 57fbc71..9cd35e3 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -176,10 +176,10 @@ constexpr FeatureBitset FeaturesArrowlakeS = FeaturesArrowlake | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4; constexpr FeatureBitset FeaturesPantherlake = - FeaturesArrowlakeS ^ FeatureWIDEKL | FeaturePREFETCHI; + (FeaturesArrowlakeS ^ FeatureWIDEKL) | FeaturePREFETCHI; constexpr FeatureBitset FeaturesClearwaterforest = - FeaturesSierraforest ^ FeatureWIDEKL | FeatureAVXVNNIINT16 | FeatureSHA512 | - FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR; + (FeaturesSierraforest ^ FeatureWIDEKL) | FeatureAVXVNNIINT16 | + FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR; // Geode Processor. constexpr FeatureBitset FeaturesGeode = diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index fbeb721..a65d0fb 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1103,14 +1103,13 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP, SpillAlignment, E.first->getName() + Twine(".reload")); - TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(Def); TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(Def); // Try best to find dbg.declare. If the spill is a temp, there may not // be a direct dbg.declare. Walk up the load chain to find one from an // alias. if (F->getSubprogram()) { auto *CurDef = Def; - while (DIs.empty() && DVRs.empty() && isa<LoadInst>(CurDef)) { + while (DVRs.empty() && isa<LoadInst>(CurDef)) { auto *LdInst = cast<LoadInst>(CurDef); // Only consider ptr to ptr same type load. if (LdInst->getPointerOperandType() != LdInst->getType()) @@ -1118,12 +1117,11 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { CurDef = LdInst->getPointerOperand(); if (!isa<AllocaInst, LoadInst>(CurDef)) break; - DIs = findDbgDeclares(CurDef); DVRs = findDVRDeclares(CurDef); } } - auto SalvageOne = [&](auto *DDI) { + auto SalvageOne = [&](DbgVariableRecord *DDI) { // This dbg.declare is preserved for all coro-split function // fragments. It will be unreachable in the main function, and // processed by coro::salvageDebugInfo() by the Cloner. @@ -1137,7 +1135,6 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { // will be deleted in all coro-split functions. coro::salvageDebugInfo(ArgToAllocaMap, *DDI, false /*UseEntryValue*/); }; - for_each(DIs, SalvageOne); for_each(DVRs, SalvageOne); } @@ -1225,8 +1222,7 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { SmallVector<DbgVariableIntrinsic *, 4> DIs; SmallVector<DbgVariableRecord *> DbgVariableRecords; findDbgUsers(DIs, Alloca, &DbgVariableRecords); - for (auto *DVI : DIs) - DVI->replaceUsesOfWith(Alloca, G); + assert(DIs.empty() && "Should never see debug-intrinsics"); for (auto *DVR : DbgVariableRecords) DVR->replaceVariableLocationOp(Alloca, G); @@ -1922,48 +1918,6 @@ salvageDebugInfoImpl(SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, void coro::salvageDebugInfo( SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, - DbgVariableIntrinsic &DVI, bool UseEntryValue) { - - Function *F = DVI.getFunction(); - // Follow the pointer arithmetic all the way to the incoming - // function argument and convert into a DIExpression. - bool SkipOutermostLoad = !isa<DbgValueInst>(DVI); - Value *OriginalStorage = DVI.getVariableLocationOp(0); - - auto SalvagedInfo = - ::salvageDebugInfoImpl(ArgToAllocaMap, UseEntryValue, F, OriginalStorage, - DVI.getExpression(), SkipOutermostLoad); - if (!SalvagedInfo) - return; - - Value *Storage = &SalvagedInfo->first; - DIExpression *Expr = &SalvagedInfo->second; - - DVI.replaceVariableLocationOp(OriginalStorage, Storage); - DVI.setExpression(Expr); - // We only hoist dbg.declare today since it doesn't make sense to hoist - // dbg.value since it does not have the same function wide guarantees that - // dbg.declare does. - if (isa<DbgDeclareInst>(DVI)) { - std::optional<BasicBlock::iterator> InsertPt; - if (auto *I = dyn_cast<Instruction>(Storage)) { - InsertPt = I->getInsertionPointAfterDef(); - // Update DILocation only if variable was not inlined. - DebugLoc ILoc = I->getDebugLoc(); - DebugLoc DVILoc = DVI.getDebugLoc(); - if (ILoc && DVILoc && - DVILoc->getScope()->getSubprogram() == - ILoc->getScope()->getSubprogram()) - DVI.setDebugLoc(I->getDebugLoc()); - } else if (isa<Argument>(Storage)) - InsertPt = F->getEntryBlock().begin(); - if (InsertPt) - DVI.moveBefore(*(*InsertPt)->getParent(), *InsertPt); - } -} - -void coro::salvageDebugInfo( - SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, DbgVariableRecord &DVR, bool UseEntryValue) { Function *F = DVR.getFunction(); diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index b53c5a4..52f4ffe 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -34,16 +34,13 @@ void suppressCoroAllocs(CoroIdInst *CoroId); void suppressCoroAllocs(LLVMContext &Context, ArrayRef<CoroAllocInst *> CoroAllocs); -/// Attempts to rewrite the location operand of debug intrinsics in terms of +/// Attempts to rewrite the location operand of debug records in terms of /// the coroutine frame pointer, folding pointer offsets into the DIExpression /// of the intrinsic. /// If the frame pointer is an Argument, store it into an alloca to enhance the /// debugability. void salvageDebugInfo( SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, - DbgVariableIntrinsic &DVI, bool IsEntryPoint); -void salvageDebugInfo( - SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, DbgVariableRecord &DVR, bool UseEntryValue); // Keeps data and helper functions for lowering coroutine intrinsics. diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 5a8a41f..64b33e4 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -618,19 +618,15 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape, } } -/// Returns all DbgVariableIntrinsic in F. -static std::pair<SmallVector<DbgVariableIntrinsic *, 8>, - SmallVector<DbgVariableRecord *>> -collectDbgVariableIntrinsics(Function &F) { - SmallVector<DbgVariableIntrinsic *, 8> Intrinsics; +/// Returns all debug records in F. +static SmallVector<DbgVariableRecord *> +collectDbgVariableRecords(Function &F) { SmallVector<DbgVariableRecord *> DbgVariableRecords; for (auto &I : instructions(F)) { for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) DbgVariableRecords.push_back(&DVR); - if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) - Intrinsics.push_back(DVI); } - return {Intrinsics, DbgVariableRecords}; + return DbgVariableRecords; } void coro::BaseCloner::replaceSwiftErrorOps() { @@ -638,13 +634,11 @@ void coro::BaseCloner::replaceSwiftErrorOps() { } void coro::BaseCloner::salvageDebugInfo() { - auto [Worklist, DbgVariableRecords] = collectDbgVariableIntrinsics(*NewF); + auto DbgVariableRecords = collectDbgVariableRecords(*NewF); SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap; // Only 64-bit ABIs have a register we can refer to with the entry value. bool UseEntryValue = OrigF.getParent()->getTargetTriple().isArch64Bit(); - for (DbgVariableIntrinsic *DVI : Worklist) - coro::salvageDebugInfo(ArgToAllocaMap, *DVI, UseEntryValue); for (DbgVariableRecord *DVR : DbgVariableRecords) coro::salvageDebugInfo(ArgToAllocaMap, *DVR, UseEntryValue); @@ -655,7 +649,7 @@ void coro::BaseCloner::salvageDebugInfo() { return !isPotentiallyReachable(&NewF->getEntryBlock(), BB, nullptr, &DomTree); }; - auto RemoveOne = [&](auto *DVI) { + auto RemoveOne = [&](DbgVariableRecord *DVI) { if (IsUnreachableBlock(DVI->getParent())) DVI->eraseFromParent(); else if (isa_and_nonnull<AllocaInst>(DVI->getVariableLocationOp(0))) { @@ -669,7 +663,6 @@ void coro::BaseCloner::salvageDebugInfo() { DVI->eraseFromParent(); } }; - for_each(Worklist, RemoveOne); for_each(DbgVariableRecords, RemoveOne); } @@ -2022,9 +2015,7 @@ static void doSplitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones, // original function. The Cloner has already salvaged debug info in the new // coroutine funclets. SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap; - auto [DbgInsts, DbgVariableRecords] = collectDbgVariableIntrinsics(F); - for (auto *DDI : DbgInsts) - coro::salvageDebugInfo(ArgToAllocaMap, *DDI, false /*UseEntryValue*/); + auto DbgVariableRecords = collectDbgVariableRecords(F); for (DbgVariableRecord *DVR : DbgVariableRecords) coro::salvageDebugInfo(ArgToAllocaMap, *DVR, false /*UseEntryValue*/); diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp index 8017db1c..5fd5f7d 100644 --- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp @@ -514,7 +514,7 @@ void collectSpillsAndAllocasFromInsts( void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F, const SuspendCrossingInfo &Checker) { // We don't want the layout of coroutine frame to be affected - // by debug information. So we only choose to salvage DbgValueInst for + // by debug information. So we only choose to salvage dbg.values for // whose value is already in the frame. // We would handle the dbg.values for allocas specially for (auto &Iter : Spills) { @@ -522,9 +522,7 @@ void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F, SmallVector<DbgValueInst *, 16> DVIs; SmallVector<DbgVariableRecord *, 16> DVRs; findDbgValues(DVIs, V, &DVRs); - for (DbgValueInst *DVI : DVIs) - if (Checker.isDefinitionAcrossSuspend(*V, DVI)) - Spills[V].push_back(DVI); + assert(DVIs.empty()); // Add the instructions which carry debug info that is in the frame. for (DbgVariableRecord *DVR : DVRs) if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr)) diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp index d4555e9..f5525de 100644 --- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -572,7 +572,7 @@ void MergeFunctions::filterInstsUnrelatedToPDI( // Work out whether a dbg.value intrinsic or an equivalent DbgVariableRecord // is a parameter to be preserved. - auto ExamineDbgValue = [](auto *DbgVal, auto &Container) { + auto ExamineDbgValue = [&PDVRRelated](DbgVariableRecord *DbgVal) { LLVM_DEBUG(dbgs() << " Deciding: "); LLVM_DEBUG(DbgVal->print(dbgs())); LLVM_DEBUG(dbgs() << "\n"); @@ -581,7 +581,7 @@ void MergeFunctions::filterInstsUnrelatedToPDI( LLVM_DEBUG(dbgs() << " Include (parameter): "); LLVM_DEBUG(DbgVal->print(dbgs())); LLVM_DEBUG(dbgs() << "\n"); - Container.insert(DbgVal); + PDVRRelated.insert(DbgVal); } else { LLVM_DEBUG(dbgs() << " Delete (!parameter): "); LLVM_DEBUG(DbgVal->print(dbgs())); @@ -589,7 +589,8 @@ void MergeFunctions::filterInstsUnrelatedToPDI( } }; - auto ExamineDbgDeclare = [&PDIRelated](auto *DbgDecl, auto &Container) { + auto ExamineDbgDeclare = [&PDIRelated, + &PDVRRelated](DbgVariableRecord *DbgDecl) { LLVM_DEBUG(dbgs() << " Deciding: "); LLVM_DEBUG(DbgDecl->print(dbgs())); LLVM_DEBUG(dbgs() << "\n"); @@ -616,7 +617,7 @@ void MergeFunctions::filterInstsUnrelatedToPDI( LLVM_DEBUG(dbgs() << " Include: "); LLVM_DEBUG(DbgDecl->print(dbgs())); LLVM_DEBUG(dbgs() << "\n"); - Container.insert(DbgDecl); + PDVRRelated.insert(DbgDecl); } else { LLVM_DEBUG(dbgs() << " Delete (!parameter): "); LLVM_DEBUG(SI->print(dbgs())); @@ -647,18 +648,14 @@ void MergeFunctions::filterInstsUnrelatedToPDI( // they connected to parameters? for (DbgVariableRecord &DVR : filterDbgVars(BI->getDbgRecordRange())) { if (DVR.isDbgValue() || DVR.isDbgAssign()) { - ExamineDbgValue(&DVR, PDVRRelated); + ExamineDbgValue(&DVR); } else { assert(DVR.isDbgDeclare()); - ExamineDbgDeclare(&DVR, PDVRRelated); + ExamineDbgDeclare(&DVR); } } - if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) { - ExamineDbgValue(DVI, PDIRelated); - } else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) { - ExamineDbgDeclare(DDI, PDIRelated); - } else if (BI->isTerminator() && &*BI == GEntryBlock->getTerminator()) { + if (BI->isTerminator() && &*BI == GEntryBlock->getTerminator()) { LLVM_DEBUG(dbgs() << " Will Include Terminator: "); LLVM_DEBUG(BI->print(dbgs())); LLVM_DEBUG(dbgs() << "\n"); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 9d7c025..f7fbf08 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -825,9 +825,6 @@ public: Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned); bool tryToSinkInstruction(Instruction *I, BasicBlock *DestBlock); - void tryToSinkInstructionDbgValues( - Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock, - BasicBlock *DestBlock, SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers); void tryToSinkInstructionDbgVariableRecords( Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock, BasicBlock *DestBlock, SmallVectorImpl<DbgVariableRecord *> &DPUsers); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 684b9a1..503611a 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1420,21 +1420,16 @@ void InstCombinerImpl::freelyInvertAllUsersOf(Value *I, Value *IgnoredUser) { SmallVector<DbgValueInst *, 4> DbgValues; SmallVector<DbgVariableRecord *, 4> DbgVariableRecords; llvm::findDbgValues(DbgValues, I, &DbgVariableRecords); + assert(DbgValues.empty()); - auto InvertDbgValueUse = [&](auto *DbgVal) { + for (DbgVariableRecord *DbgVal : DbgVariableRecords) { SmallVector<uint64_t, 1> Ops = {dwarf::DW_OP_not}; for (unsigned Idx = 0, End = DbgVal->getNumVariableLocationOps(); Idx != End; ++Idx) if (DbgVal->getVariableLocationOp(Idx) == I) DbgVal->setExpression( DIExpression::appendOpsToArg(DbgVal->getExpression(), Ops, Idx)); - }; - - for (DbgValueInst *DVI : DbgValues) - InvertDbgValueUse(DVI); - - for (DbgVariableRecord *DVR : DbgVariableRecords) - InvertDbgValueUse(DVR); + } } /// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a @@ -3575,6 +3570,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { std::unique_ptr<DIBuilder> DIB; if (isa<AllocaInst>(MI)) { findDbgUsers(DVIs, &MI, &DVRs); + assert(DVIs.empty()); DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false)); } @@ -5253,8 +5249,7 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I, SmallVector<DbgVariableIntrinsic *, 2> DbgUsers; SmallVector<DbgVariableRecord *, 2> DbgVariableRecords; findDbgUsers(DbgUsers, I, &DbgVariableRecords); - if (!DbgUsers.empty()) - tryToSinkInstructionDbgValues(I, InsertPos, SrcBlock, DestBlock, DbgUsers); + assert(DbgUsers.empty()); if (!DbgVariableRecords.empty()) tryToSinkInstructionDbgVariableRecords(I, InsertPos, SrcBlock, DestBlock, DbgVariableRecords); @@ -5271,71 +5266,12 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I, return true; } -void InstCombinerImpl::tryToSinkInstructionDbgValues( - Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock, - BasicBlock *DestBlock, SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers) { - // For all debug values in the destination block, the sunk instruction - // will still be available, so they do not need to be dropped. - SmallVector<DbgVariableIntrinsic *, 2> DbgUsersToSalvage; - for (auto &DbgUser : DbgUsers) - if (DbgUser->getParent() != DestBlock) - DbgUsersToSalvage.push_back(DbgUser); - - // Process the sinking DbgUsersToSalvage in reverse order, as we only want - // to clone the last appearing debug intrinsic for each given variable. - SmallVector<DbgVariableIntrinsic *, 2> DbgUsersToSink; - for (DbgVariableIntrinsic *DVI : DbgUsersToSalvage) - if (DVI->getParent() == SrcBlock) - DbgUsersToSink.push_back(DVI); - llvm::sort(DbgUsersToSink, - [](auto *A, auto *B) { return B->comesBefore(A); }); - - SmallVector<DbgVariableIntrinsic *, 2> DIIClones; - SmallSet<DebugVariable, 4> SunkVariables; - for (auto *User : DbgUsersToSink) { - // A dbg.declare instruction should not be cloned, since there can only be - // one per variable fragment. It should be left in the original place - // because the sunk instruction is not an alloca (otherwise we could not be - // here). - if (isa<DbgDeclareInst>(User)) - continue; - - DebugVariable DbgUserVariable = - DebugVariable(User->getVariable(), User->getExpression(), - User->getDebugLoc()->getInlinedAt()); - - if (!SunkVariables.insert(DbgUserVariable).second) - continue; - - // Leave dbg.assign intrinsics in their original positions and there should - // be no need to insert a clone. - if (isa<DbgAssignIntrinsic>(User)) - continue; - - DIIClones.emplace_back(cast<DbgVariableIntrinsic>(User->clone())); - if (isa<DbgDeclareInst>(User) && isa<CastInst>(I)) - DIIClones.back()->replaceVariableLocationOp(I, I->getOperand(0)); - LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n'); - } - - // Perform salvaging without the clones, then sink the clones. - if (!DIIClones.empty()) { - salvageDebugInfoForDbgValues(*I, DbgUsersToSalvage, {}); - // The clones are in reverse order of original appearance, reverse again to - // maintain the original order. - for (auto &DIIClone : llvm::reverse(DIIClones)) { - DIIClone->insertBefore(InsertPos); - LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n'); - } - } -} - void InstCombinerImpl::tryToSinkInstructionDbgVariableRecords( Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock, BasicBlock *DestBlock, SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) { - // Implementation of tryToSinkInstructionDbgValues, but for the - // DbgVariableRecord of variable assignments rather than dbg.values. + // For all debug values in the destination block, the sunk instruction + // will still be available, so they do not need to be dropped. // Fetch all DbgVariableRecords not already in the destination. SmallVector<DbgVariableRecord *, 2> DbgVariableRecordsToSalvage; diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 2786d81..df31602 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1489,6 +1489,7 @@ static bool checkAndReplaceCondition( SmallVector<DbgVariableIntrinsic *> DbgUsers; SmallVector<DbgVariableRecord *> DVRUsers; findDbgUsers(DbgUsers, Cmp, &DVRUsers); + assert(DbgUsers.empty()); for (auto *DVR : DVRUsers) { auto *DTN = DT.getNode(DVR->getParent()); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 85dd9a1..0f63ed0 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -2079,6 +2079,7 @@ struct DSEState { AllocFnKind AllocKind = Attrs.getFnAttr(Attribute::AllocKind).getAllocKind() | AllocFnKind::Zeroed; + AllocKind &= ~AllocFnKind::Uninitialized; Attrs = Attrs.addFnAttribute(Ctx, Attribute::getWithAllocKind(Ctx, AllocKind)) .removeFnAttribute(Ctx, "alloc-variant-zeroed"); diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 8bff458..f6bf09d 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2367,11 +2367,14 @@ uint32_t GVNPass::ValueTable::phiTranslateImpl(const BasicBlock *Pred, // See if we can refine the value number by looking at the PN incoming value // for the given predecessor. if (PHINode *PN = NumberingPhi[Num]) { - if (PN->getParent() == PhiBlock) - for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) - if (PN->getIncomingBlock(I) == Pred) - if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false)) - return TransVal; + if (PN->getParent() != PhiBlock) + return Num; + for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) { + if (PN->getIncomingBlock(I) != Pred) + continue; + if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false)) + return TransVal; + } return Num; } diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b5dbef1..4d1f4407 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1979,15 +1979,13 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB, // Find debug values outside of the block findDbgValues(DbgValues, &I, &DbgVariableRecords); - llvm::erase_if(DbgValues, [&](const DbgValueInst *DbgVal) { - return DbgVal->getParent() == BB; - }); + assert(DbgValues.empty()); llvm::erase_if(DbgVariableRecords, [&](const DbgVariableRecord *DbgVarRec) { return DbgVarRec->getParent() == BB; }); // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty() && DbgValues.empty() && DbgVariableRecords.empty()) + if (UsesToRename.empty() && DbgVariableRecords.empty()) continue; LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); @@ -2000,8 +1998,7 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB, while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - if (!DbgValues.empty() || !DbgVariableRecords.empty()) { - SSAUpdate.UpdateDebugValues(&I, DbgValues); + if (!DbgVariableRecords.empty()) { SSAUpdate.UpdateDebugValues(&I, DbgVariableRecords); DbgValues.clear(); DbgVariableRecords.clear(); @@ -2032,32 +2029,7 @@ void JumpThreadingPass::cloneInstructions(ValueToValueMapTy &ValueMapping, // copy of the block 'NewBB'. If there are PHI nodes in the source basic // block, evaluate them to account for entry from PredBB. - // Retargets llvm.dbg.value to any renamed variables. - auto RetargetDbgValueIfPossible = [&](Instruction *NewInst) -> bool { - auto DbgInstruction = dyn_cast<DbgValueInst>(NewInst); - if (!DbgInstruction) - return false; - - SmallSet<std::pair<Value *, Value *>, 16> OperandsToRemap; - for (auto DbgOperand : DbgInstruction->location_ops()) { - auto DbgOperandInstruction = dyn_cast<Instruction>(DbgOperand); - if (!DbgOperandInstruction) - continue; - - auto I = ValueMapping.find(DbgOperandInstruction); - if (I != ValueMapping.end()) { - OperandsToRemap.insert( - std::pair<Value *, Value *>(DbgOperand, I->second)); - } - } - - for (auto &[OldOp, MappedOp] : OperandsToRemap) - DbgInstruction->replaceVariableLocationOp(OldOp, MappedOp); - return true; - }; - - // Duplicate implementation of the above dbg.value code, using - // DbgVariableRecords instead. + // Retargets dbg.value to any renamed variables. auto RetargetDbgVariableRecordIfPossible = [&](DbgVariableRecord *DVR) { SmallSet<std::pair<Value *, Value *>, 16> OperandsToRemap; for (auto *Op : DVR->location_ops()) { @@ -2116,9 +2088,6 @@ void JumpThreadingPass::cloneInstructions(ValueToValueMapTy &ValueMapping, if (const DebugLoc &DL = New->getDebugLoc()) mapAtomInstance(DL, ValueMapping); - if (RetargetDbgValueIfPossible(New)) - continue; - // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index a0f9f3c..70e9eee 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -78,6 +78,7 @@ enum class RuleTy { PerLoopCacheAnalysis, PerInstrOrderCost, ForVectorization, + Ignore }; } // end anonymous namespace @@ -106,14 +107,20 @@ static cl::list<RuleTy> Profitabilities( clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", - "Prioritize vectorization"))); + "Prioritize vectorization"), + clEnumValN(RuleTy::Ignore, "ignore", + "Ignore profitability, force interchange (does not " + "work with other options)"))); #ifndef NDEBUG -static bool noDuplicateRules(ArrayRef<RuleTy> Rules) { +static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) { SmallSet<RuleTy, 4> Set; - for (RuleTy Rule : Rules) + for (RuleTy Rule : Rules) { if (!Set.insert(Rule).second) return false; + if (Rule == RuleTy::Ignore) + return false; + } return true; } @@ -1357,6 +1364,13 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization( bool LoopInterchangeProfitability::isProfitable( const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) { + + // Return true if interchange is forced and the cost-model ignored. + if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore) + return true; + assert(noDuplicateRulesAndIgnore(Profitabilities) && + "Duplicate rules and option 'ignore' are not allowed"); + // isProfitable() is structured to avoid endless loop interchange. If the // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could // decide the profitability then, profitability check will stop and return the @@ -1365,7 +1379,6 @@ bool LoopInterchangeProfitability::isProfitable( // second highest priority rule (isProfitablePerInstrOrderCost by default). // Likewise, if it failed to analysis the profitability then only, the last // rule (isProfitableForVectorization by default) will decide. - assert(noDuplicateRules(Profitabilities) && "Detect duplicate rules"); std::optional<bool> shouldInterchange; for (RuleTy RT : Profitabilities) { switch (RT) { @@ -1382,6 +1395,9 @@ bool LoopInterchangeProfitability::isProfitable( shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); break; + case RuleTy::Ignore: + llvm_unreachable("Option 'ignore' is not supported with other options"); + break; } // If this rule could determine the profitability, don't call subsequent diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index dc8fa43..636bd81 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -6630,13 +6630,10 @@ struct SCEVDbgValueBuilder { /// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs /// and DIExpression. struct DVIRecoveryRec { - DVIRecoveryRec(DbgValueInst *DbgValue) - : DbgRef(DbgValue), Expr(DbgValue->getExpression()), - HadLocationArgList(false) {} DVIRecoveryRec(DbgVariableRecord *DVR) : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {} - PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef; + DbgVariableRecord *DbgRef; DIExpression *Expr; bool HadLocationArgList; SmallVector<WeakVH, 2> LocationOps; @@ -6695,44 +6692,38 @@ static void updateDVIWithLocations(T &DbgVal, } /// Write the new expression and new location ops for the dbg.value. If possible -/// reduce the szie of the dbg.value intrinsic by omitting DIArglist. This +/// reduce the szie of the dbg.value by omitting DIArglist. This /// can be omitted if: /// 1. There is only a single location, refenced by a single DW_OP_llvm_arg. /// 2. The DW_OP_LLVM_arg is the first operand in the expression. -static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec, - SmallVectorImpl<Value *> &NewLocationOps, - SmallVectorImpl<uint64_t> &NewExpr) { - auto UpdateDbgValueInstImpl = [&](auto *DbgVal) { - unsigned NumLLVMArgs = numLLVMArgOps(NewExpr); - if (NumLLVMArgs == 0) { - // Location assumed to be on the stack. - updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr); - } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) { - // There is only a single DW_OP_llvm_arg at the start of the expression, - // so it can be omitted along with DIArglist. - assert(NewExpr[1] == 0 && - "Lone LLVM_arg in a DIExpression should refer to location-op 0."); - llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2)); - updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps); - } else { - // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary. - updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr); - } +static void UpdateDbgValue(DVIRecoveryRec &DVIRec, + SmallVectorImpl<Value *> &NewLocationOps, + SmallVectorImpl<uint64_t> &NewExpr) { + DbgVariableRecord *DbgVal = DVIRec.DbgRef; + unsigned NumLLVMArgs = numLLVMArgOps(NewExpr); + if (NumLLVMArgs == 0) { + // Location assumed to be on the stack. + updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr); + } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) { + // There is only a single DW_OP_llvm_arg at the start of the expression, + // so it can be omitted along with DIArglist. + assert(NewExpr[1] == 0 && + "Lone LLVM_arg in a DIExpression should refer to location-op 0."); + llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2)); + updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps); + } else { + // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary. + updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr); + } - // If the DIExpression was previously empty then add the stack terminator. - // Non-empty expressions have only had elements inserted into them and so - // the terminator should already be present e.g. stack_value or fragment. - DIExpression *SalvageExpr = DbgVal->getExpression(); - if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) { - SalvageExpr = - DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value}); - DbgVal->setExpression(SalvageExpr); - } - }; - if (isa<DbgValueInst *>(DVIRec.DbgRef)) - UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef)); - else - UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef)); + // If the DIExpression was previously empty then add the stack terminator. + // Non-empty expressions have only had elements inserted into them and so + // the terminator should already be present e.g. stack_value or fragment. + DIExpression *SalvageExpr = DbgVal->getExpression(); + if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) { + SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value}); + DbgVal->setExpression(SalvageExpr); + } } /// Cached location ops may be erased during LSR, in which case a poison is @@ -6746,39 +6737,34 @@ static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) { /// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values. static void restorePreTransformState(DVIRecoveryRec &DVIRec) { - auto RestorePreTransformStateImpl = [&](auto *DbgVal) { - LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n" - << "scev-salvage: post-LSR: " << *DbgVal << '\n'); - assert(DVIRec.Expr && "Expected an expression"); - DbgVal->setExpression(DVIRec.Expr); - - // Even a single location-op may be inside a DIArgList and referenced with - // DW_OP_LLVM_arg, which is valid only with a DIArgList. - if (!DVIRec.HadLocationArgList) { - assert(DVIRec.LocationOps.size() == 1 && - "Unexpected number of location ops."); - // LSR's unsuccessful salvage attempt may have added DIArgList, which in - // this case was not present before, so force the location back to a - // single uncontained Value. - Value *CachedValue = - getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext()); - DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue)); - } else { - SmallVector<ValueAsMetadata *, 3> MetadataLocs; - for (WeakVH VH : DVIRec.LocationOps) { - Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext()); - MetadataLocs.push_back(ValueAsMetadata::get(CachedValue)); - } - auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs); - DbgVal->setRawLocation( - llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef)); + DbgVariableRecord *DbgVal = DVIRec.DbgRef; + LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n" + << "scev-salvage: post-LSR: " << *DbgVal << '\n'); + assert(DVIRec.Expr && "Expected an expression"); + DbgVal->setExpression(DVIRec.Expr); + + // Even a single location-op may be inside a DIArgList and referenced with + // DW_OP_LLVM_arg, which is valid only with a DIArgList. + if (!DVIRec.HadLocationArgList) { + assert(DVIRec.LocationOps.size() == 1 && + "Unexpected number of location ops."); + // LSR's unsuccessful salvage attempt may have added DIArgList, which in + // this case was not present before, so force the location back to a + // single uncontained Value. + Value *CachedValue = + getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext()); + DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue)); + } else { + SmallVector<ValueAsMetadata *, 3> MetadataLocs; + for (WeakVH VH : DVIRec.LocationOps) { + Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext()); + MetadataLocs.push_back(ValueAsMetadata::get(CachedValue)); } - LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n'); - }; - if (isa<DbgValueInst *>(DVIRec.DbgRef)) - RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef)); - else - RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef)); + auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs); + DbgVal->setRawLocation( + llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef)); + } + LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n'); } static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, @@ -6786,9 +6772,7 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr) { - if (isa<DbgValueInst *>(DVIRec.DbgRef) - ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation() - : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation()) + if (!DVIRec.DbgRef->isKillLocation()) return false; // LSR may have caused several changes to the dbg.value in the failed salvage @@ -6882,13 +6866,8 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, DbgBuilder->appendToVectors(NewExpr, NewLocationOps); } - UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr); - if (isa<DbgValueInst *>(DVIRec.DbgRef)) - LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " - << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n"); - else - LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " - << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n"); + UpdateDbgValue(DVIRec, NewLocationOps, NewExpr); + LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n"); return true; } @@ -6934,21 +6913,23 @@ static void DbgRewriteSalvageableDVIs( /// cacheing and salvaging. static void DbgGatherSalvagableDVI( Loop *L, ScalarEvolution &SE, - SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs, - SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) { + SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) { for (const auto &B : L->getBlocks()) { for (auto &I : *B) { - auto ProcessDbgValue = [&](auto *DbgVal) -> bool { + for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) { + if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign()) + continue; + // Ensure that if any location op is undef that the dbg.vlue is not // cached. - if (DbgVal->isKillLocation()) - return false; + if (DbgVal.isKillLocation()) + continue; // Check that the location op SCEVs are suitable for translation to // DIExpression. const auto &HasTranslatableLocationOps = - [&](const auto *DbgValToTranslate) -> bool { - for (const auto LocOp : DbgValToTranslate->location_ops()) { + [&](const DbgVariableRecord &DbgValToTranslate) -> bool { + for (const auto LocOp : DbgValToTranslate.location_ops()) { if (!LocOp) return false; @@ -6963,31 +6944,21 @@ static void DbgGatherSalvagableDVI( }; if (!HasTranslatableLocationOps(DbgVal)) - return false; + continue; std::unique_ptr<DVIRecoveryRec> NewRec = - std::make_unique<DVIRecoveryRec>(DbgVal); + std::make_unique<DVIRecoveryRec>(&DbgVal); // Each location Op may need a SCEVDbgValueBuilder in order to recover // it. Pre-allocating a vector will enable quick lookups of the builder // later during the salvage. - NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps()); - for (const auto LocOp : DbgVal->location_ops()) { + NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps()); + for (const auto LocOp : DbgVal.location_ops()) { NewRec->SCEVs.push_back(SE.getSCEV(LocOp)); NewRec->LocationOps.push_back(LocOp); - NewRec->HadLocationArgList = DbgVal->hasArgList(); + NewRec->HadLocationArgList = DbgVal.hasArgList(); } SalvageableDVISCEVs.push_back(std::move(NewRec)); - return true; - }; - for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { - if (DVR.isDbgValue() || DVR.isDbgAssign()) - ProcessDbgValue(&DVR); } - auto DVI = dyn_cast<DbgValueInst>(&I); - if (!DVI) - continue; - if (ProcessDbgValue(DVI)) - DVIHandles.insert(DVI); } } } @@ -7036,8 +7007,7 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // Debug preservation - before we start removing anything identify which DVI // meet the salvageable criteria and store their DIExpression and SCEVs. SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords; - SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles; - DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles); + DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords); bool Changed = false; std::unique_ptr<MemorySSAUpdater> MSSAU; @@ -7105,7 +7075,6 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, for (auto &Rec : SalvageableDVIRecords) Rec->clear(); SalvageableDVIRecords.clear(); - DVIHandles.clear(); return Changed; } diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index eacaf42..1d1af42 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1222,9 +1222,7 @@ static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) { SmallVector<DbgVariableIntrinsic *, 4> DbgUsers; SmallVector<DbgVariableRecord *, 4> DbgVariableRecords; findDbgUsers(DbgUsers, &I, &DbgVariableRecords); - for (DbgVariableIntrinsic *DVI : DbgUsers) - if (DVI->getFunction() != &F) - DVI->eraseFromParent(); + assert(DbgUsers.empty()); for (DbgVariableRecord *DVR : DbgVariableRecords) if (DVR->getFunction() != &F) DVR->eraseFromParent(); @@ -1289,14 +1287,12 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, SmallVector<DbgVariableIntrinsic *, 1> DbgUsers; SmallVector<DbgVariableRecord *, 1> DPUsers; findDbgUsers(DbgUsers, Input, &DPUsers); + assert(DbgUsers.empty()); DIExpression *Expr = DIB.createExpression(); // Iterate the debud users of the Input values. If they are in the extracted // function then update their location with the new value. If they are in // the parent function then create a similar debug record. - for (auto *DVI : DbgUsers) - UpdateOrInsertDebugRecord(DVI, Input, NewVal, Expr, - isa<DbgDeclareInst>(DVI)); for (auto *DVR : DPUsers) UpdateOrInsertDebugRecord(DVR, Input, NewVal, Expr, DVR->isDbgDeclare()); } diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index a1f030a..4210ce6 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -808,9 +808,6 @@ bool checkDebugifyMetadata(Module &M, // Find missing lines. for (Instruction &I : instructions(F)) { - if (isa<DbgValueInst>(&I)) - continue; - auto DL = I.getDebugLoc(); if (DL && DL.getLine() != 0) { MissingLines.reset(DL.getLine() - 1); @@ -839,10 +836,6 @@ bool checkDebugifyMetadata(Module &M, for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) if (DVR.isDbgValue() || DVR.isDbgAssign()) CheckForMisSized(&DVR); - auto *DVI = dyn_cast<DbgValueInst>(&I); - if (!DVI) - continue; - CheckForMisSized(DVI); } } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 6929d14..ed3dca2 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1978,14 +1978,13 @@ static at::StorageToVarsMap collectEscapedLocals(const DataLayout &DL, continue; // Find all local variables associated with the backing storage. - auto CollectAssignsForStorage = [&](auto *DbgAssign) { + auto CollectAssignsForStorage = [&](DbgVariableRecord *DbgAssign) { // Skip variables from inlined functions - they are not local variables. if (DbgAssign->getDebugLoc().getInlinedAt()) return; LLVM_DEBUG(errs() << " > DEF : " << *DbgAssign << "\n"); EscapedLocals[Base].insert(at::VarRecord(DbgAssign)); }; - for_each(at::getAssignmentMarkers(Base), CollectAssignsForStorage); for_each(at::getDVRAssignmentMarkers(Base), CollectAssignsForStorage); } return EscapedLocals; diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index b14bbea..d481ad9 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -613,11 +613,10 @@ bool llvm::replaceDbgUsesWithUndef(Instruction *I) { SmallVector<DbgVariableIntrinsic *, 1> DbgUsers; SmallVector<DbgVariableRecord *, 1> DPUsers; findDbgUsers(DbgUsers, I, &DPUsers); - for (auto *DII : DbgUsers) - DII->setKillLocation(); + assert(DbgUsers.empty()); for (auto *DVR : DPUsers) DVR->setKillLocation(); - return !DbgUsers.empty() || !DPUsers.empty(); + return !DPUsers.empty(); } /// areAllUsesEqual - Check whether the uses of a value are all the same. @@ -1607,12 +1606,8 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar, SmallVector<DbgValueInst *, 1> DbgValues; SmallVector<DbgVariableRecord *, 1> DbgVariableRecords; findDbgValues(DbgValues, APN, &DbgVariableRecords); - for (auto *DVI : DbgValues) { - assert(is_contained(DVI->getValues(), APN)); - if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr)) - return true; - } - for (auto *DVR : DbgVariableRecords) { + assert(DbgValues.empty()); + for (DbgVariableRecord *DVR : DbgVariableRecords) { assert(is_contained(DVR->location_ops(), APN)); if ((DVR->getVariable() == DIVar) && (DVR->getExpression() == DIExpr)) return true; @@ -1971,7 +1966,6 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, static void updateOneDbgValueForAlloca(const DebugLoc &Loc, DILocalVariable *DIVar, DIExpression *DIExpr, Value *NewAddress, - DbgValueInst *DVI, DbgVariableRecord *DVR, DIBuilder &Builder, int Offset) { assert(DIVar && "Missing variable"); @@ -1987,14 +1981,8 @@ static void updateOneDbgValueForAlloca(const DebugLoc &Loc, if (Offset) DIExpr = DIExpression::prepend(DIExpr, 0, Offset); - if (DVI) { - DVI->setExpression(DIExpr); - DVI->replaceVariableLocationOp(0u, NewAddress); - } else { - assert(DVR); - DVR->setExpression(DIExpr); - DVR->replaceVariableLocationOp(0u, NewAddress); - } + DVR->setExpression(DIExpr); + DVR->replaceVariableLocationOp(0u, NewAddress); } void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress, @@ -2002,18 +1990,13 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress, SmallVector<DbgValueInst *, 1> DbgUsers; SmallVector<DbgVariableRecord *, 1> DPUsers; findDbgValues(DbgUsers, AI, &DPUsers); - - // Attempt to replace dbg.values that use this alloca. - for (auto *DVI : DbgUsers) - updateOneDbgValueForAlloca(DVI->getDebugLoc(), DVI->getVariable(), - DVI->getExpression(), NewAllocaAddress, DVI, - nullptr, Builder, Offset); + assert(DbgUsers.empty()); // Replace any DbgVariableRecords that use this alloca. for (DbgVariableRecord *DVR : DPUsers) updateOneDbgValueForAlloca(DVR->getDebugLoc(), DVR->getVariable(), - DVR->getExpression(), NewAllocaAddress, nullptr, - DVR, Builder, Offset); + DVR->getExpression(), NewAllocaAddress, DVR, + Builder, Offset); } /// Where possible to salvage debug information for \p I do so. @@ -2022,6 +2005,7 @@ void llvm::salvageDebugInfo(Instruction &I) { SmallVector<DbgVariableIntrinsic *, 1> DbgUsers; SmallVector<DbgVariableRecord *, 1> DPUsers; findDbgUsers(DbgUsers, &I, &DPUsers); + assert(DbgUsers.empty()); salvageDebugInfoForDbgValues(I, DbgUsers, DPUsers); } @@ -2070,66 +2054,9 @@ void llvm::salvageDebugInfoForDbgValues( const unsigned MaxExpressionSize = 128; bool Salvaged = false; - for (auto *DII : DbgUsers) { - if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(DII)) { - if (DAI->getAddress() == &I) { - salvageDbgAssignAddress(DAI); - Salvaged = true; - } - if (DAI->getValue() != &I) - continue; - } + // We should never see debug intrinsics nowadays. + assert(DbgUsers.empty()); - // Do not add DW_OP_stack_value for DbgDeclare, because they are implicitly - // pointing out the value as a DWARF memory location description. - bool StackValue = isa<DbgValueInst>(DII); - auto DIILocation = DII->location_ops(); - assert( - is_contained(DIILocation, &I) && - "DbgVariableIntrinsic must use salvaged instruction as its location"); - SmallVector<Value *, 4> AdditionalValues; - // `I` may appear more than once in DII's location ops, and each use of `I` - // must be updated in the DIExpression and potentially have additional - // values added; thus we call salvageDebugInfoImpl for each `I` instance in - // DIILocation. - Value *Op0 = nullptr; - DIExpression *SalvagedExpr = DII->getExpression(); - auto LocItr = find(DIILocation, &I); - while (SalvagedExpr && LocItr != DIILocation.end()) { - SmallVector<uint64_t, 16> Ops; - unsigned LocNo = std::distance(DIILocation.begin(), LocItr); - uint64_t CurrentLocOps = SalvagedExpr->getNumLocationOperands(); - Op0 = salvageDebugInfoImpl(I, CurrentLocOps, Ops, AdditionalValues); - if (!Op0) - break; - SalvagedExpr = - DIExpression::appendOpsToArg(SalvagedExpr, Ops, LocNo, StackValue); - LocItr = std::find(++LocItr, DIILocation.end(), &I); - } - // salvageDebugInfoImpl should fail on examining the first element of - // DbgUsers, or none of them. - if (!Op0) - break; - - SalvagedExpr = SalvagedExpr->foldConstantMath(); - DII->replaceVariableLocationOp(&I, Op0); - bool IsValidSalvageExpr = SalvagedExpr->getNumElements() <= MaxExpressionSize; - if (AdditionalValues.empty() && IsValidSalvageExpr) { - DII->setExpression(SalvagedExpr); - } else if (isa<DbgValueInst>(DII) && IsValidSalvageExpr && - DII->getNumVariableLocationOps() + AdditionalValues.size() <= - MaxDebugArgs) { - DII->addVariableLocationOps(AdditionalValues, SalvagedExpr); - } else { - // Do not salvage using DIArgList for dbg.declare, as it is not currently - // supported in those instructions. Also do not salvage if the resulting - // DIArgList would contain an unreasonably large number of values. - DII->setKillLocation(); - } - LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n'); - Salvaged = true; - } - // Duplicate of above block for DbgVariableRecords. for (auto *DVR : DPUsers) { if (DVR->isDbgAssign()) { if (DVR->getAddress() == &I) { @@ -2198,9 +2125,6 @@ void llvm::salvageDebugInfoForDbgValues( if (Salvaged) return; - for (auto *DII : DbgUsers) - DII->setKillLocation(); - for (auto *DVR : DPUsers) DVR->setKillLocation(); } @@ -3429,8 +3353,7 @@ void llvm::dropDebugUsers(Instruction &I) { SmallVector<DbgVariableIntrinsic *, 1> DbgUsers; SmallVector<DbgVariableRecord *, 1> DPUsers; findDbgUsers(DbgUsers, &I, &DPUsers); - for (auto *DII : DbgUsers) - DII->eraseFromParent(); + assert(DbgUsers.empty()); for (auto *DVR : DPUsers) DVR->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 66d0573..06115e0 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -161,29 +161,8 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, SmallVector<DbgValueInst *, 1> DbgValues; SmallVector<DbgVariableRecord *, 1> DbgVariableRecords; llvm::findDbgValues(DbgValues, OrigHeaderVal, &DbgVariableRecords); - for (auto &DbgValue : DbgValues) { - // The original users in the OrigHeader are already using the original - // definitions. - BasicBlock *UserBB = DbgValue->getParent(); - if (UserBB == OrigHeader) - continue; - - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped and anything else can be handled by - // the SSAUpdater. To avoid adding PHINodes, check if the value is - // available in UserBB, if not substitute poison. - Value *NewVal; - if (UserBB == OrigPreheader) - NewVal = OrigPreHeaderVal; - else if (SSA.HasValueForBlock(UserBB)) - NewVal = SSA.GetValueInMiddleOfBlock(UserBB); - else - NewVal = PoisonValue::get(OrigHeaderVal->getType()); - DbgValue->replaceVariableLocationOp(OrigHeaderVal, NewVal); - } + assert(DbgValues.empty()); - // RemoveDIs: duplicate implementation for non-instruction debug-info - // storage in DbgVariableRecords. for (DbgVariableRecord *DVR : DbgVariableRecords) { // The original users in the OrigHeader are already using the original // definitions. diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index ccd7ee3..73b5f48 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -190,7 +190,6 @@ public: }; struct AllocaInfo { - using DbgUserVec = SmallVector<DbgVariableIntrinsic *, 1>; using DPUserVec = SmallVector<DbgVariableRecord *, 1>; SmallVector<BasicBlock *, 32> DefiningBlocks; @@ -201,7 +200,6 @@ struct AllocaInfo { bool OnlyUsedInOneBlock; /// Debug users of the alloca - does not include dbg.assign intrinsics. - DbgUserVec DbgUsers; DPUserVec DPUsers; /// Helper to update assignment tracking debug info. AssignmentTrackingInfo AssignmentTracking; @@ -212,7 +210,6 @@ struct AllocaInfo { OnlyStore = nullptr; OnlyBlock = nullptr; OnlyUsedInOneBlock = true; - DbgUsers.clear(); DPUsers.clear(); AssignmentTracking.clear(); } @@ -246,13 +243,10 @@ struct AllocaInfo { OnlyUsedInOneBlock = false; } } - DbgUserVec AllDbgUsers; + SmallVector<DbgVariableIntrinsic *> AllDbgUsers; SmallVector<DbgVariableRecord *> AllDPUsers; findDbgUsers(AllDbgUsers, AI, &AllDPUsers); - std::copy_if(AllDbgUsers.begin(), AllDbgUsers.end(), - std::back_inserter(DbgUsers), [](DbgVariableIntrinsic *DII) { - return !isa<DbgAssignIntrinsic>(DII); - }); + assert(AllDbgUsers.empty()); std::copy_if(AllDPUsers.begin(), AllDPUsers.end(), std::back_inserter(DPUsers), [](DbgVariableRecord *DVR) { return !DVR->isDbgAssign(); }); @@ -380,10 +374,9 @@ struct PromoteMem2Reg { /// to. DenseMap<PHINode *, unsigned> PhiToAllocaMap; - /// For each alloca, we keep track of the dbg.declare intrinsic that + /// For each alloca, we keep track of the dbg.declare record that /// describes it, if any, so that we can convert it to a dbg.value - /// intrinsic if the alloca gets promoted. - SmallVector<AllocaInfo::DbgUserVec, 8> AllocaDbgUsers; + /// record if the alloca gets promoted. SmallVector<AllocaInfo::DPUserVec, 8> AllocaDPUsers; /// For each alloca, keep an instance of a helper class that gives us an easy @@ -741,14 +734,11 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, AI->eraseFromParent(); // The alloca's debuginfo can be removed as well. - auto DbgUpdateForAlloca = [&](auto &Container) { - for (auto *DbgItem : Container) - if (DbgItem->isAddressOfVariable() || - DbgItem->getExpression()->startsWithDeref()) - DbgItem->eraseFromParent(); - }; - DbgUpdateForAlloca(Info.DbgUsers); - DbgUpdateForAlloca(Info.DPUsers); + for (DbgVariableRecord *DbgItem : Info.DPUsers) { + if (DbgItem->isAddressOfVariable() || + DbgItem->getExpression()->startsWithDeref()) + DbgItem->eraseFromParent(); + } ++NumLocalPromoted; return true; @@ -757,7 +747,6 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, void PromoteMem2Reg::run() { Function &F = *DT.getRoot()->getParent(); - AllocaDbgUsers.resize(Allocas.size()); AllocaATInfo.resize(Allocas.size()); AllocaDPUsers.resize(Allocas.size()); @@ -816,9 +805,7 @@ void PromoteMem2Reg::run() { if (BBNumPreds.empty()) BBNumPreds.resize(F.getMaxBlockNumber()); - // Remember the dbg.declare intrinsic describing this alloca, if any. - if (!Info.DbgUsers.empty()) - AllocaDbgUsers[AllocaNum] = Info.DbgUsers; + // Remember the dbg.declare record describing this alloca, if any. if (!Info.AssignmentTracking.empty()) AllocaATInfo[AllocaNum] = Info.AssignmentTracking; if (!Info.DPUsers.empty()) @@ -894,16 +881,12 @@ void PromoteMem2Reg::run() { } // Remove alloca's dbg.declare intrinsics from the function. - auto RemoveDbgDeclares = [&](auto &Container) { - for (auto &DbgUsers : Container) { - for (auto *DbgItem : DbgUsers) - if (DbgItem->isAddressOfVariable() || - DbgItem->getExpression()->startsWithDeref()) - DbgItem->eraseFromParent(); - } - }; - RemoveDbgDeclares(AllocaDbgUsers); - RemoveDbgDeclares(AllocaDPUsers); + for (auto &DbgUsers : AllocaDPUsers) { + for (DbgVariableRecord *DbgItem : DbgUsers) + if (DbgItem->isAddressOfVariable() || + DbgItem->getExpression()->startsWithDeref()) + DbgItem->eraseFromParent(); + } // Loop over all of the PHI nodes and see if there are any that we can get // rid of because they merge all of the same incoming values. This can diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 5db7fc9..561c898 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -200,11 +200,7 @@ void SSAUpdater::UpdateDebugValues(Instruction *I) { SmallVector<DbgValueInst *, 4> DbgValues; SmallVector<DbgVariableRecord *, 4> DbgVariableRecords; llvm::findDbgValues(DbgValues, I, &DbgVariableRecords); - for (auto &DbgValue : DbgValues) { - if (DbgValue->getParent() == I->getParent()) - continue; - UpdateDebugValue(I, DbgValue); - } + assert(DbgValues.empty()); for (auto &DVR : DbgVariableRecords) { if (DVR->getParent() == I->getParent()) continue; @@ -212,13 +208,6 @@ void SSAUpdater::UpdateDebugValues(Instruction *I) { } } -void SSAUpdater::UpdateDebugValues(Instruction *I, - SmallVectorImpl<DbgValueInst *> &DbgValues) { - for (auto &DbgValue : DbgValues) { - UpdateDebugValue(I, DbgValue); - } -} - void SSAUpdater::UpdateDebugValues( Instruction *I, SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) { for (auto &DVR : DbgVariableRecords) { @@ -226,15 +215,6 @@ void SSAUpdater::UpdateDebugValues( } } -void SSAUpdater::UpdateDebugValue(Instruction *I, DbgValueInst *DbgValue) { - BasicBlock *UserBB = DbgValue->getParent(); - if (HasValueForBlock(UserBB)) { - Value *NewVal = GetValueAtEndOfBlock(UserBB); - DbgValue->replaceVariableLocationOp(I, NewVal); - } else - DbgValue->setKillLocation(); -} - void SSAUpdater::UpdateDebugValue(Instruction *I, DbgVariableRecord *DVR) { BasicBlock *UserBB = DVR->getParent(); if (HasValueForBlock(UserBB)) { diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index a75f290..75c9650 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7493,7 +7493,7 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, SmallPtrSet<PHINode *, 8> Phis; SmallPtrSet<BasicBlock *, 8> Seen; DenseMap<PHINode *, SmallDenseMap<BasicBlock *, Value *, 8>> PhiPredIVs; - DenseMap<BasicBlock *, SmallVector<unsigned, 4>> BBToSuccessorIndexes; + DenseMap<BasicBlock *, SmallVector<unsigned, 32>> BBToSuccessorIndexes; SmallVector<SwitchSuccWrapper> Cases; Cases.reserve(SI->getNumSuccessors()); @@ -7505,12 +7505,6 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, if (BB->size() != 1) continue; - // FIXME: This case needs some extra care because the terminators other than - // SI need to be updated. For now, consider only backedges to the SI. - if (BB->hasNPredecessorsOrMore(4) || - BB->getUniquePredecessor() != SI->getParent()) - continue; - // FIXME: Relax that the terminator is a BranchInst by checking for equality // on other kinds of terminators. We decide to only support unconditional // branches for now for compile time reasons. @@ -7518,14 +7512,24 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, if (!BI || BI->isConditional()) continue; - if (Seen.insert(BB).second) { - // Keep track of which PHIs we need as keys in PhiPredIVs below. - for (BasicBlock *Succ : BI->successors()) - Phis.insert_range(llvm::make_pointer_range(Succ->phis())); - // Add the successor only if not previously visited. - Cases.emplace_back(SwitchSuccWrapper{BB, &PhiPredIVs}); + if (!Seen.insert(BB).second) { + auto It = BBToSuccessorIndexes.find(BB); + if (It != BBToSuccessorIndexes.end()) + It->second.emplace_back(I); + continue; } + // FIXME: This case needs some extra care because the terminators other than + // SI need to be updated. For now, consider only backedges to the SI. + if (BB->getUniquePredecessor() != SI->getParent()) + continue; + + // Keep track of which PHIs we need as keys in PhiPredIVs below. + for (BasicBlock *Succ : BI->successors()) + Phis.insert_range(llvm::make_pointer_range(Succ->phis())); + + // Add the successor only if not previously visited. + Cases.emplace_back(SwitchSuccWrapper{BB, &PhiPredIVs}); BBToSuccessorIndexes[BB].emplace_back(I); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ceeabd65..f142e07 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -947,9 +947,8 @@ public: /// user options, for the given register kind. bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind); - /// \return True if maximizing vector bandwidth is enabled by the target or - /// user options, for the given vector factor. - bool useMaxBandwidth(ElementCount VF); + /// \return True if register pressure should be calculated for the given VF. + bool shouldCalculateRegPressureForVF(ElementCount VF); /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as @@ -1736,6 +1735,9 @@ public: /// Whether this loop should be optimized for size based on function attribute /// or profile information. bool OptForSize; + + /// The highest VF possible for this loop, without using MaxBandwidth. + FixedScalableVFPair MaxPermissibleVFWithoutMaxBW; }; } // end namespace llvm @@ -3832,10 +3834,16 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return FixedScalableVFPair::getNone(); } -bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) { - return useMaxBandwidth(VF.isScalable() - ? TargetTransformInfo::RGK_ScalableVector - : TargetTransformInfo::RGK_FixedWidthVector); +bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF( + ElementCount VF) { + if (!useMaxBandwidth(VF.isScalable() + ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector)) + return false; + // Only calculate register pressure for VFs enabled by MaxBandwidth. + return ElementCount::isKnownGT( + VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF + : MaxPermissibleVFWithoutMaxBW.FixedVF); } bool LoopVectorizationCostModel::useMaxBandwidth( @@ -3911,6 +3919,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector; ElementCount MaxVF = MaxVectorElementCount; + + if (MaxVF.isScalable()) + MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF; + else + MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF; + if (useMaxBandwidth(RegKind)) { auto MaxVectorElementCountMaxBW = ElementCount::get( llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), @@ -4264,9 +4278,11 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { if (VF.isScalar()) continue; - /// Don't consider the VF if it exceeds the number of registers for the - /// target. - if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) + /// If the register pressure needs to be considered for VF, + /// don't consider the VF as valid if it exceeds the number + /// of registers for the target. + if (CM.shouldCalculateRegPressureForVF(VF) && + RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) continue; InstructionCost C = CM.expectedCost(VF); @@ -7044,7 +7060,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { InstructionCost Cost = cost(*P, VF); VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); - if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) { + if (CM.shouldCalculateRegPressureForVF(VF) && + RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) { LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width " << VF << " because it uses too many registers\n"); continue; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index b27a7ff..ca8729a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -404,9 +404,12 @@ static unsigned getVFScaleFactor(VPRecipeBase *R) { return 1; } -bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI) const { - return any_of(MaxLocalUsers, [&TTI](auto &LU) { - return LU.second > TTI.getNumberOfRegisters(LU.first); +bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI, + unsigned OverrideMaxNumRegs) const { + return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) { + return LU.second > (OverrideMaxNumRegs > 0 + ? OverrideMaxNumRegs + : TTI.getNumberOfRegisters(LU.first)); }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index 7bcf9db..cd86d27 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -85,8 +85,10 @@ struct VPRegisterUsage { SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; /// Check if any of the tracked live intervals exceeds the number of - /// available registers for the target. - bool exceedsMaxNumRegs(const TargetTransformInfo &TTI) const; + /// available registers for the target. If non-zero, OverrideMaxNumRegs + /// is used in place of the target's number of registers. + bool exceedsMaxNumRegs(const TargetTransformInfo &TTI, + unsigned OverrideMaxNumRegs = 0) const; }; /// Estimate the register usage for \p Plan and vectorization factors in \p VFs |