Diffstat (limited to 'llvm/lib/Transforms')
33 files changed, 571 insertions, 323 deletions
diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp index b73a0ce..4645670 100644 --- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp +++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp @@ -147,7 +147,7 @@ public: private: // Only add checks if the module has the cfguard=2 flag. - int cfguard_module_flag = 0; + int CFGuardModuleFlag = 0; StringRef GuardFnName; Mechanism GuardMechanism = Mechanism::Check; FunctionType *GuardFnType = nullptr; @@ -162,9 +162,7 @@ public: static char ID; // Default constructor required for the INITIALIZE_PASS macro. - CFGuard(CFGuardImpl::Mechanism M) : FunctionPass(ID), Impl(M) { - initializeCFGuardPass(*PassRegistry::getPassRegistry()); - } + CFGuard(CFGuardImpl::Mechanism M) : FunctionPass(ID), Impl(M) {} bool doInitialization(Module &M) override { return Impl.doInitialization(M); } bool runOnFunction(Function &F) override { return Impl.runOnFunction(F); } @@ -173,7 +171,6 @@ public: } // end anonymous namespace void CFGuardImpl::insertCFGuardCheck(CallBase *CB) { - assert(CB->getModule()->getTargetTriple().isOSWindows() && "Only applicable for Windows targets"); assert(CB->isIndirectCall() && @@ -202,7 +199,6 @@ void CFGuardImpl::insertCFGuardCheck(CallBase *CB) { } void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) { - assert(CB->getModule()->getTargetTriple().isOSWindows() && "Only applicable for Windows targets"); assert(CB->isIndirectCall() && @@ -236,14 +232,13 @@ void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) { } bool CFGuardImpl::doInitialization(Module &M) { - // Check if this module has the cfguard flag and read its value. if (auto *MD = mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard"))) - cfguard_module_flag = MD->getZExtValue(); + CFGuardModuleFlag = MD->getZExtValue(); // Skip modules for which CFGuard checks have been disabled. - if (cfguard_module_flag != 2) + if (CFGuardModuleFlag != 2) return false; // Set up prototypes for the guard check and dispatch functions. @@ -264,9 +259,8 @@ bool CFGuardImpl::doInitialization(Module &M) { } bool CFGuardImpl::runOnFunction(Function &F) { - // Skip modules for which CFGuard checks have been disabled. - if (cfguard_module_flag != 2) + if (CFGuardModuleFlag != 2) return false; SmallVector<CallBase *, 8> IndirectCalls; @@ -286,19 +280,16 @@ bool CFGuardImpl::runOnFunction(Function &F) { } // If no checks are needed, return early. - if (IndirectCalls.empty()) { + if (IndirectCalls.empty()) return false; - } // For each indirect call/invoke, add the appropriate dispatch or check. 
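The CFGuard hunks above only instrument a module when doInitialization finds the cfguard module flag set to 2. A minimal sketch of that check, using the same Module metadata API the diff relies on (the helper name is invented):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"

// Returns true only when the front end recorded cfguard=2, i.e. Control Flow
// Guard checks are requested and have not been disabled for this module.
static bool cfGuardChecksRequested(const llvm::Module &M) {
  auto *MD = llvm::mdconst::extract_or_null<llvm::ConstantInt>(
      M.getModuleFlag("cfguard"));
  return MD && MD->getZExtValue() == 2;
}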
if (GuardMechanism == Mechanism::Dispatch) { - for (CallBase *CB : IndirectCalls) { + for (CallBase *CB : IndirectCalls) insertCFGuardDispatch(CB); - } } else { - for (CallBase *CB : IndirectCalls) { + for (CallBase *CB : IndirectCalls) insertCFGuardCheck(CB); - } } return true; diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp index f166fef..cf7e450 100644 --- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp @@ -153,26 +153,23 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C, bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine(); bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe); if (IsCallerPresplitCoroutine && HasAttr) { - BranchProbability MinBranchProbability( - static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution), - MinBlockCounterExecution); - auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller); - auto Prob = BranchProbability::getBranchProbability( - BFI.getBlockFreq(CB->getParent()).getFrequency(), - BFI.getEntryFreq().getFrequency()); + auto BlockFreq = BFI.getBlockFreq(CB->getParent()).getFrequency(); + auto EntryFreq = BFI.getEntryFreq().getFrequency(); + uint64_t MinFreq = + static_cast<uint64_t>(EntryFreq * CoroElideBranchRatio); - if (Prob < MinBranchProbability) { + if (BlockFreq < MinFreq) { ORE.emit([&]() { return OptimizationRemarkMissed( DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller) << "'" << ore::NV("callee", Callee->getName()) << "' not elided in '" << ore::NV("caller", Caller->getName()) - << "' because of low probability: " - << ore::NV("probability", Prob) << " (threshold: " - << ore::NV("threshold", MinBranchProbability) << ")"; + << "' because of low frequency: " + << ore::NV("block_freq", BlockFreq) + << " (threshold: " << ore::NV("min_freq", MinFreq) << ")"; }); continue; } @@ -188,7 +185,8 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C, return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller) << "'" << ore::NV("callee", Callee->getName()) << "' elided in '" << ore::NV("caller", Caller->getName()) - << "' (probability: " << ore::NV("probability", Prob) << ")"; + << "' (block_freq: " << ore::NV("block_freq", BlockFreq) + << ")"; }); FAM.invalidate(*Caller, PreservedAnalyses::none()); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 5066a99..894d83f 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -6150,3 +6150,42 @@ void MemProfContextDisambiguation::run( IndexCallsiteContextGraph CCG(Index, isPrevailing); CCG.process(); } + +// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline +// when we don't have an index that has recorded that we are linking with +// allocation libraries containing the necessary APIs for downstream +// transformations. +PreservedAnalyses MemProfRemoveInfo::run(Module &M, ModuleAnalysisManager &AM) { + // The profile matcher applies hotness attributes directly for allocations, + // and those will cause us to generate calls to the hot/cold interfaces + // unconditionally. If supports-hot-cold-new was not enabled in the LTO + // link then assume we don't want these calls (e.g. not linking with + // the appropriate library, or otherwise trying to disable this behavior). 
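For the CoroAnnotationElide hunk above: the old code built a BranchProbability from the call site's block frequency, while the new code compares raw frequencies directly against a fraction of the entry frequency. A rough standalone sketch of that check (isCallSiteCold is an invented helper; the ratio parameter stands in for the existing CoroElideBranchRatio option):

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/IR/InstrTypes.h"
#include <cstdint>

// A call site is "too cold to elide" when its block runs less often than the
// requested fraction of the function entry frequency.
static bool isCallSiteCold(const llvm::CallBase &CB,
                           llvm::BlockFrequencyInfo &BFI, double Ratio) {
  uint64_t BlockFreq = BFI.getBlockFreq(CB.getParent()).getFrequency();
  uint64_t EntryFreq = BFI.getEntryFreq().getFrequency();
  uint64_t MinFreq = static_cast<uint64_t>(EntryFreq * Ratio);
  return BlockFreq < MinFreq;
}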
+ bool Changed = false; + for (auto &F : M) { + for (auto &BB : F) { + for (auto &I : BB) { + auto *CI = dyn_cast<CallBase>(&I); + if (!CI) + continue; + if (CI->hasFnAttr("memprof")) { + CI->removeFnAttr("memprof"); + Changed = true; + } + if (!CI->hasMetadata(LLVMContext::MD_callsite)) { + assert(!CI->hasMetadata(LLVMContext::MD_memprof)); + continue; + } + // Strip off all memprof metadata as it is no longer needed. + // Importantly, this avoids the addition of new memprof attributes + // after inlining propagation. + CI->setMetadata(LLVMContext::MD_memprof, nullptr); + CI->setMetadata(LLVMContext::MD_callsite, nullptr); + Changed = true; + } + } + } + if (!Changed) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 4c9b10a..cdc559b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -156,9 +156,9 @@ Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) { Value *Src = CI.getOperand(0); Type *Ty = CI.getType(); - if (auto *SrcC = dyn_cast<Constant>(Src)) - if (Constant *Res = ConstantFoldCastOperand(CI.getOpcode(), SrcC, Ty, DL)) - return replaceInstUsesWith(CI, Res); + if (Value *Res = + simplifyCastInst(CI.getOpcode(), Src, Ty, SQ.getWithInstruction(&CI))) + return replaceInstUsesWith(CI, Res); // Try to eliminate a cast of a cast. if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 07ad65c..fba1ccf 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1481,13 +1481,13 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp, return new ICmpInst(Pred, Y, ConstantInt::get(SrcTy, C.logBase2())); } - if (Cmp.isEquality() && Trunc->hasOneUse()) { + if (Cmp.isEquality() && (Trunc->hasOneUse() || Trunc->hasNoUnsignedWrap())) { // Canonicalize to a mask and wider compare if the wide type is suitable: // (trunc X to i8) == C --> (X & 0xff) == (zext C) if (!SrcTy->isVectorTy() && shouldChangeType(DstBits, SrcBits)) { Constant *Mask = ConstantInt::get(SrcTy, APInt::getLowBitsSet(SrcBits, DstBits)); - Value *And = Builder.CreateAnd(X, Mask); + Value *And = Trunc->hasNoUnsignedWrap() ? X : Builder.CreateAnd(X, Mask); Constant *WideC = ConstantInt::get(SrcTy, C.zext(SrcBits)); return new ICmpInst(Pred, And, WideC); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 09cb225..a8eb9b9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3757,6 +3757,10 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder, // (x < y) ? -1 : zext(x > y) // (x > y) ? 1 : sext(x != y) // (x > y) ? 1 : sext(x < y) +// (x == y) ? 0 : (x > y ? 1 : -1) +// (x == y) ? 0 : (x < y ? -1 : 1) +// Special case: x == C ? 0 : (x > C - 1 ? 1 : -1) +// Special case: x == C ? 0 : (x < C + 1 ? -1 : 1) // Into ucmp/scmp(x, y), where signedness is determined by the signedness // of the comparison in the original sequence. 
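The new cases documented above rest on a small arithmetic fact. A self-contained demonstration in plain C++ (not LLVM code) that x == C ? 0 : (x > C - 1 ? 1 : -1) agrees with a signed three-way compare whenever C - 1 does not wrap; the wrapping corner case is exactly what the CanSubOne/CanAddOne guards in the function body below exclude:

#include <cassert>

// The pattern the matcher looks for ...
static int referencePattern(int X, int C) {
  return X == C ? 0 : (X > C - 1 ? 1 : -1);
}
// ... and the scmp-style result it is folded into.
static int signedThreeWay(int X, int C) { return (X > C) - (X < C); }

int main() {
  for (int X = -4; X <= 4; ++X)
    for (int C = -4; C <= 4; ++C)
      assert(referencePattern(X, C) == signedThreeWay(X, C));
}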
Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { @@ -3849,6 +3853,44 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { } } + // Special cases with constants: x == C ? 0 : (x > C-1 ? 1 : -1) + if (Pred == ICmpInst::ICMP_EQ && match(TV, m_Zero())) { + const APInt *C; + if (match(RHS, m_APInt(C))) { + CmpPredicate InnerPred; + Value *InnerRHS; + const APInt *InnerTV, *InnerFV; + if (match(FV, + m_Select(m_ICmp(InnerPred, m_Specific(LHS), m_Value(InnerRHS)), + m_APInt(InnerTV), m_APInt(InnerFV)))) { + + // x == C ? 0 : (x > C-1 ? 1 : -1) + if (ICmpInst::isGT(InnerPred) && InnerTV->isOne() && + InnerFV->isAllOnes()) { + IsSigned = ICmpInst::isSigned(InnerPred); + bool CanSubOne = IsSigned ? !C->isMinSignedValue() : !C->isMinValue(); + if (CanSubOne) { + APInt Cminus1 = *C - 1; + if (match(InnerRHS, m_SpecificInt(Cminus1))) + Replace = true; + } + } + + // x == C ? 0 : (x < C+1 ? -1 : 1) + if (ICmpInst::isLT(InnerPred) && InnerTV->isAllOnes() && + InnerFV->isOne()) { + IsSigned = ICmpInst::isSigned(InnerPred); + bool CanAddOne = IsSigned ? !C->isMaxSignedValue() : !C->isMaxValue(); + if (CanAddOne) { + APInt Cplus1 = *C + 1; + if (match(InnerRHS, m_SpecificInt(Cplus1))) + Replace = true; + } + } + } + } + } + Intrinsic::ID IID = IsSigned ? Intrinsic::scmp : Intrinsic::ucmp; if (Replace) return replaceInstUsesWith( diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 511bca4..2646334 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -605,17 +605,16 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, return Mapping; } -namespace llvm { -void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, - bool IsKasan, uint64_t *ShadowBase, - int *MappingScale, bool *OrShadowOffset) { +void llvm::getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, + bool IsKasan, uint64_t *ShadowBase, + int *MappingScale, bool *OrShadowOffset) { auto Mapping = getShadowMapping(TargetTriple, LongSize, IsKasan); *ShadowBase = Mapping.Offset; *MappingScale = Mapping.Scale; *OrShadowOffset = Mapping.OrShadowOffset; } -void removeASanIncompatibleFnAttributes(Function &F, bool ReadsArgMem) { +void llvm::removeASanIncompatibleFnAttributes(Function &F, bool ReadsArgMem) { // Sanitizer checks read from shadow, which invalidates memory(argmem: *). // // This is not only true for sanitized functions, because AttrInfer can @@ -668,8 +667,6 @@ ASanAccessInfo::ASanAccessInfo(bool IsWrite, bool CompileKernel, AccessSizeIndex(AccessSizeIndex), IsWrite(IsWrite), CompileKernel(CompileKernel) {} -} // namespace llvm - static uint64_t getRedzoneSizeForScale(int MappingScale) { // Redzone used for stack and globals is at least 32 bytes. // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively. 
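The comment above only states the intended redzone sizes; the function body falls outside this hunk. A hedged sketch consistent with that description (an assumption, not the file's actual code): at least 32 bytes, doubling with the mapping scale so that scales 6 and 7 yield 64 and 128 bytes.

#include <algorithm>
#include <cstdint>

static uint64_t redzoneSizeForScaleSketch(int MappingScale) {
  return std::max<uint64_t>(32, uint64_t(1) << MappingScale);
}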
@@ -677,11 +674,10 @@ static uint64_t getRedzoneSizeForScale(int MappingScale) { } static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) { - if (TargetTriple.isOSEmscripten()) { + if (TargetTriple.isOSEmscripten()) return kAsanEmscriptenCtorAndDtorPriority; - } else { + else return kAsanCtorAndDtorPriority; - } } static Twine genName(StringRef suffix) { @@ -848,6 +844,7 @@ struct AddressSanitizer { bool maybeInsertAsanInitAtFunctionEntry(Function &F); bool maybeInsertDynamicShadowAtFunctionEntry(Function &F); void markEscapedLocalAllocas(Function &F); + void markCatchParametersAsUninteresting(Function &F); private: friend struct FunctionStackPoisoner; @@ -3001,6 +2998,22 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) { } } } +// Mitigation for https://github.com/google/sanitizers/issues/749 +// We don't instrument Windows catch-block parameters to avoid +// interfering with exception handling assumptions. +void AddressSanitizer::markCatchParametersAsUninteresting(Function &F) { + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (auto *CatchPad = dyn_cast<CatchPadInst>(&I)) { + // Mark the parameters to a catch-block as uninteresting to avoid + // instrumenting them. + for (Value *Operand : CatchPad->arg_operands()) + if (auto *AI = dyn_cast<AllocaInst>(Operand)) + ProcessedAllocas[AI] = false; + } + } + } +} bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) { bool ShouldInstrument = @@ -3045,6 +3058,9 @@ bool AddressSanitizer::instrumentFunction(Function &F, // can be passed to that intrinsic. markEscapedLocalAllocas(F); + if (TargetTriple.isOSWindows()) + markCatchParametersAsUninteresting(F); + // We want to instrument every address only once per basic block (unless there // are calls between uses). 
SmallPtrSet<Value *, 16> TempsToInstrument; diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 444b390..72e8e50 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -2092,8 +2092,6 @@ bool CHR::run() { return Changed; } -namespace llvm { - ControlHeightReductionPass::ControlHeightReductionPass() { parseCHRFilterFiles(); } @@ -2116,5 +2114,3 @@ PreservedAnalyses ControlHeightReductionPass::run( return PreservedAnalyses::all(); return PreservedAnalyses::none(); } - -} // namespace llvm diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index c327311..7ebcc21 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -53,6 +53,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" @@ -117,6 +118,10 @@ static cl::opt<bool> LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true), cl::desc("Predicate conditions in read only loops")); +static cl::opt<bool> LoopPredicationTraps( + "indvars-predicate-loop-traps", cl::Hidden, cl::init(true), + cl::desc("Predicate conditions that trap in loops with only local writes")); + static cl::opt<bool> AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true), cl::desc("Allow widening of indvars to eliminate s/zext")); @@ -1704,6 +1709,24 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { return Changed; } +static bool crashingBBWithoutEffect(const BasicBlock &BB) { + return llvm::all_of(BB, [](const Instruction &I) { + // TODO: for now this is overly restrictive, to make sure nothing in this + // BB can depend on the loop body. + // It's not enough to check for !I.mayHaveSideEffects(), because e.g. a + // load does not have a side effect, but we could have + // %a = load ptr, ptr %ptr + // %b = load i32, ptr %a + // Now if the loop stored a non-nullptr to %a, we could cause a nullptr + // dereference by skipping over loop iterations. + if (const auto *CB = dyn_cast<CallBase>(&I)) { + if (CB->onlyAccessesInaccessibleMemory()) + return true; + } + return isa<UnreachableInst>(I); + }); +} + bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { SmallVector<BasicBlock*, 16> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); @@ -1816,11 +1839,25 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { // suggestions on how to improve this? I can obviously bail out for outer // loops, but that seems less than ideal. MemorySSA can find memory writes, // is that enough for *all* side effects? + bool HasThreadLocalSideEffects = false; for (BasicBlock *BB : L->blocks()) for (auto &I : *BB) // TODO:isGuaranteedToTransfer - if (I.mayHaveSideEffects()) - return false; + if (I.mayHaveSideEffects()) { + if (!LoopPredicationTraps) + return false; + HasThreadLocalSideEffects = true; + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { + // Simple stores cannot be observed by other threads. + // If HasThreadLocalSideEffects is set, we check + // crashingBBWithoutEffect to make sure that the crashing BB cannot + // observe them either. 
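A source-level analogue of the case the IndVarSimplify change above carves out (purely illustrative, not taken from the patch): every store in the loop targets a local, and the exiting path only traps, so predicating the exit condition cannot make those stores visible to another thread.

// If p[i] is negative we trap before any further effect; otherwise the only
// side effect is a store to 'local', which no other thread can observe.
int sum_or_trap(const int *p, int n) {
  int local = 0;
  for (int i = 0; i < n; ++i) {
    if (p[i] < 0)
      __builtin_trap();
    local += p[i];
  }
  return local;
}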
+ if (!SI->isSimple()) + return false; + } else { + return false; + } + } bool Changed = false; // Finally, do the actual predication for all predicatable blocks. A couple @@ -1840,6 +1877,19 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); + if (HasThreadLocalSideEffects) { + const BasicBlock *Unreachable = nullptr; + for (const BasicBlock *Succ : BI->successors()) { + if (isa<UnreachableInst>(Succ->getTerminator())) + Unreachable = Succ; + } + // Exit BB which have one branch back into the loop and another one to + // a trap can still be optimized, because local side effects cannot + // be observed in the exit case (the trap). We could be smarter about + // this, but for now lets pattern match common cases that directly trap. + if (Unreachable == nullptr || !crashingBBWithoutEffect(*Unreachable)) + return Changed; + } Value *NewCond; if (ExitCount == ExactBTC) { NewCond = L->contains(BI->getSuccessor(0)) ? diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 20733032..19eccb9 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -368,7 +368,7 @@ private: Valid = false; } - bool reportInvalidCandidate(llvm::Statistic &Stat) const { + bool reportInvalidCandidate(Statistic &Stat) const { using namespace ore; assert(L && Preheader && "Fusion candidate not initialized properly!"); #if LLVM_ENABLE_STATS @@ -445,6 +445,7 @@ struct FusionCandidateCompare { "No dominance relationship between these fusion candidates!"); } }; +} // namespace using LoopVector = SmallVector<Loop *, 4>; @@ -461,9 +462,15 @@ using LoopVector = SmallVector<Loop *, 4>; using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; -#if !defined(NDEBUG) -static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const FusionCandidate &FC) { +#ifndef NDEBUG +static void printLoopVector(const LoopVector &LV) { + dbgs() << "****************************\n"; + for (const Loop *L : LV) + printLoop(*L, dbgs()); + dbgs() << "****************************\n"; +} + +static raw_ostream &operator<<(raw_ostream &OS, const FusionCandidate &FC) { if (FC.isValid()) OS << FC.Preheader->getName(); else @@ -472,8 +479,8 @@ static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, return OS; } -static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const FusionCandidateSet &CandSet) { +static raw_ostream &operator<<(raw_ostream &OS, + const FusionCandidateSet &CandSet) { for (const FusionCandidate &FC : CandSet) OS << FC << '\n'; @@ -489,7 +496,9 @@ printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { dbgs() << "****************************\n"; } } -#endif +#endif // NDEBUG + +namespace { /// Collect all loops in function at the same nest level, starting at the /// outermost level. @@ -550,15 +559,6 @@ private: LoopsOnLevelTy LoopsOnLevel; }; -#ifndef NDEBUG -static void printLoopVector(const LoopVector &LV) { - dbgs() << "****************************\n"; - for (auto *L : LV) - printLoop(*L, dbgs()); - dbgs() << "****************************\n"; -} -#endif - struct LoopFuser { private: // Sets of control flow equivalent fusion candidates for a given nest level. 
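The LoopFuse machinery above operates on "loops at the same nest level"; a hedged sketch of how such a level-by-level collection can be built from LoopInfo (collectLoopsByDepth is an invented name; the real collector class keeps more bookkeeping):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"

// Outermost loops first, then their children, and so on, mirroring the
// "same nest level" grouping that fusion candidates are drawn from.
static llvm::SmallVector<llvm::SmallVector<llvm::Loop *, 4>, 4>
collectLoopsByDepth(llvm::LoopInfo &LI) {
  llvm::SmallVector<llvm::SmallVector<llvm::Loop *, 4>, 4> Levels;
  llvm::SmallVector<llvm::Loop *, 4> Current(LI.begin(), LI.end());
  while (!Current.empty()) {
    Levels.push_back(Current);
    llvm::SmallVector<llvm::Loop *, 4> Next;
    for (llvm::Loop *L : Current)
      llvm::append_range(Next, L->getSubLoops());
    Current = std::move(Next);
  }
  return Levels;
}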
@@ -1850,7 +1850,7 @@ private: /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> template <typename RemarkKind> void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, - llvm::Statistic &Stat) { + Statistic &Stat) { assert(FC0.Preheader && FC1.Preheader && "Expecting valid fusion candidates"); using namespace ore; diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 32078b1..d827e64 100644 --- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -8,7 +8,6 @@ #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -16,8 +15,6 @@ using namespace llvm; -namespace llvm { - /// Explicitly specialize the pass manager's run method to handle loop nest /// structure updates. PreservedAnalyses @@ -185,7 +182,6 @@ LoopPassManager::runWithoutLoopNestPasses(Loop &L, LoopAnalysisManager &AM, } return PA; } -} // namespace llvm void FunctionToLoopPassAdaptor::printPipeline( raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { @@ -193,6 +189,7 @@ void FunctionToLoopPassAdaptor::printPipeline( Pass->printPipeline(OS, MapClassName2PassName); OS << ')'; } + PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, FunctionAnalysisManager &AM) { // Before we even compute any loop analyses, first run a miniature function @@ -219,9 +216,6 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, // Get the analysis results needed by loop passes. MemorySSA *MSSA = UseMemorySSA ? (&AM.getResult<MemorySSAAnalysis>(F).getMSSA()) : nullptr; - BlockFrequencyInfo *BFI = UseBlockFrequencyInfo && F.hasProfileData() - ? (&AM.getResult<BlockFrequencyAnalysis>(F)) - : nullptr; LoopStandardAnalysisResults LAR = {AM.getResult<AAManager>(F), AM.getResult<AssumptionAnalysis>(F), AM.getResult<DominatorTreeAnalysis>(F), @@ -229,7 +223,6 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, AM.getResult<ScalarEvolutionAnalysis>(F), AM.getResult<TargetLibraryAnalysis>(F), AM.getResult<TargetIRAnalysis>(F), - BFI, MSSA}; // Setup the loop analysis manager from its proxy. 
It is important that diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 448dc2b..f3e6cbf 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -540,8 +540,6 @@ bool LoopVersioningLICM::run(DominatorTree *DT) { return Changed; } -namespace llvm { - PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &LAR, LPMUpdater &U) { @@ -556,4 +554,3 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM, return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); } -} // namespace llvm diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 7cae94eb..3487e81 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -97,6 +97,12 @@ static cl::opt<MatrixLayoutTy> MatrixLayout( static cl::opt<bool> PrintAfterTransposeOpt("matrix-print-after-transpose-opt", cl::init(false)); +static cl::opt<unsigned> SplitMatmulRemainderOverThreshold( + "matrix-split-matmul-remainder-over-threshold", cl::Hidden, + cl::desc("Illegal remainder vectors over this size in bits should be split " + "in the inner loop of matmul"), + cl::init(0)); + /// Helper function to either return Scope, if it is a subprogram or the /// attached subprogram for a local scope. static DISubprogram *getSubprogram(DIScope *Scope) { @@ -115,18 +121,16 @@ static bool isSplat(Value *V) { /// Match any mul operation (fp or integer). template <typename LTy, typename RTy> -auto m_AnyMul(const LTy &L, const RTy &R) { +static auto m_AnyMul(const LTy &L, const RTy &R) { return m_CombineOr(m_Mul(L, R), m_FMul(L, R)); } /// Match any add operation (fp or integer). template <typename LTy, typename RTy> -auto m_AnyAdd(const LTy &L, const RTy &R) { +static auto m_AnyAdd(const LTy &L, const RTy &R) { return m_CombineOr(m_Add(L, R), m_FAdd(L, R)); } -namespace { - // Given an element pointer \p BasePtr to the start of a (sub) matrix, compute // the start address of vector \p VecIdx with type (\p EltType x \p NumElements) // assuming \p Stride elements between start two consecutive vectors. @@ -167,9 +171,9 @@ namespace { // v_2_0 |v_2_1 |v_2_2 |v_2_3 // v_3_0 {v_3_1 {v_3_2 v_3_3 // -Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride, - unsigned NumElements, Type *EltType, - IRBuilder<> &Builder) { +static Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride, + unsigned NumElements, Type *EltType, + IRBuilder<> &Builder) { assert((!isa<ConstantInt>(Stride) || cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) && @@ -338,6 +342,8 @@ computeShapeInfoForInst(Instruction *I, return std::nullopt; } +namespace { + /// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics. /// /// Currently, the lowering for each matrix intrinsic is done as follows: @@ -371,7 +377,8 @@ class LowerMatrixIntrinsics { LoopInfo *LI = nullptr; OptimizationRemarkEmitter *ORE = nullptr; - /// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation. + /// Contains estimates of the number of operations (loads, stores, compute) + /// required to lower a matrix operation. struct OpInfoTy { /// Number of stores emitted to generate this matrix. 
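The m_AnyMul/m_AnyAdd helpers made static earlier in this file's diff are PatternMatch combinators; a small usage sketch (assumes those helpers and llvm/IR/PatternMatch.h are in scope, as they are inside LowerMatrixIntrinsics.cpp; matchMulAdd is an invented name):

// Matches V against (A * B) + Acc where both operations may be integer or FP.
static bool matchMulAdd(llvm::Value *V, llvm::Value *&A, llvm::Value *&B,
                        llvm::Value *&Acc) {
  using namespace llvm::PatternMatch;
  return match(V, m_AnyAdd(m_AnyMul(m_Value(A), m_Value(B)), m_Value(Acc)));
}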
unsigned NumStores = 0; @@ -1719,6 +1726,31 @@ public: ToRemove.push_back(MatMul); } + /// Given \p Remainder iterations of the the matmul inner loop, + /// potentially lower \p Blocksize that is used for the underlying + /// vector. + unsigned capBlockSize(unsigned BlockSize, unsigned Remainder, Type *EltType) { + if (BlockSize <= Remainder) + return BlockSize; + + // If the remainder is also a legal type just use it. + auto *VecTy = FixedVectorType::get(EltType, Remainder); + if (TTI.isTypeLegal(VecTy)) + return Remainder; + + // Similarly, if the vector is small enough that we don't want + // to split further. + if (VecTy->getPrimitiveSizeInBits() <= SplitMatmulRemainderOverThreshold) + return Remainder; + + // Gradually lower the vectorization factor to cover the + // remainder. + do { + BlockSize /= 2; + } while (BlockSize > Remainder); + return BlockSize; + } + /// Compute \p Result += \p A * \p B for input matrices with left-associating /// addition. /// @@ -1756,10 +1788,8 @@ public: bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J)); for (unsigned I = 0; I < R; I += BlockSize) { - // Gradually lower the vectorization factor to cover the remainder. - while (I + BlockSize > R) - BlockSize /= 2; - + // Lower block size to make sure we stay within bounds. + BlockSize = capBlockSize(BlockSize, R - I, Result.getElementType()); Value *Sum = IsTiled ? Result.extractVector(I, J, BlockSize, Builder) : nullptr; for (unsigned K = 0; K < M; ++K) { @@ -1784,9 +1814,8 @@ public: unsigned BlockSize = VF; bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I)); for (unsigned J = 0; J < C; J += BlockSize) { - // Gradually lower the vectorization factor to cover the remainder. - while (J + BlockSize > C) - BlockSize /= 2; + // Lower the vectorization factor to cover the remainder. + BlockSize = capBlockSize(BlockSize, C - J, Result.getElementType()); Value *Sum = nullptr; for (unsigned K = 0; K < M; ++K) { diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 80aa98d..5a8f18a 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -160,9 +160,6 @@ static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true), //===----------------------------------------------------------------------===// // Anchor methods. 
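The "Anchor methods." comment above names a general C++ idiom rather than anything GVN-specific; a generic sketch with invented names: define one out-of-line virtual member per polymorphic class so its vtable and type info are emitted in a single translation unit instead of weakly everywhere the header is included.

// Widget.h
struct Widget {
  virtual ~Widget();              // declared, deliberately not defined inline
  virtual int value() const { return 0; }
};

// Widget.cpp
Widget::~Widget() = default;      // the anchor: pins the vtable to this TU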
-namespace llvm { -namespace GVNExpression { - Expression::~Expression() = default; BasicExpression::~BasicExpression() = default; CallExpression::~CallExpression() = default; @@ -171,9 +168,6 @@ StoreExpression::~StoreExpression() = default; AggregateValueExpression::~AggregateValueExpression() = default; PHIExpression::~PHIExpression() = default; -} // end namespace GVNExpression -} // end namespace llvm - namespace { // Tarjan's SCC finding algorithm with Nuutila's improvements diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index ba58b8e..6d7ce36 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -2623,32 +2623,32 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { namespace { - class ReassociateLegacyPass : public FunctionPass { - ReassociatePass Impl; +class ReassociateLegacyPass : public FunctionPass { + ReassociatePass Impl; - public: - static char ID; // Pass identification, replacement for typeid +public: + static char ID; // Pass identification, replacement for typeid - ReassociateLegacyPass() : FunctionPass(ID) { - initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry()); - } + ReassociateLegacyPass() : FunctionPass(ID) { + initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; - FunctionAnalysisManager DummyFAM; - auto PA = Impl.run(F, DummyFAM); - return !PA.areAllPreserved(); - } + FunctionAnalysisManager DummyFAM; + auto PA = Impl.run(F, DummyFAM); + return !PA.areAllPreserved(); + } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } - }; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; } // end anonymous namespace diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 30b27cb..7646624 100644 --- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -107,9 +107,7 @@ PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) { return PA; } -namespace llvm { - -void initializeRegToMemWrapperPassPass(PassRegistry &); +namespace { class RegToMemWrapperPass : public FunctionPass { public: @@ -136,7 +134,7 @@ public: return N != 0 || Changed; } }; -} // namespace llvm +} // namespace INITIALIZE_PASS_BEGIN(RegToMemWrapperPass, "reg2mem", "", true, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index a692009..5c60fad 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -344,6 +344,12 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, uint64_t SliceSizeInBits, Instruction *OldInst, Instruction *Inst, Value *Dest, Value *Value, const DataLayout &DL) { + // If we want allocas to be migrated using this helper then we need to ensure + // that the BaseFragments map code still works. 
A simple solution would be + // to choose to always clone alloca dbg_assigns (rather than sometimes + // "stealing" them). + assert(!isa<AllocaInst>(Inst) && "Unexpected alloca"); + auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst); // Nothing to do if OldInst has no linked dbg.assign intrinsics. if (DVRAssignMarkerRange.empty()) @@ -429,11 +435,22 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID); } - ::Value *NewValue = Value ? Value : DbgAssign->getValue(); - DbgVariableRecord *NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>( - DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr, - Dest, DIExpression::get(Expr->getContext(), {}), - DbgAssign->getDebugLoc()))); + DbgVariableRecord *NewAssign; + if (IsSplit) { + ::Value *NewValue = Value ? Value : DbgAssign->getValue(); + NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>( + DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr, + Dest, DIExpression::get(Expr->getContext(), {}), + DbgAssign->getDebugLoc()))); + } else { + // The store is not split, simply steal the existing dbg_assign. + NewAssign = DbgAssign; + NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs? + NewAssign->setAddress(Dest); + if (Value) + NewAssign->replaceVariableLocationOp(0u, Value); + assert(Expr == NewAssign->getExpression()); + } // If we've updated the value but the original dbg.assign has an arglist // then kill it now - we can't use the requested new value. @@ -464,9 +481,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, // noted as slightly offset (in code) from the store. In practice this // should have little effect on the debugging experience due to the fact // that all the split stores should get the same line number. - NewAssign->moveBefore(DbgAssign->getIterator()); - - NewAssign->setDebugLoc(DbgAssign->getDebugLoc()); + if (NewAssign != DbgAssign) { + NewAssign->moveBefore(DbgAssign->getIterator()); + NewAssign->setDebugLoc(DbgAssign->getDebugLoc()); + } LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n"); }; diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index aae5d60..25a531c 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -50,9 +50,7 @@ using namespace llvm; #define DEBUG_TYPE "scalarizer" -namespace { - -BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) { +static BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) { BasicBlock *BB = Itr->getParent(); if (isa<PHINode>(Itr)) Itr = BB->getFirstInsertionPt(); @@ -76,6 +74,8 @@ using ScatterMap = std::map<std::pair<Value *, Type *>, ValueVector>; // along with a pointer to their scattered forms. using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>; +namespace { + struct VectorSplit { // The type of the vector. FixedVectorType *VecTy = nullptr; @@ -196,6 +196,7 @@ struct VectorLayout { // The size of each (non-remainder) fragment in bytes. 
uint64_t SplitSize = 0; }; +} // namespace static bool isStructOfMatchingFixedVectors(Type *Ty) { if (!isa<StructType>(Ty)) @@ -268,6 +269,7 @@ static Value *concatenate(IRBuilder<> &Builder, ArrayRef<Value *> Fragments, return Res; } +namespace { class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> { public: ScalarizerVisitor(DominatorTree *DT, const TargetTransformInfo *TTI, diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index e4ba70d..5af6c96 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -3611,8 +3610,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, AAResults &AA, TargetTransformInfo &TTI, bool Trivial, bool NonTrivial, ScalarEvolution *SE, - MemorySSAUpdater *MSSAU, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI, LPMUpdater &LoopUpdater) { + MemorySSAUpdater *MSSAU, LPMUpdater &LoopUpdater) { assert(L.isRecursivelyLCSSAForm(DT, LI) && "Loops must be in LCSSA form before unswitching."); @@ -3652,35 +3650,6 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, if (F->hasOptSize()) return false; - // Returns true if Loop L's loop nest is cold, i.e. if the headers of L, - // of the loops L is nested in, and of the loops nested in L are all cold. - auto IsLoopNestCold = [&](const Loop *L) { - // Check L and all of its parent loops. - auto *Parent = L; - while (Parent) { - if (!PSI->isColdBlock(Parent->getHeader(), BFI)) - return false; - Parent = Parent->getParentLoop(); - } - // Next check all loops nested within L. - SmallVector<const Loop *, 4> Worklist; - llvm::append_range(Worklist, L->getSubLoops()); - while (!Worklist.empty()) { - auto *CurLoop = Worklist.pop_back_val(); - if (!PSI->isColdBlock(CurLoop->getHeader(), BFI)) - return false; - llvm::append_range(Worklist, CurLoop->getSubLoops()); - } - return true; - }; - - // Skip cold loops in cold loop nests, as unswitching them brings little - // benefit but increases the code size - if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) { - LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n"); - return false; - } - // Perform legality checks. if (!isSafeForNoNTrivialUnswitching(L, LI)) return false; @@ -3705,11 +3674,6 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, LPMUpdater &U) { Function &F = *L.getHeader()->getParent(); (void)F; - ProfileSummaryInfo *PSI = nullptr; - if (auto OuterProxy = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR) - .getCachedResult<ModuleAnalysisManagerFunctionProxy>(F)) - PSI = OuterProxy->getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n"); @@ -3720,7 +3684,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); } if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial, - &AR.SE, MSSAU ? &*MSSAU : nullptr, PSI, AR.BFI, U)) + &AR.SE, MSSAU ? 
&*MSSAU : nullptr, U)) return PreservedAnalyses::all(); if (AR.MSSA && VerifyMemorySSA) diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index ebcbd2b..fa66a03 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -149,8 +149,6 @@ bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) { return Impl.runImpl(F, TTI); } -namespace llvm { - bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) { if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence(&F)) { LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because " @@ -328,11 +326,11 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( return true; } -FunctionPass *createSpeculativeExecutionPass() { +FunctionPass *llvm::createSpeculativeExecutionPass() { return new SpeculativeExecutionLegacyPass(); } -FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() { +FunctionPass *llvm::createSpeculativeExecutionIfHasBranchDivergencePass() { return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true); } @@ -362,4 +360,3 @@ void SpeculativeExecutionPass::printPipeline( OS << "only-if-divergent-target"; OS << '>'; } -} // namespace llvm diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 7d01709..e94ad19 100644 --- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -716,8 +716,6 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) { return Ret; } -namespace llvm { - PreservedAnalyses StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) { const DataLayout *DL = &F.getDataLayout(); @@ -735,5 +733,3 @@ StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) { PA.preserve<TargetIRAnalysis>(); return PA; } - -} // namespace llvm diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 1d83ddc..89d41f3e 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -192,7 +192,7 @@ struct AllocaDerivedValueTracker { SmallPtrSet<Instruction *, 32> AllocaUsers; SmallPtrSet<Instruction *, 32> EscapePoints; }; -} +} // namespace static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) { if (F.callsFunctionThatReturnsTwice()) @@ -967,7 +967,7 @@ struct TailCallElim : public FunctionPass { /*BFI=*/nullptr); } }; -} +} // namespace char TailCallElim::ID = 0; INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination", diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 9693ae6..4947d03 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" @@ -634,18 +635,10 @@ private: /// Merge \p MergeWithV into \p IV and push \p V to the worklist, if \p IV /// changes. 
bool mergeInValue(ValueLatticeElement &IV, Value *V, - ValueLatticeElement MergeWithV, + const ValueLatticeElement &MergeWithV, ValueLatticeElement::MergeOptions Opts = { /*MayIncludeUndef=*/false, /*CheckWiden=*/false}); - bool mergeInValue(Value *V, ValueLatticeElement MergeWithV, - ValueLatticeElement::MergeOptions Opts = { - /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) { - assert(!V->getType()->isStructTy() && - "non-structs should use markConstant"); - return mergeInValue(ValueState[V], V, MergeWithV, Opts); - } - /// getValueState - Return the ValueLatticeElement object that corresponds to /// the value. This function handles the case when the value hasn't been seen /// yet by properly seeding constants etc. @@ -768,6 +761,7 @@ private: void handleCallArguments(CallBase &CB); void handleExtractOfWithOverflow(ExtractValueInst &EVI, const WithOverflowInst *WO, unsigned Idx); + bool isInstFullyOverDefined(Instruction &Inst); private: friend class InstVisitor<SCCPInstVisitor>; @@ -987,7 +981,7 @@ public: void trackValueOfArgument(Argument *A) { if (A->getType()->isStructTy()) return (void)markOverdefined(A); - mergeInValue(A, getArgAttributeVL(A)); + mergeInValue(ValueState[A], A, getArgAttributeVL(A)); } bool isStructLatticeConstant(Function *F, StructType *STy); @@ -1128,8 +1122,7 @@ bool SCCPInstVisitor::isStructLatticeConstant(Function *F, StructType *STy) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i)); assert(It != TrackedMultipleRetVals.end()); - ValueLatticeElement LV = It->second; - if (!SCCPSolver::isConstant(LV)) + if (!SCCPSolver::isConstant(It->second)) return false; } return true; @@ -1160,7 +1153,7 @@ Constant *SCCPInstVisitor::getConstantOrNull(Value *V) const { std::vector<Constant *> ConstVals; auto *ST = cast<StructType>(V->getType()); for (unsigned I = 0, E = ST->getNumElements(); I != E; ++I) { - ValueLatticeElement LV = LVs[I]; + const ValueLatticeElement &LV = LVs[I]; ConstVals.push_back(SCCPSolver::isConstant(LV) ? getConstant(LV, ST->getElementType(I)) : UndefValue::get(ST->getElementType(I))); @@ -1225,7 +1218,7 @@ void SCCPInstVisitor::visitInstruction(Instruction &I) { } bool SCCPInstVisitor::mergeInValue(ValueLatticeElement &IV, Value *V, - ValueLatticeElement MergeWithV, + const ValueLatticeElement &MergeWithV, ValueLatticeElement::MergeOptions Opts) { if (IV.mergeIn(MergeWithV, Opts)) { pushUsersToWorkList(V); @@ -1264,7 +1257,7 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI, return; } - ValueLatticeElement BCValue = getValueState(BI->getCondition()); + const ValueLatticeElement &BCValue = getValueState(BI->getCondition()); ConstantInt *CI = getConstantInt(BCValue, BI->getCondition()->getType()); if (!CI) { // Overdefined condition variables, and branches on unfoldable constant @@ -1326,7 +1319,7 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI, // the target as executable. if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) { // Casts are folded by visitCastInst. - ValueLatticeElement IBRValue = getValueState(IBR->getAddress()); + const ValueLatticeElement &IBRValue = getValueState(IBR->getAddress()); BlockAddress *Addr = dyn_cast_or_null<BlockAddress>( getConstant(IBRValue, IBR->getAddress()->getType())); if (!Addr) { // Overdefined or unknown condition? @@ -1383,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const { // 7. 
If a conditional branch has a value that is overdefined, make all // successors executable. void SCCPInstVisitor::visitPHINode(PHINode &PN) { - // If this PN returns a struct, just mark the result overdefined. - // TODO: We could do a lot better than this if code actually uses this. - if (PN.getType()->isStructTy()) - return (void)markOverdefined(&PN); - - if (getValueState(&PN).isOverdefined()) - return; // Quick exit - // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, // and slow us down a lot. Just mark them overdefined. if (PN.getNumIncomingValues() > 64) return (void)markOverdefined(&PN); - unsigned NumActiveIncoming = 0; + if (isInstFullyOverDefined(PN)) + return; + SmallVector<unsigned> FeasibleIncomingIndices; + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { + if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) + continue; + FeasibleIncomingIndices.push_back(i); + } // Look at all of the executable operands of the PHI node. If any of them // are overdefined, the PHI becomes overdefined as well. If they are all // constant, and they agree with each other, the PHI becomes the identical // constant. If they are constant and don't agree, the PHI is a constant // range. If there are no executable operands, the PHI remains unknown. - ValueLatticeElement PhiState = getValueState(&PN); - for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { - if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) - continue; - - ValueLatticeElement IV = getValueState(PN.getIncomingValue(i)); - PhiState.mergeIn(IV); - NumActiveIncoming++; - if (PhiState.isOverdefined()) - break; + if (StructType *STy = dyn_cast<StructType>(PN.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + ValueLatticeElement PhiState = getStructValueState(&PN, i); + if (PhiState.isOverdefined()) + continue; + for (unsigned j : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = + getStructValueState(PN.getIncomingValue(j), i); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i); + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); + } + } else { + ValueLatticeElement PhiState = getValueState(&PN); + for (unsigned i : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i)); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + // We allow up to 1 range extension per active incoming value and one + // additional extension. Note that we manually adjust the number of range + // extensions to match the number of active incoming values. This helps to + // limit multiple extensions caused by the same incoming value, if other + // incoming values are equal. + ValueLatticeElement &PhiStateRef = ValueState[&PN]; + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); } - - // We allow up to 1 range extension per active incoming value and one - // additional extension. 
Note that we manually adjust the number of range - // extensions to match the number of active incoming values. This helps to - // limit multiple extensions caused by the same incoming value, if other - // incoming values are equal. - mergeInValue(&PN, PhiState, - ValueLatticeElement::MergeOptions().setMaxWidenSteps( - NumActiveIncoming + 1)); - ValueLatticeElement &PhiStateRef = getValueState(&PN); - PhiStateRef.setNumRangeExtensions( - std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions())); } void SCCPInstVisitor::visitReturnInst(ReturnInst &I) { @@ -1481,7 +1491,7 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { } } - ValueLatticeElement OpSt = getValueState(I.getOperand(0)); + const ValueLatticeElement &OpSt = getValueState(I.getOperand(0)); if (OpSt.isUnknownOrUndef()) return; @@ -1496,9 +1506,9 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { if (I.getDestTy()->isIntOrIntVectorTy() && I.getSrcTy()->isIntOrIntVectorTy() && I.getOpcode() != Instruction::BitCast) { - auto &LV = getValueState(&I); ConstantRange OpRange = OpSt.asConstantRange(I.getSrcTy(), /*UndefAllowed=*/false); + auto &LV = getValueState(&I); Type *DestTy = I.getDestTy(); ConstantRange Res = ConstantRange::getEmpty(DestTy->getScalarSizeInBits()); @@ -1516,19 +1526,24 @@ void SCCPInstVisitor::handleExtractOfWithOverflow(ExtractValueInst &EVI, const WithOverflowInst *WO, unsigned Idx) { Value *LHS = WO->getLHS(), *RHS = WO->getRHS(); - ValueLatticeElement L = getValueState(LHS); - ValueLatticeElement R = getValueState(RHS); + Type *Ty = LHS->getType(); + addAdditionalUser(LHS, &EVI); addAdditionalUser(RHS, &EVI); - if (L.isUnknownOrUndef() || R.isUnknownOrUndef()) - return; // Wait to resolve. - Type *Ty = LHS->getType(); + const ValueLatticeElement &L = getValueState(LHS); + if (L.isUnknownOrUndef()) + return; // Wait to resolve. ConstantRange LR = L.asConstantRange(Ty, /*UndefAllowed=*/false); + + const ValueLatticeElement &R = getValueState(RHS); + if (R.isUnknownOrUndef()) + return; // Wait to resolve. + ConstantRange RR = R.asConstantRange(Ty, /*UndefAllowed=*/false); if (Idx == 0) { ConstantRange Res = LR.binaryOp(WO->getBinaryOp(), RR); - mergeInValue(&EVI, ValueLatticeElement::getRange(Res)); + mergeInValue(ValueState[&EVI], &EVI, ValueLatticeElement::getRange(Res)); } else { assert(Idx == 1 && "Index can only be 0 or 1"); ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion( @@ -1560,7 +1575,7 @@ void SCCPInstVisitor::visitExtractValueInst(ExtractValueInst &EVI) { if (auto *WO = dyn_cast<WithOverflowInst>(AggVal)) return handleExtractOfWithOverflow(EVI, WO, i); ValueLatticeElement EltVal = getStructValueState(AggVal, i); - mergeInValue(getValueState(&EVI), &EVI, EltVal); + mergeInValue(ValueState[&EVI], &EVI, EltVal); } else { // Otherwise, must be extracting from an array. return (void)markOverdefined(&EVI); @@ -1616,14 +1631,18 @@ void SCCPInstVisitor::visitSelectInst(SelectInst &I) { if (ValueState[&I].isOverdefined()) return (void)markOverdefined(&I); - ValueLatticeElement CondValue = getValueState(I.getCondition()); + const ValueLatticeElement &CondValue = getValueState(I.getCondition()); if (CondValue.isUnknownOrUndef()) return; if (ConstantInt *CondCB = getConstantInt(CondValue, I.getCondition()->getType())) { Value *OpVal = CondCB->isZero() ? 
I.getFalseValue() : I.getTrueValue(); - mergeInValue(&I, getValueState(OpVal)); + const ValueLatticeElement &OpValState = getValueState(OpVal); + // Safety: ValueState[&I] doesn't invalidate OpValState since it is already + // in the map. + assert(ValueState.contains(&I) && "&I is not in ValueState map."); + mergeInValue(ValueState[&I], &I, OpValState); return; } @@ -1721,7 +1740,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) { // being a special floating value. ValueLatticeElement NewV; NewV.markConstant(C, /*MayIncludeUndef=*/true); - return (void)mergeInValue(&I, NewV); + return (void)mergeInValue(ValueState[&I], &I, NewV); } } @@ -1741,7 +1760,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) { R = A.overflowingBinaryOp(BO->getOpcode(), B, OBO->getNoWrapKind()); else R = A.binaryOp(BO->getOpcode(), B); - mergeInValue(&I, ValueLatticeElement::getRange(R)); + mergeInValue(ValueState[&I], &I, ValueLatticeElement::getRange(R)); // TODO: Currently we do not exploit special values that produce something // better than overdefined with an overdefined operand for vector or floating @@ -1767,7 +1786,7 @@ void SCCPInstVisitor::visitCmpInst(CmpInst &I) { if (C) { ValueLatticeElement CV; CV.markConstant(C); - mergeInValue(&I, CV); + mergeInValue(ValueState[&I], &I, CV); return; } @@ -1802,7 +1821,7 @@ void SCCPInstVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { Operands.reserve(I.getNumOperands()); for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) { - ValueLatticeElement State = getValueState(I.getOperand(i)); + const ValueLatticeElement &State = getValueState(I.getOperand(i)); if (State.isUnknownOrUndef()) return; // Operands are not resolved yet. @@ -1881,14 +1900,13 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) { if (ValueState[&I].isOverdefined()) return (void)markOverdefined(&I); - ValueLatticeElement PtrVal = getValueState(I.getOperand(0)); + const ValueLatticeElement &PtrVal = getValueState(I.getOperand(0)); if (PtrVal.isUnknownOrUndef()) return; // The pointer is not resolved yet! - ValueLatticeElement &IV = ValueState[&I]; - if (SCCPSolver::isConstant(PtrVal)) { Constant *Ptr = getConstant(PtrVal, I.getOperand(0)->getType()); + ValueLatticeElement &IV = ValueState[&I]; // load null is undefined. if (isa<ConstantPointerNull>(Ptr)) { @@ -1916,7 +1934,7 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) { } // Fall back to metadata. - mergeInValue(&I, getValueFromMetadata(&I)); + mergeInValue(ValueState[&I], &I, getValueFromMetadata(&I)); } void SCCPInstVisitor::visitCallBase(CallBase &CB) { @@ -1944,7 +1962,7 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) { return markOverdefined(&CB); // Can't handle struct args. if (A.get()->getType()->isMetadataTy()) continue; // Carried in CB, not allowed in Operands. - ValueLatticeElement State = getValueState(A); + const ValueLatticeElement &State = getValueState(A); if (State.isUnknownOrUndef()) return; // Operands are not resolved yet. @@ -1964,7 +1982,7 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) { } // Fall back to metadata. 
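Several of the SCCP hunks above and below reorder lookups so that a reference into ValueState is taken only after the operand states have been read (see the "Safety:" note in visitSelectInst and the moved getValueState(&I) in visitCastInst). A generic illustration of the underlying hazard, with invented names:

#include "llvm/ADT/DenseMap.h"

// Copy the operand's state before indexing for the result: operator[] may
// insert a new entry and rehash, invalidating references taken earlier.
static int mergeResult(llvm::DenseMap<int, int> &State, int ResultKey,
                       int OpKey) {
  int OpState = State.lookup(OpKey); // value copy, immune to rehashing
  int &Result = State[ResultKey];    // may insert/rehash; OpState unaffected
  Result += OpState;
  return Result;
}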
- mergeInValue(&CB, getValueFromMetadata(&CB)); + mergeInValue(ValueState[&CB], &CB, getValueFromMetadata(&CB)); } void SCCPInstVisitor::handleCallArguments(CallBase &CB) { @@ -1992,10 +2010,11 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) { mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg, getMaxWidenStepsOpts()); } - } else - mergeInValue(&*AI, - getValueState(*CAI).intersect(getArgAttributeVL(&*AI)), - getMaxWidenStepsOpts()); + } else { + ValueLatticeElement CallArg = + getValueState(*CAI).intersect(getArgAttributeVL(&*AI)); + mergeInValue(ValueState[&*AI], &*AI, CallArg, getMaxWidenStepsOpts()); + } } } } @@ -2076,7 +2095,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { if (II->getIntrinsicID() == Intrinsic::vscale) { unsigned BitWidth = CB.getType()->getScalarSizeInBits(); const ConstantRange Result = getVScaleRange(II->getFunction(), BitWidth); - return (void)mergeInValue(II, ValueLatticeElement::getRange(Result)); + return (void)mergeInValue(ValueState[II], II, + ValueLatticeElement::getRange(Result)); } if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) { @@ -2094,7 +2114,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { ConstantRange Result = ConstantRange::intrinsic(II->getIntrinsicID(), OpRanges); - return (void)mergeInValue(II, ValueLatticeElement::getRange(Result)); + return (void)mergeInValue(ValueState[II], II, + ValueLatticeElement::getRange(Result)); } } @@ -2121,10 +2142,25 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { return handleCallOverdefined(CB); // Not tracking this callee. // If so, propagate the return value of the callee into this call result. - mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts()); + mergeInValue(ValueState[&CB], &CB, TFRVI->second, getMaxWidenStepsOpts()); } } +bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) { + // For structure Type, we handle each member separately. + // A structure object won't be considered as overdefined when + // there is at least one member that is not overdefined. + if (StructType *STy = dyn_cast<StructType>(Inst.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) { + if (!getStructValueState(&Inst, i).isOverdefined()) + return false; + } + return true; + } + + return getValueState(&Inst).isOverdefined(); +} + void SCCPInstVisitor::solve() { // Process the work lists until they are empty! while (!BBWorkList.empty() || !InstWorkList.empty()) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 62a81ba..280eb20 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7957,9 +7957,9 @@ bool VPRecipeBuilder::getScaledReductions( auto CollectExtInfo = [this, &Exts, &ExtOpTypes, &ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool { for (const auto &[I, OpI] : enumerate(Ops)) { - auto *CI = dyn_cast<ConstantInt>(OpI); - if (I > 0 && CI && - canConstantBeExtended(CI, ExtOpTypes[0], ExtKinds[0])) { + const APInt *C; + if (I > 0 && match(OpI, m_APInt(C)) && + canConstantBeExtended(C, ExtOpTypes[0], ExtKinds[0])) { ExtOpTypes[I] = ExtOpTypes[0]; ExtKinds[I] = ExtKinds[0]; continue; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 88af2cf..9cd52da 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2242,8 +2242,49 @@ public: /// may not be necessary. 
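On the LoopVectorize hunk above: switching from dyn_cast<ConstantInt> to m_APInt lets the same check accept splat vector constants as well as scalars. A minimal sketch of that matching style (the helper name is invented):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// Binds C for a scalar ConstantInt or a splat vector constant, so i32 7 and
// <4 x i32> splat (i32 7) take the same path.
static bool getScalarOrSplatConstant(llvm::Value *V, llvm::APInt &Out) {
  using namespace llvm::PatternMatch;
  const llvm::APInt *C;
  if (!match(V, m_APInt(C)))
    return false;
  Out = *C;
  return true;
}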
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
   bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
-                     Align Alignment, const int64_t Diff, Value *Ptr0,
-                     Value *PtrN, StridedPtrInfo &SPtrInfo) const;
+                     Align Alignment, const int64_t Diff,
+                     const size_t Sz) const;
+
+  /// Return true if an array of scalar loads can be replaced with a strided
+  /// load (with constant stride).
+  ///
+  /// TODO:
+  /// It is possible that the load gets "widened". Suppose that originally each
+  /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
+  /// constant):
+  ///   %b + 0 * %s + 0
+  ///   %b + 0 * %s + 1
+  ///   %b + 0 * %s + 2
+  ///   ...
+  ///   %b + 0 * %s + (w - 1)
+  ///
+  ///   %b + 1 * %s + 0
+  ///   %b + 1 * %s + 1
+  ///   %b + 1 * %s + 2
+  ///   ...
+  ///   %b + 1 * %s + (w - 1)
+  ///   ...
+  ///
+  ///   %b + (n - 1) * %s + 0
+  ///   %b + (n - 1) * %s + 1
+  ///   %b + (n - 1) * %s + 2
+  ///   ...
+  ///   %b + (n - 1) * %s + (w - 1)
+  ///
+  /// In this case we will generate a strided load of type `<n x (k * w)>`.
+  ///
+  /// \param PointerOps list of pointer arguments of loads.
+  /// \param ElemTy original scalar type of loads.
+  /// \param Alignment alignment of the first load.
+  /// \param SortedIndices is the order of PointerOps as returned by
+  /// `sortPtrAccesses`.
+  /// \param Diff Pointer difference between the lowest and the highest pointer
+  /// in `PointerOps` as returned by `getPointersDiff`.
+  /// \param Ptr0 first pointer in `PointerOps`.
+  /// \param PtrN last pointer in `PointerOps`.
+  /// \param SPtrInfo If the function returns `true`, it also sets all the
+  /// fields of `SPtrInfo` necessary to generate the strided load later.
+  bool analyzeConstantStrideCandidate(
+      const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
+      const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
+      Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
   /// Return true if an array of scalar loads can be replaced with a strided
   /// load (with run-time stride).
@@ -6849,9 +6890,8 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 /// current graph (for masked gathers extra extractelement instructions
 /// might be required).
 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
-                            Align Alignment, const int64_t Diff, Value *Ptr0,
-                            Value *PtrN, StridedPtrInfo &SPtrInfo) const {
-  const size_t Sz = PointerOps.size();
+                            Align Alignment, const int64_t Diff,
+                            const size_t Sz) const {
   if (Diff % (Sz - 1) != 0)
     return false;
@@ -6875,27 +6915,40 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
       return false;
     if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
       return false;
+    return true;
+  }
+  return false;
+}
-    // Iterate through all pointers and check if all distances are
-    // unique multiple of Dist.
-    SmallSet<int64_t, 4> Dists;
-    for (Value *Ptr : PointerOps) {
-      int64_t Dist = 0;
-      if (Ptr == PtrN)
-        Dist = Diff;
-      else if (Ptr != Ptr0)
-        Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
-      // If the strides are not the same or repeated, we can't
-      // vectorize.
-      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
-        break;
-    }
-    if (Dists.size() == Sz) {
-      Type *StrideTy = DL->getIndexType(Ptr0->getType());
-      SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
-      SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
-      return true;
-    }
+bool BoUpSLP::analyzeConstantStrideCandidate(
+    const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
+    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
+    Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
+  const size_t Sz = PointerOps.size();
+  if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
+    return false;
+
+  int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
+
+  // Iterate through all pointers and check if all distances are
+  // unique multiple of Dist.
+  SmallSet<int64_t, 4> Dists;
+  for (Value *Ptr : PointerOps) {
+    int64_t Dist = 0;
+    if (Ptr == PtrN)
+      Dist = Diff;
+    else if (Ptr != Ptr0)
+      Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
+    // If the strides are not the same or repeated, we can't
+    // vectorize.
+    if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+      break;
+  }
+  if (Dists.size() == Sz) {
+    Type *StrideTy = DL->getIndexType(Ptr0->getType());
+    SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
+    SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
+    return true;
   }
   return false;
 }
@@ -6995,8 +7048,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
         Align Alignment =
             cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
                 ->getAlign();
-        if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
-                          SPtrInfo))
+        if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment,
+                                           Order, *Diff, Ptr0, PtrN, SPtrInfo))
           return LoadsState::StridedVectorize;
       }
       if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -17632,7 +17685,9 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
   }
   if (IsPHI ||
       (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
-       E->doesNotNeedToSchedule()) ||
+       (E->doesNotNeedToSchedule() ||
+        (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
+         isUsedOutsideBlock(LastInst)))) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 0101942..d167009 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1753,14 +1753,14 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
 }
 #endif
-bool llvm::canConstantBeExtended(const ConstantInt *CI, Type *NarrowType,
+bool llvm::canConstantBeExtended(const APInt *C, Type *NarrowType,
                                  TTI::PartialReductionExtendKind ExtKind) {
-  APInt TruncatedVal = CI->getValue().trunc(NarrowType->getScalarSizeInBits());
-  unsigned WideSize = CI->getType()->getScalarSizeInBits();
+  APInt TruncatedVal = C->trunc(NarrowType->getScalarSizeInBits());
+  unsigned WideSize = C->getBitWidth();
   APInt ExtendedVal = ExtKind == TTI::PR_SignExtend ?
                          TruncatedVal.sext(WideSize) : TruncatedVal.zext(WideSize);
-  return ExtendedVal == CI->getValue();
+  return ExtendedVal == *C;
 }
 TargetTransformInfo::OperandValueInfo
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0e0b042..84d2ea6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -407,6 +407,10 @@ public:
   VPBasicBlock *getParent() { return Parent; }
   const VPBasicBlock *getParent() const { return Parent; }
+  /// \return the VPRegionBlock which the recipe belongs to.
+  VPRegionBlock *getRegion();
+  const VPRegionBlock *getRegion() const;
+
   /// The method which generates the output IR instructions that correspond to
   /// this VPRecipe, thereby "executing" the VPlan.
   virtual void execute(VPTransformState &State) = 0;
@@ -4075,6 +4079,14 @@ public:
   }
 };
+inline VPRegionBlock *VPRecipeBase::getRegion() {
+  return getParent()->getParent();
+}
+
+inline const VPRegionBlock *VPRecipeBase::getRegion() const {
+  return getParent()->getParent();
+}
+
 /// VPlan models a candidate for vectorization, encoding various decisions take
 /// to produce efficient output IR, including which branches, basic-blocks and
 /// output IR instructions to generate, and their cost. VPlan holds a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f413c63..7e074c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -377,7 +377,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
 #ifndef NDEBUG
   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
-    auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
+    VPRegionBlock *Region = R->getRegion();
     if (Region && Region->isReplicator()) {
       assert(Region->getNumSuccessors() == 1 &&
              Region->getNumPredecessors() == 1 && "Expected SESE region!");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 1580a3b..2aaabd9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -474,7 +474,7 @@ public:
 /// Check if a constant \p CI can be safely treated as having been extended
 /// from a narrower type with the given extension kind.
-bool canConstantBeExtended(const ConstantInt *CI, Type *NarrowType,
+bool canConstantBeExtended(const APInt *C, Type *NarrowType,
                            TTI::PartialReductionExtendKind ExtKind);
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index ff286f7..d8203e2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -173,10 +173,10 @@ inline int_pred_ty<is_zero_int> m_ZeroInt() {
 /// For vectors, this includes constants with undefined elements.
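The reworked canConstantBeExtended, now taking the constant as a bare APInt, is a trunc-then-extend round-trip check: the constant may be treated as if it had been extended from NarrowType exactly when truncating it and re-extending with the given kind reproduces the original value. A small stand-alone sketch of the same check; the helper name is illustrative and NarrowBits is assumed to be smaller than the constant's width:

  #include "llvm/ADT/APInt.h"

  // Returns true if C survives a round trip through NarrowBits bits with the
  // chosen extension, mirroring the check in the VPlan.cpp hunk above.
  static bool roundTripsThroughNarrowType(const llvm::APInt &C,
                                          unsigned NarrowBits, bool IsSigned) {
    llvm::APInt Truncated = C.trunc(NarrowBits);
    llvm::APInt Extended = IsSigned ? Truncated.sext(C.getBitWidth())
                                    : Truncated.zext(C.getBitWidth());
    return Extended == C;
  }

  // For example, 255 held in 32 bits round-trips through 8 bits under zero
  // extension (0xFF -> 255) but not under sign extension (0xFF -> -1).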
 inline int_pred_ty<is_one> m_One() { return int_pred_ty<is_one>(); }
-struct bind_const_int {
-  uint64_t &Res;
+struct bind_apint {
+  const APInt *&Res;
-  bind_const_int(uint64_t &Res) : Res(Res) {}
+  bind_apint(const APInt *&Res) : Res(Res) {}
   bool match(VPValue *VPV) const {
     if (!VPV->isLiveIn())
@@ -188,7 +188,23 @@ struct bind_const_int {
     const auto *CI = dyn_cast<ConstantInt>(V);
     if (!CI)
       return false;
-    if (auto C = CI->getValue().tryZExtValue()) {
+    Res = &CI->getValue();
+    return true;
+  }
+};
+
+inline bind_apint m_APInt(const APInt *&C) { return C; }
+
+struct bind_const_int {
+  uint64_t &Res;
+
+  bind_const_int(uint64_t &Res) : Res(Res) {}
+
+  bool match(VPValue *VPV) const {
+    const APInt *APConst;
+    if (!bind_apint(APConst).match(VPV))
+      return false;
+    if (auto C = APConst->tryZExtValue()) {
       Res = *C;
       return true;
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 775837f..d1e67e6b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -341,12 +341,12 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
     ExtAType = GetExtendKind(ExtAR);
     ExtBType = GetExtendKind(ExtBR);
-    if (!ExtBR && Widen->getOperand(1)->isLiveIn()) {
-      auto *CI = cast<ConstantInt>(Widen->getOperand(1)->getLiveInIRValue());
-      if (canConstantBeExtended(CI, InputTypeA, ExtAType)) {
-        InputTypeB = InputTypeA;
-        ExtBType = ExtAType;
-      }
+    using namespace VPlanPatternMatch;
+    const APInt *C;
+    if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) &&
+        canConstantBeExtended(C, InputTypeA, ExtAType)) {
+      InputTypeB = InputTypeA;
+      ExtBType = ExtAType;
     }
   };
@@ -2352,7 +2352,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
     return false;
   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
-  auto *CanIV = getParent()->getParent()->getCanonicalIV();
+  auto *CanIV = getRegion()->getCanonicalIV();
   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
          getScalarType() == CanIV->getScalarType();
 }
@@ -3076,7 +3076,7 @@ static void scalarizeInstruction(const Instruction *Instr,
     State.AC->registerAssumption(II);
   assert(
-      (RepRecipe->getParent()->getParent() ||
+      (RepRecipe->getRegion() ||
       !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
       all_of(RepRecipe->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
@@ -3268,7 +3268,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                    to_vector(operands()), VF);
   // If the recipe is not predicated (i.e. not in a replicate region), return
   // the scalar cost. Otherwise handle predicated cost.
-  if (!getParent()->getParent()->isReplicator())
+  if (!getRegion()->isReplicator())
     return ScalarCost;
   // Account for the phi nodes that we will create.
@@ -3284,7 +3284,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   case Instruction::Store: {
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
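The new bind_apint/m_APInt binder gives VPlan pattern matching the same shape as the long-standing IR-level matcher of the same name, and bind_const_int is now layered on top of it. For orientation, this is how the IR-level analogue from llvm::PatternMatch is typically used; the function below is only an illustration, not part of the patch:

  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"

  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Binds C to the constant operand's value if V is an `add` whose second
  // operand is a constant integer, leaving the other operand unconstrained.
  static bool matchAddOfConstant(Value *V, const APInt *&C) {
    return match(V, m_Add(m_Value(), m_APInt(C)));
  }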
-    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    const VPRegionBlock *ParentRegion = getRegion();
     if (ParentRegion && ParentRegion->isReplicator())
       break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8d76b2d8..f5f616f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1858,8 +1858,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
       return nullptr;
     VPRegionBlock *EnclosingLoopRegion =
         HoistCandidate->getParent()->getEnclosingLoopRegion();
-    assert((!HoistCandidate->getParent()->getParent() ||
-            HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
+    assert((!HoistCandidate->getRegion() ||
+            HoistCandidate->getRegion() == EnclosingLoopRegion) &&
           "CFG in VPlan should still be flat, without replicate regions");
     // Hoist candidate was already visited, no need to hoist.
     if (!Visited.insert(HoistCandidate).second)
@@ -2122,9 +2122,18 @@ static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();
   // Return true if we do not know how to (mechanically) hoist a given recipe
-  // out of a loop region. Does not address legality concerns such as aliasing
-  // or speculation safety.
+  // out of a loop region.
   auto CannotHoistRecipe = [](VPRecipeBase &R) {
+    // Assumes don't alias anything or throw; as long as they're guaranteed to
+    // execute, they're safe to hoist.
+    if (match(&R, m_Intrinsic<Intrinsic::assume>()))
+      return false;
+
+    // TODO: Relax checks in the future, e.g. we could also hoist reads, if
+    // their memory location is not modified in the vector loop.
+    if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
+      return true;
+
     // Allocas cannot be hoisted.
     auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
     return RepR && RepR->getOpcode() == Instruction::Alloca;
@@ -2132,17 +2141,18 @@ static void licm(VPlan &Plan) {
   // Hoist any loop invariant recipes from the vector loop region to the
   // preheader. Perform a shallow traversal of the vector loop region, to
-  // exclude recipes in replicate regions.
+  // exclude recipes in replicate regions. Since the top-level blocks in the
+  // vector loop region are guaranteed to execute if the vector pre-header is,
+  // we don't need to check speculation safety.
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  assert(Preheader->getSingleSuccessor() == LoopRegion &&
+         "Expected vector preheader's successor to be the vector loop region");
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(LoopRegion->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       if (CannotHoistRecipe(R))
         continue;
-      // TODO: Relax checks in the future, e.g. we could also hoist reads, if
-      // their memory location is not modified in the vector loop.
-      if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
-          any_of(R.operands(), [](VPValue *Op) {
+      if (any_of(R.operands(), [](VPValue *Op) {
            return !Op->isDefinedOutsideLoopRegions();
          }))
        continue;
@@ -2888,7 +2898,7 @@ void VPlanTransforms::replaceSymbolicStrides(
   // evolution.
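The licm change above treats llvm.assume specially: an assume neither reads nor writes real memory and cannot throw, so once a recipe is known to execute whenever the preheader does, hoisting it only requires its operands to be loop-invariant. An IR-level sketch of the same predicate, assuming guaranteed execution has already been established; the function name is illustrative and this is not the recipe-level code from the patch:

  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/IntrinsicInst.h"

  // Mirrors the CannotHoistRecipe logic at the IR level: assumes are always
  // mechanically hoistable (checked first because they report side effects);
  // anything else must not touch memory, must not be a PHI, and must not be an
  // alloca.
  static bool cannotHoist(const llvm::Instruction &I) {
    if (llvm::isa<llvm::AssumeInst>(&I))
      return false;
    if (I.mayHaveSideEffects() || I.mayReadFromMemory() ||
        llvm::isa<llvm::PHINode>(&I))
      return true;
    return llvm::isa<llvm::AllocaInst>(&I);
  }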
   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
     auto *R = cast<VPRecipeBase>(&U);
-    return R->getParent()->getParent() ||
+    return R->getRegion() ||
            R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
   };
   ValueToSCEVMapTy RewriteMap;
@@ -3793,8 +3803,7 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
       continue;
     auto *DefR = cast<VPRecipeWithIRFlags>(&R);
     auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
-      VPRegionBlock *ParentRegion =
-          cast<VPRecipeBase>(U)->getParent()->getParent();
+      VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
       return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
     };
     if ((isa<VPReplicateRecipe>(DefR) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index cf95ac0..9a2497e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
     return true;
   if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
-    const VPRegionBlock *RegionOfR = Rep->getParent()->getParent();
+    const VPRegionBlock *RegionOfR = Rep->getRegion();
     // Don't consider recipes in replicate regions as uniform yet; their first
     // lane cannot be accessed when executing the replicate region for other
     // lanes.
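Looking back at the SLPVectorizer.cpp hunks, the constant-stride analysis factored into analyzeConstantStrideCandidate comes down to simple integer arithmetic on the pointer offsets: the total span Diff must divide evenly by (number of loads - 1), and every individual offset must then be a distinct multiple of the resulting stride. A plain-C++ sketch of that arithmetic, detached from the SLP data structures; the function name and the use of std::set are illustrative only:

  #include <cstdint>
  #include <set>
  #include <vector>

  // Offsets holds each pointer's distance from the lowest pointer; Diff is the
  // distance of the highest one. Returns true if the offsets form
  // 0, S, 2*S, ..., Diff in some order, i.e. a constant-stride access pattern.
  static bool hasConstantStride(const std::vector<int64_t> &Offsets,
                                int64_t Diff) {
    const size_t Sz = Offsets.size();
    if (Sz < 2 || Diff % static_cast<int64_t>(Sz - 1) != 0)
      return false;
    const int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
    if (Stride == 0)
      return false;
    std::set<int64_t> Seen;
    for (int64_t Off : Offsets)
      if (Off % Stride != 0 || !Seen.insert(Off).second)
        return false; // not a multiple of the stride, or a repeated offset
    return true;
  }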