Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- | llvm/lib/Transforms/CFGuard/CFGuard.cpp                  |  25
-rw-r--r-- | llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp |  39
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp  |   4
-rw-r--r-- | llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp |  20
-rw-r--r-- | llvm/lib/Transforms/Scalar/LoopFuse.cpp                  |  34
-rw-r--r-- | llvm/lib/Transforms/Scalar/LoopPassManager.cpp           |   5
-rw-r--r-- | llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp     |  59
-rw-r--r-- | llvm/lib/Transforms/Scalar/Reg2Mem.cpp                   |   6
-rw-r--r-- | llvm/lib/Transforms/Scalar/SROA.cpp                      |  34
-rw-r--r-- | llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp        |  40
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp          | 111
11 files changed, 243 insertions, 134 deletions
diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp index b73a0ce..4645670 100644 --- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp +++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp @@ -147,7 +147,7 @@ public: private: // Only add checks if the module has the cfguard=2 flag. - int cfguard_module_flag = 0; + int CFGuardModuleFlag = 0; StringRef GuardFnName; Mechanism GuardMechanism = Mechanism::Check; FunctionType *GuardFnType = nullptr; @@ -162,9 +162,7 @@ public: static char ID; // Default constructor required for the INITIALIZE_PASS macro. - CFGuard(CFGuardImpl::Mechanism M) : FunctionPass(ID), Impl(M) { - initializeCFGuardPass(*PassRegistry::getPassRegistry()); - } + CFGuard(CFGuardImpl::Mechanism M) : FunctionPass(ID), Impl(M) {} bool doInitialization(Module &M) override { return Impl.doInitialization(M); } bool runOnFunction(Function &F) override { return Impl.runOnFunction(F); } @@ -173,7 +171,6 @@ public: } // end anonymous namespace void CFGuardImpl::insertCFGuardCheck(CallBase *CB) { - assert(CB->getModule()->getTargetTriple().isOSWindows() && "Only applicable for Windows targets"); assert(CB->isIndirectCall() && @@ -202,7 +199,6 @@ void CFGuardImpl::insertCFGuardCheck(CallBase *CB) { } void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) { - assert(CB->getModule()->getTargetTriple().isOSWindows() && "Only applicable for Windows targets"); assert(CB->isIndirectCall() && @@ -236,14 +232,13 @@ void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) { } bool CFGuardImpl::doInitialization(Module &M) { - // Check if this module has the cfguard flag and read its value. if (auto *MD = mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard"))) - cfguard_module_flag = MD->getZExtValue(); + CFGuardModuleFlag = MD->getZExtValue(); // Skip modules for which CFGuard checks have been disabled. - if (cfguard_module_flag != 2) + if (CFGuardModuleFlag != 2) return false; // Set up prototypes for the guard check and dispatch functions. @@ -264,9 +259,8 @@ bool CFGuardImpl::doInitialization(Module &M) { } bool CFGuardImpl::runOnFunction(Function &F) { - // Skip modules for which CFGuard checks have been disabled. - if (cfguard_module_flag != 2) + if (CFGuardModuleFlag != 2) return false; SmallVector<CallBase *, 8> IndirectCalls; @@ -286,19 +280,16 @@ bool CFGuardImpl::runOnFunction(Function &F) { } // If no checks are needed, return early. - if (IndirectCalls.empty()) { + if (IndirectCalls.empty()) return false; - } // For each indirect call/invoke, add the appropriate dispatch or check. if (GuardMechanism == Mechanism::Dispatch) { - for (CallBase *CB : IndirectCalls) { + for (CallBase *CB : IndirectCalls) insertCFGuardDispatch(CB); - } } else { - for (CallBase *CB : IndirectCalls) { + for (CallBase *CB : IndirectCalls) insertCFGuardCheck(CB); - } } return true; diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 5066a99..894d83f 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -6150,3 +6150,42 @@ void MemProfContextDisambiguation::run( IndexCallsiteContextGraph CCG(Index, isPrevailing); CCG.process(); } + +// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline +// when we don't have an index that has recorded that we are linking with +// allocation libraries containing the necessary APIs for downstream +// transformations. 
+PreservedAnalyses MemProfRemoveInfo::run(Module &M, ModuleAnalysisManager &AM) { + // The profile matcher applies hotness attributes directly for allocations, + // and those will cause us to generate calls to the hot/cold interfaces + // unconditionally. If supports-hot-cold-new was not enabled in the LTO + // link then assume we don't want these calls (e.g. not linking with + // the appropriate library, or otherwise trying to disable this behavior). + bool Changed = false; + for (auto &F : M) { + for (auto &BB : F) { + for (auto &I : BB) { + auto *CI = dyn_cast<CallBase>(&I); + if (!CI) + continue; + if (CI->hasFnAttr("memprof")) { + CI->removeFnAttr("memprof"); + Changed = true; + } + if (!CI->hasMetadata(LLVMContext::MD_callsite)) { + assert(!CI->hasMetadata(LLVMContext::MD_memprof)); + continue; + } + // Strip off all memprof metadata as it is no longer needed. + // Importantly, this avoids the addition of new memprof attributes + // after inlining propagation. + CI->setMetadata(LLVMContext::MD_memprof, nullptr); + CI->setMetadata(LLVMContext::MD_callsite, nullptr); + Changed = true; + } + } + } + if (!Changed) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 07ad65c..fba1ccf 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1481,13 +1481,13 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp, return new ICmpInst(Pred, Y, ConstantInt::get(SrcTy, C.logBase2())); } - if (Cmp.isEquality() && Trunc->hasOneUse()) { + if (Cmp.isEquality() && (Trunc->hasOneUse() || Trunc->hasNoUnsignedWrap())) { // Canonicalize to a mask and wider compare if the wide type is suitable: // (trunc X to i8) == C --> (X & 0xff) == (zext C) if (!SrcTy->isVectorTy() && shouldChangeType(DstBits, SrcBits)) { Constant *Mask = ConstantInt::get(SrcTy, APInt::getLowBitsSet(SrcBits, DstBits)); - Value *And = Builder.CreateAnd(X, Mask); + Value *And = Trunc->hasNoUnsignedWrap() ? X : Builder.CreateAnd(X, Mask); Constant *WideC = ConstantInt::get(SrcTy, C.zext(SrcBits)); return new ICmpInst(Pred, And, WideC); } diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 6e17801..2646334 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -844,6 +844,7 @@ struct AddressSanitizer { bool maybeInsertAsanInitAtFunctionEntry(Function &F); bool maybeInsertDynamicShadowAtFunctionEntry(Function &F); void markEscapedLocalAllocas(Function &F); + void markCatchParametersAsUninteresting(Function &F); private: friend struct FunctionStackPoisoner; @@ -2997,6 +2998,22 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) { } } } +// Mitigation for https://github.com/google/sanitizers/issues/749 +// We don't instrument Windows catch-block parameters to avoid +// interfering with exception handling assumptions. +void AddressSanitizer::markCatchParametersAsUninteresting(Function &F) { + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (auto *CatchPad = dyn_cast<CatchPadInst>(&I)) { + // Mark the parameters to a catch-block as uninteresting to avoid + // instrumenting them. 
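A quick arithmetic illustration of the InstCombineCompares change above, which lets the equality fold skip the mask when the trunc carries the nuw flag (the dropped high bits are known zero). This is a hypothetical standalone C++ check of the identity, not InstCombine code; the values are made up for the example.

#include <cassert>
#include <cstdint>

int main() {
  uint8_t C = 0x05;
  // High bits zero: the situation "trunc nuw" guarantees.
  uint32_t X = 0x00000005;
  // Generic fold: (trunc X to i8) == C  -->  (X & 0xff) == zext(C).
  assert(((uint8_t)X == C) == ((X & 0xffu) == (uint32_t)C));
  // With nuw the mask is redundant: compare X against zext(C) directly.
  assert(((uint8_t)X == C) == (X == (uint32_t)C));

  // High bits set: trunc would drop nonzero bits, so nuw would not hold.
  uint32_t Y = 0x00000105;
  // The masked form is still correct...
  assert(((uint8_t)Y == C) == ((Y & 0xffu) == (uint32_t)C));
  // ...but the unmasked compare would give a different answer, which is why
  // dropping the mask is only sound when the trunc is nuw.
  assert(((uint8_t)Y == C) != (Y == (uint32_t)C));
  return 0;
}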
+ for (Value *Operand : CatchPad->arg_operands()) + if (auto *AI = dyn_cast<AllocaInst>(Operand)) + ProcessedAllocas[AI] = false; + } + } + } +} bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) { bool ShouldInstrument = @@ -3041,6 +3058,9 @@ bool AddressSanitizer::instrumentFunction(Function &F, // can be passed to that intrinsic. markEscapedLocalAllocas(F); + if (TargetTriple.isOSWindows()) + markCatchParametersAsUninteresting(F); + // We want to instrument every address only once per basic block (unless there // are calls between uses). SmallPtrSet<Value *, 16> TempsToInstrument; diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 20733032..19eccb9 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -368,7 +368,7 @@ private: Valid = false; } - bool reportInvalidCandidate(llvm::Statistic &Stat) const { + bool reportInvalidCandidate(Statistic &Stat) const { using namespace ore; assert(L && Preheader && "Fusion candidate not initialized properly!"); #if LLVM_ENABLE_STATS @@ -445,6 +445,7 @@ struct FusionCandidateCompare { "No dominance relationship between these fusion candidates!"); } }; +} // namespace using LoopVector = SmallVector<Loop *, 4>; @@ -461,9 +462,15 @@ using LoopVector = SmallVector<Loop *, 4>; using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; -#if !defined(NDEBUG) -static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const FusionCandidate &FC) { +#ifndef NDEBUG +static void printLoopVector(const LoopVector &LV) { + dbgs() << "****************************\n"; + for (const Loop *L : LV) + printLoop(*L, dbgs()); + dbgs() << "****************************\n"; +} + +static raw_ostream &operator<<(raw_ostream &OS, const FusionCandidate &FC) { if (FC.isValid()) OS << FC.Preheader->getName(); else @@ -472,8 +479,8 @@ static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, return OS; } -static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const FusionCandidateSet &CandSet) { +static raw_ostream &operator<<(raw_ostream &OS, + const FusionCandidateSet &CandSet) { for (const FusionCandidate &FC : CandSet) OS << FC << '\n'; @@ -489,7 +496,9 @@ printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { dbgs() << "****************************\n"; } } -#endif +#endif // NDEBUG + +namespace { /// Collect all loops in function at the same nest level, starting at the /// outermost level. @@ -550,15 +559,6 @@ private: LoopsOnLevelTy LoopsOnLevel; }; -#ifndef NDEBUG -static void printLoopVector(const LoopVector &LV) { - dbgs() << "****************************\n"; - for (auto *L : LV) - printLoop(*L, dbgs()); - dbgs() << "****************************\n"; -} -#endif - struct LoopFuser { private: // Sets of control flow equivalent fusion candidates for a given nest level. 
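For context on the AddressSanitizer change above (markCatchParametersAsUninteresting): a rough, hypothetical example of the source pattern it targets. On Windows EH the catch object is materialized into an alloca that appears as a catchpad operand and is written by the EH runtime rather than by instrumented stores, so ASan now leaves such allocas uninstrumented. The IR shape sketched in the trailing comment is approximate.

void mayThrow();

int handleException() {
  try {
    mayThrow();
  } catch (int e) {   // 'e' lives in an alloca referenced by the catchpad
    return e;
  }
  return 0;
}

// With MSVC-style EH the handler is lowered roughly as:
//   %e.addr = alloca i32
//   ...
//   %cp = catchpad within %cs [ptr @type_descriptor, i32 0, ptr %e.addr]
// and %e.addr is the operand the new code marks as uninteresting.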
@@ -1850,7 +1850,7 @@ private: /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> template <typename RemarkKind> void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, - llvm::Statistic &Stat) { + Statistic &Stat) { assert(FC0.Preheader && FC1.Preheader && "Expecting valid fusion candidates"); using namespace ore; diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 7da8586..d827e64 100644 --- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -8,7 +8,6 @@ #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -217,9 +216,6 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, // Get the analysis results needed by loop passes. MemorySSA *MSSA = UseMemorySSA ? (&AM.getResult<MemorySSAAnalysis>(F).getMSSA()) : nullptr; - BlockFrequencyInfo *BFI = UseBlockFrequencyInfo && F.hasProfileData() - ? (&AM.getResult<BlockFrequencyAnalysis>(F)) - : nullptr; LoopStandardAnalysisResults LAR = {AM.getResult<AAManager>(F), AM.getResult<AssumptionAnalysis>(F), AM.getResult<DominatorTreeAnalysis>(F), @@ -227,7 +223,6 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, AM.getResult<ScalarEvolutionAnalysis>(F), AM.getResult<TargetLibraryAnalysis>(F), AM.getResult<TargetIRAnalysis>(F), - BFI, MSSA}; // Setup the loop analysis manager from its proxy. It is important that diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 7cae94eb..3487e81 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -97,6 +97,12 @@ static cl::opt<MatrixLayoutTy> MatrixLayout( static cl::opt<bool> PrintAfterTransposeOpt("matrix-print-after-transpose-opt", cl::init(false)); +static cl::opt<unsigned> SplitMatmulRemainderOverThreshold( + "matrix-split-matmul-remainder-over-threshold", cl::Hidden, + cl::desc("Illegal remainder vectors over this size in bits should be split " + "in the inner loop of matmul"), + cl::init(0)); + /// Helper function to either return Scope, if it is a subprogram or the /// attached subprogram for a local scope. static DISubprogram *getSubprogram(DIScope *Scope) { @@ -115,18 +121,16 @@ static bool isSplat(Value *V) { /// Match any mul operation (fp or integer). template <typename LTy, typename RTy> -auto m_AnyMul(const LTy &L, const RTy &R) { +static auto m_AnyMul(const LTy &L, const RTy &R) { return m_CombineOr(m_Mul(L, R), m_FMul(L, R)); } /// Match any add operation (fp or integer). template <typename LTy, typename RTy> -auto m_AnyAdd(const LTy &L, const RTy &R) { +static auto m_AnyAdd(const LTy &L, const RTy &R) { return m_CombineOr(m_Add(L, R), m_FAdd(L, R)); } -namespace { - // Given an element pointer \p BasePtr to the start of a (sub) matrix, compute // the start address of vector \p VecIdx with type (\p EltType x \p NumElements) // assuming \p Stride elements between start two consecutive vectors. 
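As a reminder of what the LowerMatrixIntrinsics helper described above computes (it is only being made static here), a plain C++ sketch of the address arithmetic, assuming Stride is measured in elements as in the pass: vector VecIdx of the (sub)matrix starts VecIdx * Stride elements past BasePtr. The function below is illustrative, not the pass helper, which emits the same offset as a mul plus a GEP through IRBuilder.

#include <cstddef>

template <typename EltType>
EltType *vectorStartAddr(EltType *BasePtr, std::size_t VecIdx,
                         std::size_t Stride) {
  return BasePtr + VecIdx * Stride;
}

// Example: a column-major 4x4 submatrix embedded in a matrix with 6 rows has
// Stride == 6, so column 2 starts at BasePtr + 2 * 6 = BasePtr + 12 elements.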
@@ -167,9 +171,9 @@ namespace { // v_2_0 |v_2_1 |v_2_2 |v_2_3 // v_3_0 {v_3_1 {v_3_2 v_3_3 // -Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride, - unsigned NumElements, Type *EltType, - IRBuilder<> &Builder) { +static Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride, + unsigned NumElements, Type *EltType, + IRBuilder<> &Builder) { assert((!isa<ConstantInt>(Stride) || cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) && @@ -338,6 +342,8 @@ computeShapeInfoForInst(Instruction *I, return std::nullopt; } +namespace { + /// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics. /// /// Currently, the lowering for each matrix intrinsic is done as follows: @@ -371,7 +377,8 @@ class LowerMatrixIntrinsics { LoopInfo *LI = nullptr; OptimizationRemarkEmitter *ORE = nullptr; - /// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation. + /// Contains estimates of the number of operations (loads, stores, compute) + /// required to lower a matrix operation. struct OpInfoTy { /// Number of stores emitted to generate this matrix. unsigned NumStores = 0; @@ -1719,6 +1726,31 @@ public: ToRemove.push_back(MatMul); } + /// Given \p Remainder iterations of the the matmul inner loop, + /// potentially lower \p Blocksize that is used for the underlying + /// vector. + unsigned capBlockSize(unsigned BlockSize, unsigned Remainder, Type *EltType) { + if (BlockSize <= Remainder) + return BlockSize; + + // If the remainder is also a legal type just use it. + auto *VecTy = FixedVectorType::get(EltType, Remainder); + if (TTI.isTypeLegal(VecTy)) + return Remainder; + + // Similarly, if the vector is small enough that we don't want + // to split further. + if (VecTy->getPrimitiveSizeInBits() <= SplitMatmulRemainderOverThreshold) + return Remainder; + + // Gradually lower the vectorization factor to cover the + // remainder. + do { + BlockSize /= 2; + } while (BlockSize > Remainder); + return BlockSize; + } + /// Compute \p Result += \p A * \p B for input matrices with left-associating /// addition. /// @@ -1756,10 +1788,8 @@ public: bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J)); for (unsigned I = 0; I < R; I += BlockSize) { - // Gradually lower the vectorization factor to cover the remainder. - while (I + BlockSize > R) - BlockSize /= 2; - + // Lower block size to make sure we stay within bounds. + BlockSize = capBlockSize(BlockSize, R - I, Result.getElementType()); Value *Sum = IsTiled ? Result.extractVector(I, J, BlockSize, Builder) : nullptr; for (unsigned K = 0; K < M; ++K) { @@ -1784,9 +1814,8 @@ public: unsigned BlockSize = VF; bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I)); for (unsigned J = 0; J < C; J += BlockSize) { - // Gradually lower the vectorization factor to cover the remainder. - while (J + BlockSize > C) - BlockSize /= 2; + // Lower the vectorization factor to cover the remainder. 
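A standalone sketch of the remainder policy introduced by capBlockSize above. Target legality is passed in as a plain flag because TTI is not available outside the pass; the function name and parameters here are illustrative only.

#include <cassert>

unsigned capBlockSizeSketch(unsigned BlockSize, unsigned Remainder,
                            unsigned EltSizeInBits, unsigned SplitThresholdBits,
                            bool RemainderVecTypeIsLegal) {
  if (BlockSize <= Remainder)            // Full-width block still fits.
    return BlockSize;
  if (RemainderVecTypeIsLegal)           // e.g. <3 x float> is legal on the target.
    return Remainder;
  if (Remainder * EltSizeInBits <= SplitThresholdBits)
    return Remainder;                    // Small enough to keep as one operation.
  while (BlockSize > Remainder)          // Otherwise halve, as the old code did.
    BlockSize /= 2;
  return BlockSize;
}

int main() {
  // VF = 4 over 7 rows: the first block covers rows 0-3, leaving a remainder of 3.
  assert(capBlockSizeSketch(4, 3, 32, 0, false) == 2);   // fall back to halving
  assert(capBlockSizeSketch(4, 3, 32, 0, true) == 3);    // keep the <3 x ...> remainder
  assert(capBlockSizeSketch(4, 3, 32, 128, false) == 3); // 96 bits is under the threshold
  return 0;
}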
+ BlockSize = capBlockSize(BlockSize, C - J, Result.getElementType()); Value *Sum = nullptr; for (unsigned K = 0; K < M; ++K) { diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 30b27cb..7646624 100644 --- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -107,9 +107,7 @@ PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) { return PA; } -namespace llvm { - -void initializeRegToMemWrapperPassPass(PassRegistry &); +namespace { class RegToMemWrapperPass : public FunctionPass { public: @@ -136,7 +134,7 @@ public: return N != 0 || Changed; } }; -} // namespace llvm +} // namespace INITIALIZE_PASS_BEGIN(RegToMemWrapperPass, "reg2mem", "", true, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index a692009..5c60fad 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -344,6 +344,12 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, uint64_t SliceSizeInBits, Instruction *OldInst, Instruction *Inst, Value *Dest, Value *Value, const DataLayout &DL) { + // If we want allocas to be migrated using this helper then we need to ensure + // that the BaseFragments map code still works. A simple solution would be + // to choose to always clone alloca dbg_assigns (rather than sometimes + // "stealing" them). + assert(!isa<AllocaInst>(Inst) && "Unexpected alloca"); + auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst); // Nothing to do if OldInst has no linked dbg.assign intrinsics. if (DVRAssignMarkerRange.empty()) @@ -429,11 +435,22 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID); } - ::Value *NewValue = Value ? Value : DbgAssign->getValue(); - DbgVariableRecord *NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>( - DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr, - Dest, DIExpression::get(Expr->getContext(), {}), - DbgAssign->getDebugLoc()))); + DbgVariableRecord *NewAssign; + if (IsSplit) { + ::Value *NewValue = Value ? Value : DbgAssign->getValue(); + NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>( + DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr, + Dest, DIExpression::get(Expr->getContext(), {}), + DbgAssign->getDebugLoc()))); + } else { + // The store is not split, simply steal the existing dbg_assign. + NewAssign = DbgAssign; + NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs? + NewAssign->setAddress(Dest); + if (Value) + NewAssign->replaceVariableLocationOp(0u, Value); + assert(Expr == NewAssign->getExpression()); + } // If we've updated the value but the original dbg.assign has an arglist // then kill it now - we can't use the requested new value. @@ -464,9 +481,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, // noted as slightly offset (in code) from the store. In practice this // should have little effect on the debugging experience due to the fact // that all the split stores should get the same line number. 
- NewAssign->moveBefore(DbgAssign->getIterator()); - - NewAssign->setDebugLoc(DbgAssign->getDebugLoc()); + if (NewAssign != DbgAssign) { + NewAssign->moveBefore(DbgAssign->getIterator()); + NewAssign->setDebugLoc(DbgAssign->getDebugLoc()); + } LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n"); }; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index e4ba70d..5af6c96 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -3611,8 +3610,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, AAResults &AA, TargetTransformInfo &TTI, bool Trivial, bool NonTrivial, ScalarEvolution *SE, - MemorySSAUpdater *MSSAU, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI, LPMUpdater &LoopUpdater) { + MemorySSAUpdater *MSSAU, LPMUpdater &LoopUpdater) { assert(L.isRecursivelyLCSSAForm(DT, LI) && "Loops must be in LCSSA form before unswitching."); @@ -3652,35 +3650,6 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, if (F->hasOptSize()) return false; - // Returns true if Loop L's loop nest is cold, i.e. if the headers of L, - // of the loops L is nested in, and of the loops nested in L are all cold. - auto IsLoopNestCold = [&](const Loop *L) { - // Check L and all of its parent loops. - auto *Parent = L; - while (Parent) { - if (!PSI->isColdBlock(Parent->getHeader(), BFI)) - return false; - Parent = Parent->getParentLoop(); - } - // Next check all loops nested within L. - SmallVector<const Loop *, 4> Worklist; - llvm::append_range(Worklist, L->getSubLoops()); - while (!Worklist.empty()) { - auto *CurLoop = Worklist.pop_back_val(); - if (!PSI->isColdBlock(CurLoop->getHeader(), BFI)) - return false; - llvm::append_range(Worklist, CurLoop->getSubLoops()); - } - return true; - }; - - // Skip cold loops in cold loop nests, as unswitching them brings little - // benefit but increases the code size - if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) { - LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n"); - return false; - } - // Perform legality checks. if (!isSafeForNoNTrivialUnswitching(L, LI)) return false; @@ -3705,11 +3674,6 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, LPMUpdater &U) { Function &F = *L.getHeader()->getParent(); (void)F; - ProfileSummaryInfo *PSI = nullptr; - if (auto OuterProxy = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR) - .getCachedResult<ModuleAnalysisManagerFunctionProxy>(F)) - PSI = OuterProxy->getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n"); @@ -3720,7 +3684,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); } if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial, - &AR.SE, MSSAU ? &*MSSAU : nullptr, PSI, AR.BFI, U)) + &AR.SE, MSSAU ? 
&*MSSAU : nullptr, U)) return PreservedAnalyses::all(); if (AR.MSSA && VerifyMemorySSA) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 88af2cf..9cd52da 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2242,8 +2242,49 @@ public: /// may not be necessary. bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const; bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, - Align Alignment, const int64_t Diff, Value *Ptr0, - Value *PtrN, StridedPtrInfo &SPtrInfo) const; + Align Alignment, const int64_t Diff, + const size_t Sz) const; + + /// Return true if an array of scalar loads can be replaced with a strided + /// load (with constant stride). + /// + /// TODO: + /// It is possible that the load gets "widened". Suppose that originally each + /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is + /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2 + /// ... + /// %b + 0 * %s + (w - 1) + /// + /// %b + 1 * %s + 0 + /// %b + 1 * %s + 1 + /// %b + 1 * %s + 2 + /// ... + /// %b + 1 * %s + (w - 1) + /// ... + /// + /// %b + (n - 1) * %s + 0 + /// %b + (n - 1) * %s + 1 + /// %b + (n - 1) * %s + 2 + /// ... + /// %b + (n - 1) * %s + (w - 1) + /// + /// In this case we will generate a strided load of type `<n x (k * w)>`. + /// + /// \param PointerOps list of pointer arguments of loads. + /// \param ElemTy original scalar type of loads. + /// \param Alignment alignment of the first load. + /// \param SortedIndices is the order of PointerOps as returned by + /// `sortPtrAccesses` + /// \param Diff Pointer difference between the lowest and the highes pointer + /// in `PointerOps` as returned by `getPointersDiff`. + /// \param Ptr0 first pointer in `PointersOps`. + /// \param PtrN last pointer in `PointersOps`. + /// \param SPtrInfo If the function return `true`, it also sets all the fields + /// of `SPtrInfo` necessary to generate the strided load later. + bool analyzeConstantStrideCandidate( + const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment, + const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, + Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const; /// Return true if an array of scalar loads can be replaced with a strided /// load (with run-time stride). @@ -6849,9 +6890,8 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, /// current graph (for masked gathers extra extractelement instructions /// might be required). bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, - Align Alignment, const int64_t Diff, Value *Ptr0, - Value *PtrN, StridedPtrInfo &SPtrInfo) const { - const size_t Sz = PointerOps.size(); + Align Alignment, const int64_t Diff, + const size_t Sz) const { if (Diff % (Sz - 1) != 0) return false; @@ -6875,27 +6915,40 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, return false; if (!TTI->isLegalStridedLoadStore(VecTy, Alignment)) return false; + return true; + } + return false; +} - // Iterate through all pointers and check if all distances are - // unique multiple of Dist. - SmallSet<int64_t, 4> Dists; - for (Value *Ptr : PointerOps) { - int64_t Dist = 0; - if (Ptr == PtrN) - Dist = Diff; - else if (Ptr != Ptr0) - Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); - // If the strides are not the same or repeated, we can't - // vectorize. 
- if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) - break; - } - if (Dists.size() == Sz) { - Type *StrideTy = DL->getIndexType(Ptr0->getType()); - SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); - SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); - return true; - } +bool BoUpSLP::analyzeConstantStrideCandidate( + const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment, + const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, + Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const { + const size_t Sz = PointerOps.size(); + if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz)) + return false; + + int64_t Stride = Diff / static_cast<int64_t>(Sz - 1); + + // Iterate through all pointers and check if all distances are + // unique multiple of Dist. + SmallSet<int64_t, 4> Dists; + for (Value *Ptr : PointerOps) { + int64_t Dist = 0; + if (Ptr == PtrN) + Dist = Diff; + else if (Ptr != Ptr0) + Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); + // If the strides are not the same or repeated, we can't + // vectorize. + if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) + break; + } + if (Dists.size() == Sz) { + Type *StrideTy = DL->getIndexType(Ptr0->getType()); + SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); + SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); + return true; } return false; } @@ -6995,8 +7048,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( Align Alignment = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]) ->getAlign(); - if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN, - SPtrInfo)) + if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order, + *Diff, Ptr0, PtrN, SPtrInfo)) return LoadsState::StridedVectorize; } if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || @@ -17632,7 +17685,9 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { } if (IsPHI || (!E->isGather() && E->State != TreeEntry::SplitVectorize && - E->doesNotNeedToSchedule()) || + (E->doesNotNeedToSchedule() || + (E->hasCopyableElements() && !E->isCopyableElement(LastInst) && + isUsedOutsideBlock(LastInst)))) || (GatheredLoadsEntriesFirst.has_value() && E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && E->getOpcode() == Instruction::Load)) { |
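Finally, a small self-contained model of the constant-stride test that analyzeConstantStrideCandidate now wraps (see the SLPVectorizer hunks above): given each pointer's element distance from the lowest pointer, the group can become a strided load only if the overall span divides evenly into Sz - 1 steps and every distance is a distinct multiple of that step. The helper below is illustrative, not SLP code.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

// Dists holds each pointer's distance (in elements) from the lowest pointer,
// so it always contains 0 and its maximum is the Diff used by the real code.
bool isConstantStrideSketch(const std::vector<int64_t> &Dists) {
  const int64_t Sz = static_cast<int64_t>(Dists.size());
  if (Sz < 2)
    return false;
  const int64_t Diff = *std::max_element(Dists.begin(), Dists.end());
  if (Diff == 0 || Diff % (Sz - 1) != 0)
    return false;
  const int64_t Stride = Diff / (Sz - 1);
  std::set<int64_t> Seen;
  for (int64_t D : Dists)
    if (D % Stride != 0 || !Seen.insert(D).second)
      return false;              // not a stride multiple, or a repeated offset
  return true;                   // exactly Sz distinct multiples: 0 .. Diff
}

int main() {
  // Loads from p[0], p[3], p[6], p[9]: Diff = 9, Stride = 3 -> strided load.
  assert(isConstantStrideSketch({0, 3, 6, 9}));
  // Loads from p[0], p[2], p[3], p[9]: offsets are not uniform stride multiples.
  assert(!isConstantStrideSketch({0, 2, 3, 9}));
  return 0;
}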