Diffstat (limited to 'llvm/lib/Transforms')
47 files changed, 1759 insertions, 1410 deletions
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 45ee2d4..fe7b3b1 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -181,6 +181,7 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { /// the bit indexes (Mask) needed by a masked compare. If we're matching a chain /// of 'and' ops, then we also need to capture the fact that we saw an /// "and X, 1", so that's an extra return value for that case. +namespace { struct MaskOps { Value *Root = nullptr; APInt Mask; @@ -190,6 +191,7 @@ struct MaskOps { MaskOps(unsigned BitWidth, bool MatchAnds) : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {} }; +} // namespace /// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a /// chain of 'and' or 'or' instructions looking for shift ops of a common source @@ -423,11 +425,8 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI, Arg, 0, SimplifyQuery(Call->getDataLayout(), &TLI, &DT, &AC, Call)))) { IRBuilder<> Builder(Call); - IRBuilderBase::FastMathFlagGuard Guard(Builder); - Builder.setFastMathFlags(Call->getFastMathFlags()); - - Value *NewSqrt = Builder.CreateIntrinsic(Intrinsic::sqrt, Ty, Arg, - /*FMFSource=*/nullptr, "sqrt"); + Value *NewSqrt = + Builder.CreateIntrinsic(Intrinsic::sqrt, Ty, Arg, Call, "sqrt"); Call->replaceAllUsesWith(NewSqrt); // Explicitly erase the old call because a call with side effects is not diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 240d089..7b59c39 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -69,7 +69,6 @@ static const char *const CoroIntrinsics[] = { "llvm.coro.async.context.dealloc", "llvm.coro.async.resume", "llvm.coro.async.size.replace", - "llvm.coro.async.store_resume", "llvm.coro.await.suspend.bool", "llvm.coro.await.suspend.handle", "llvm.coro.await.suspend.void", diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index fde43bb..c3d0a1a 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -1950,9 +1950,8 @@ Expected<bool> FunctionImporter::importFunctions( SrcModule->setPartialSampleProfileRatio(Index); // Link in the specified functions. - if (renameModuleForThinLTO(*SrcModule, Index, ClearDSOLocalOnDeclarations, - &GlobalsToImport)) - return true; + renameModuleForThinLTO(*SrcModule, Index, ClearDSOLocalOnDeclarations, + &GlobalsToImport); if (PrintImports) { for (const auto *GV : GlobalsToImport) @@ -2026,11 +2025,8 @@ static bool doImportingForModuleForTest( // Next we need to promote to global scope and rename any local values that // are potentially exported to other modules. - if (renameModuleForThinLTO(M, *Index, /*ClearDSOLocalOnDeclarations=*/false, - /*GlobalsToImport=*/nullptr)) { - errs() << "Error renaming module\n"; - return true; - } + renameModuleForThinLTO(M, *Index, /*ClearDSOLocalOnDeclarations=*/false, + /*GlobalsToImport=*/nullptr); // Perform the import now. 
auto ModuleLoader = [&M](StringRef Identifier) { diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 96956481..449d64d 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -66,19 +66,19 @@ static cl::opt<unsigned> MaxCodeSizeGrowth( "Maximum codesize growth allowed per function")); static cl::opt<unsigned> MinCodeSizeSavings( - "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, cl::desc( - "Reject specializations whose codesize savings are less than this" - "much percent of the original function size")); + "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, + cl::desc("Reject specializations whose codesize savings are less than this " + "much percent of the original function size")); static cl::opt<unsigned> MinLatencySavings( "funcspec-min-latency-savings", cl::init(40), cl::Hidden, - cl::desc("Reject specializations whose latency savings are less than this" + cl::desc("Reject specializations whose latency savings are less than this " "much percent of the original function size")); static cl::opt<unsigned> MinInliningBonus( - "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc( - "Reject specializations whose inlining bonus is less than this" - "much percent of the original function size")); + "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, + cl::desc("Reject specializations whose inlining bonus is less than this " + "much percent of the original function size")); static cl::opt<bool> SpecializeOnAddress( "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc( diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 16a80e9..78cd249 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -105,7 +105,7 @@ static cl::opt<int> ColdCCRelFreq( "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::desc( "Maximum block frequency, expressed as a percentage of caller's " - "entry frequency, for a call site to be considered cold for enabling" + "entry frequency, for a call site to be considered cold for enabling " "coldcc")); /// Is this global variable possibly used by a leak checker as a root? If so, diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 1bf7ff4..016db55 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -122,6 +122,20 @@ static cl::opt<unsigned> cl::desc("Max depth to recursively search for missing " "frames through tail calls.")); +// By default enable cloning of callsites involved with recursive cycles +static cl::opt<bool> AllowRecursiveCallsites( + "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, + cl::desc("Allow cloning of callsites involved in recursive cycles")); + +// When disabled, try to detect and prevent cloning of recursive contexts. +// This is only necessary until we support cloning through recursive cycles. +// Leave on by default for now, as disabling requires a little bit of compile +// time overhead and doesn't affect correctness, it will just inflate the cold +// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled. 
+static cl::opt<bool> AllowRecursiveContexts( + "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, + cl::desc("Allow cloning of contexts through recursive cycles")); + namespace llvm { cl::opt<bool> EnableMemProfContextDisambiguation( "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, @@ -1236,9 +1250,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( StackEntryIdToContextNodeMap[StackId] = StackNode; StackNode->OrigStackOrAllocId = StackId; } - auto Ins = StackIdSet.insert(StackId); - if (!Ins.second) - StackNode->Recursive = true; + // Marking a node recursive will prevent its cloning completely, even for + // non-recursive contexts flowing through it. + if (!AllowRecursiveCallsites) { + auto Ins = StackIdSet.insert(StackId); + if (!Ins.second) + StackNode->Recursive = true; + } StackNode->AllocTypes |= (uint8_t)AllocType; PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); PrevNode = StackNode; @@ -1375,8 +1393,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, set_union(CallerEdgeContextIds, Edge->ContextIds); } // Node can have more context ids than callers if some contexts terminate at - // node and some are longer. - assert(NodeContextIds == CallerEdgeContextIds || + // node and some are longer. If we are allowing recursive callsites but + // haven't disabled recursive contexts, this will be violated for + // incompletely cloned recursive cycles, so skip the checking in that case. + assert((AllowRecursiveCallsites && AllowRecursiveContexts) || + NodeContextIds == CallerEdgeContextIds || set_is_subset(CallerEdgeContextIds, NodeContextIds)); } if (Node->CalleeEdges.size()) { @@ -3370,6 +3391,21 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( assert(Node->AllocTypes != (uint8_t)AllocationType::None); + DenseSet<uint32_t> RecursiveContextIds; + // If we are allowing recursive callsites, but have also disabled recursive + // contexts, look for context ids that show up in multiple caller edges. + if (AllowRecursiveCallsites && !AllowRecursiveContexts) { + DenseSet<uint32_t> AllCallerContextIds; + for (auto &CE : Node->CallerEdges) { + // Resize to the largest set of caller context ids, since we know the + // final set will be at least that large. + AllCallerContextIds.reserve(CE->getContextIds().size()); + for (auto Id : CE->getContextIds()) + if (!AllCallerContextIds.insert(Id).second) + RecursiveContextIds.insert(Id); + } + } + // Iterate until we find no more opportunities for disambiguating the alloc // types via cloning. In most cases this loop will terminate once the Node // has a single allocation type, in which case no more cloning is needed. @@ -3394,6 +3430,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( // allocation. 
auto CallerEdgeContextsForAlloc = set_intersection(CallerEdge->getContextIds(), AllocContextIds); + if (!RecursiveContextIds.empty()) + CallerEdgeContextsForAlloc = + set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds); if (CallerEdgeContextsForAlloc.empty()) { ++EI; continue; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index b40ab35..67585e9 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -129,7 +129,7 @@ static cl::opt<bool> PrintModuleBeforeOptimizations( static cl::opt<bool> AlwaysInlineDeviceFunctions( "openmp-opt-inline-device", - cl::desc("Inline all applicible functions on the device."), cl::Hidden, + cl::desc("Inline all applicable functions on the device."), cl::Hidden, cl::init(false)); static cl::opt<bool> diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 603beb3..b978c54 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -162,7 +162,7 @@ static cl::opt<bool> ProfileSampleBlockAccurate( static cl::opt<bool> ProfileAccurateForSymsInList( "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " - "be accurate. It may be overriden by profile-sample-accurate. ")); + "be accurate. It may be overridden by profile-sample-accurate. ")); static cl::opt<bool> ProfileMergeInlinee( "sample-profile-merge-inlinee", cl::Hidden, cl::init(true), @@ -193,9 +193,10 @@ static cl::opt<bool> ProfileSizeInline( // and inline the hot functions (that are skipped in this pass). static cl::opt<bool> DisableSampleLoaderInlining( "disable-sample-loader-inlining", cl::Hidden, cl::init(false), - cl::desc("If true, artifically skip inline transformation in sample-loader " - "pass, and merge (or scale) profiles (as configured by " - "--sample-profile-merge-inlinee).")); + cl::desc( + "If true, artificially skip inline transformation in sample-loader " + "pass, and merge (or scale) profiles (as configured by " + "--sample-profile-merge-inlinee).")); namespace llvm { cl::opt<bool> @@ -255,7 +256,7 @@ static cl::opt<unsigned> PrecentMismatchForStalenessError( static cl::opt<bool> CallsitePrioritizedInline( "sample-profile-prioritized-inline", cl::Hidden, - cl::desc("Use call site prioritized inlining for sample profile loader." + cl::desc("Use call site prioritized inlining for sample profile loader. 
" "Currently only CSSPGO is supported.")); static cl::opt<bool> UsePreInlinerDecision( diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 7a184a1..73876d0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1326,6 +1326,18 @@ Instruction *InstCombinerImpl::foldAddLikeCommutative(Value *LHS, Value *RHS, R->setHasNoUnsignedWrap(NUWOut); return R; } + + // ((X s/ C1) << C2) + X => X s% -C1 where -C1 is 1 << C2 + const APInt *C1, *C2; + if (match(LHS, m_Shl(m_SDiv(m_Specific(RHS), m_APInt(C1)), m_APInt(C2)))) { + APInt One(C2->getBitWidth(), 1); + APInt MinusC1 = -(*C1); + if (MinusC1 == (One << *C2)) { + Constant *NewRHS = ConstantInt::get(RHS->getType(), MinusC1); + return BinaryOperator::CreateSRem(RHS, NewRHS); + } + } + return nullptr; } @@ -1623,17 +1635,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { // X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1) if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V); - // ((X s/ C1) << C2) + X => X s% -C1 where -C1 is 1 << C2 - const APInt *C1, *C2; - if (match(LHS, m_Shl(m_SDiv(m_Specific(RHS), m_APInt(C1)), m_APInt(C2)))) { - APInt one(C2->getBitWidth(), 1); - APInt minusC1 = -(*C1); - if (minusC1 == (one << *C2)) { - Constant *NewRHS = ConstantInt::get(RHS->getType(), minusC1); - return BinaryOperator::CreateSRem(RHS, NewRHS); - } - } - + const APInt *C1; // (A & 2^C1) + A => A & (2^C1 - 1) iff bit C1 in A is a sign bit if (match(&I, m_c_Add(m_And(m_Value(A), m_APInt(C1)), m_Deferred(A))) && C1->isPowerOf2() && (ComputeNumSignBits(A) > C1->countl_zero())) { @@ -2845,12 +2847,11 @@ Instruction *InstCombinerImpl::hoistFNegAboveFMulFDiv(Value *FNegOp, // Make sure to preserve flags and metadata on the call. if (II->getIntrinsicID() == Intrinsic::ldexp) { FastMathFlags FMF = FMFSource.getFastMathFlags() | II->getFastMathFlags(); - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); - Builder.setFastMathFlags(FMF); - - CallInst *New = Builder.CreateCall( - II->getCalledFunction(), - {Builder.CreateFNeg(II->getArgOperand(0)), II->getArgOperand(1)}); + CallInst *New = + Builder.CreateCall(II->getCalledFunction(), + {Builder.CreateFNegFMF(II->getArgOperand(0), FMF), + II->getArgOperand(1)}); + New->setFastMathFlags(FMF); New->copyMetadata(*II); return New; } @@ -2932,12 +2933,8 @@ Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) { // flags the copysign doesn't also have. FastMathFlags FMF = I.getFastMathFlags(); FMF &= cast<FPMathOperator>(OneUse)->getFastMathFlags(); - - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); - Builder.setFastMathFlags(FMF); - - Value *NegY = Builder.CreateFNeg(Y); - Value *NewCopySign = Builder.CreateCopySign(X, NegY); + Value *NegY = Builder.CreateFNegFMF(Y, FMF); + Value *NewCopySign = Builder.CreateCopySign(X, NegY, FMF); return replaceInstUsesWith(I, NewCopySign); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index e576eea4..f82a557 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -39,11 +39,11 @@ static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS, /// This is the complement of getFCmpCode, which turns an opcode and two /// operands into either a FCmp instruction, or a true/false constant. 
static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS, - InstCombiner::BuilderTy &Builder) { + InstCombiner::BuilderTy &Builder, FMFSource FMF) { FCmpInst::Predicate NewPred; if (Constant *TorF = getPredForFCmpCode(Code, LHS->getType(), NewPred)) return TorF; - return Builder.CreateFCmp(NewPred, LHS, RHS); + return Builder.CreateFCmpFMF(NewPred, LHS, RHS, FMF); } /// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise @@ -513,7 +513,8 @@ static Value *foldLogOpOfMaskedICmpsAsymmetric( /// into a single (icmp(A & X) ==/!= Y). static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, bool IsLogical, - InstCombiner::BuilderTy &Builder) { + InstCombiner::BuilderTy &Builder, + const SimplifyQuery &Q) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); std::optional<std::pair<unsigned, unsigned>> MaskPair = @@ -586,93 +587,107 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, return Builder.CreateICmp(NewCC, NewAnd2, A); } - // Remaining cases assume at least that B and D are constant, and depend on - // their actual values. This isn't strictly necessary, just a "handle the - // easy cases for now" decision. const APInt *ConstB, *ConstD; - if (!match(B, m_APInt(ConstB)) || !match(D, m_APInt(ConstD))) - return nullptr; - - if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) { - // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and - // (icmp ne (A & B), B) & (icmp ne (A & D), D) - // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0) - // Only valid if one of the masks is a superset of the other (check "B&D" is - // the same as either B or D). - APInt NewMask = *ConstB & *ConstD; - if (NewMask == *ConstB) - return LHS; - else if (NewMask == *ConstD) - return RHS; - } - - if (Mask & AMask_NotAllOnes) { - // (icmp ne (A & B), B) & (icmp ne (A & D), D) - // -> (icmp ne (A & B), A) or (icmp ne (A & D), A) - // Only valid if one of the masks is a superset of the other (check "B|D" is - // the same as either B or D). - APInt NewMask = *ConstB | *ConstD; - if (NewMask == *ConstB) - return LHS; - else if (NewMask == *ConstD) - return RHS; - } - - if (Mask & (BMask_Mixed | BMask_NotMixed)) { - // Mixed: - // (icmp eq (A & B), C) & (icmp eq (A & D), E) - // We already know that B & C == C && D & E == E. - // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of - // C and E, which are shared by both the mask B and the mask D, don't - // contradict, then we can transform to - // -> (icmp eq (A & (B|D)), (C|E)) - // Currently, we only handle the case of B, C, D, and E being constant. - // We can't simply use C and E because we might actually handle - // (icmp ne (A & B), B) & (icmp eq (A & D), D) - // with B and D, having a single bit set. - - // NotMixed: - // (icmp ne (A & B), C) & (icmp ne (A & D), E) - // -> (icmp ne (A & (B & D)), (C & E)) - // Check the intersection (B & D) for inequality. - // Assume that (B & D) == B || (B & D) == D, i.e B/D is a subset of D/B - // and (B & D) & (C ^ E) == 0, bits of C and E, which are shared by both the - // B and the D, don't contradict. - // Note that we can assume (~B & C) == 0 && (~D & E) == 0, previous - // operation should delete these icmps if it hadn't been met. 
- - const APInt *OldConstC, *OldConstE; - if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE))) - return nullptr; - - auto FoldBMixed = [&](ICmpInst::Predicate CC, bool IsNot) -> Value * { - CC = IsNot ? CmpInst::getInversePredicate(CC) : CC; - const APInt ConstC = PredL != CC ? *ConstB ^ *OldConstC : *OldConstC; - const APInt ConstE = PredR != CC ? *ConstD ^ *OldConstE : *OldConstE; + if (match(B, m_APInt(ConstB)) && match(D, m_APInt(ConstD))) { + if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) { + // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0) + // Only valid if one of the masks is a superset of the other (check "B&D" + // is the same as either B or D). + APInt NewMask = *ConstB & *ConstD; + if (NewMask == *ConstB) + return LHS; + if (NewMask == *ConstD) + return RHS; + } - if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue()) - return IsNot ? nullptr : ConstantInt::get(LHS->getType(), !IsAnd); + if (Mask & AMask_NotAllOnes) { + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // -> (icmp ne (A & B), A) or (icmp ne (A & D), A) + // Only valid if one of the masks is a superset of the other (check "B|D" + // is the same as either B or D). + APInt NewMask = *ConstB | *ConstD; + if (NewMask == *ConstB) + return LHS; + if (NewMask == *ConstD) + return RHS; + } - if (IsNot && !ConstB->isSubsetOf(*ConstD) && !ConstD->isSubsetOf(*ConstB)) + if (Mask & (BMask_Mixed | BMask_NotMixed)) { + // Mixed: + // (icmp eq (A & B), C) & (icmp eq (A & D), E) + // We already know that B & C == C && D & E == E. + // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of + // C and E, which are shared by both the mask B and the mask D, don't + // contradict, then we can transform to + // -> (icmp eq (A & (B|D)), (C|E)) + // Currently, we only handle the case of B, C, D, and E being constant. + // We can't simply use C and E because we might actually handle + // (icmp ne (A & B), B) & (icmp eq (A & D), D) + // with B and D, having a single bit set. + + // NotMixed: + // (icmp ne (A & B), C) & (icmp ne (A & D), E) + // -> (icmp ne (A & (B & D)), (C & E)) + // Check the intersection (B & D) for inequality. + // Assume that (B & D) == B || (B & D) == D, i.e B/D is a subset of D/B + // and (B & D) & (C ^ E) == 0, bits of C and E, which are shared by both + // the B and the D, don't contradict. Note that we can assume (~B & C) == + // 0 && (~D & E) == 0, previous operation should delete these icmps if it + // hadn't been met. + + const APInt *OldConstC, *OldConstE; + if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE))) return nullptr; - APInt BD, CE; - if (IsNot) { - BD = *ConstB & *ConstD; - CE = ConstC & ConstE; - } else { - BD = *ConstB | *ConstD; - CE = ConstC | ConstE; - } - Value *NewAnd = Builder.CreateAnd(A, BD); - Value *CEVal = ConstantInt::get(A->getType(), CE); - return Builder.CreateICmp(CC, CEVal, NewAnd); - }; + auto FoldBMixed = [&](ICmpInst::Predicate CC, bool IsNot) -> Value * { + CC = IsNot ? CmpInst::getInversePredicate(CC) : CC; + const APInt ConstC = PredL != CC ? *ConstB ^ *OldConstC : *OldConstC; + const APInt ConstE = PredR != CC ? *ConstD ^ *OldConstE : *OldConstE; + + if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue()) + return IsNot ? 
nullptr : ConstantInt::get(LHS->getType(), !IsAnd); + + if (IsNot && !ConstB->isSubsetOf(*ConstD) && + !ConstD->isSubsetOf(*ConstB)) + return nullptr; + + APInt BD, CE; + if (IsNot) { + BD = *ConstB & *ConstD; + CE = ConstC & ConstE; + } else { + BD = *ConstB | *ConstD; + CE = ConstC | ConstE; + } + Value *NewAnd = Builder.CreateAnd(A, BD); + Value *CEVal = ConstantInt::get(A->getType(), CE); + return Builder.CreateICmp(CC, CEVal, NewAnd); + }; + + if (Mask & BMask_Mixed) + return FoldBMixed(NewCC, false); + if (Mask & BMask_NotMixed) // can be else also + return FoldBMixed(NewCC, true); + } + } - if (Mask & BMask_Mixed) - return FoldBMixed(NewCC, false); - if (Mask & BMask_NotMixed) // can be else also - return FoldBMixed(NewCC, true); + // (icmp eq (A & B), 0) | (icmp eq (A & D), 0) + // -> (icmp ne (A & (B|D)), (B|D)) + // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) + // -> (icmp eq (A & (B|D)), (B|D)) + // iff B and D is known to be a power of two + if (Mask & Mask_NotAllZeros && + isKnownToBeAPowerOfTwo(B, /*OrZero=*/false, /*Depth=*/0, Q) && + isKnownToBeAPowerOfTwo(D, /*OrZero=*/false, /*Depth=*/0, Q)) { + // If this is a logical and/or, then we must prevent propagation of a + // poison value from the RHS by inserting freeze. + if (IsLogical) + D = Builder.CreateFreeze(D); + Value *Mask = Builder.CreateOr(B, D); + Value *Masked = Builder.CreateAnd(A, Mask); + return Builder.CreateICmp(NewCC, Masked, Mask); } return nullptr; } @@ -775,46 +790,6 @@ foldAndOrOfICmpsWithPow2AndWithZero(InstCombiner::BuilderTy &Builder, return Builder.CreateICmp(Pred, And, Op); } -// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) -// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) -Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, - ICmpInst *RHS, - Instruction *CxtI, - bool IsAnd, - bool IsLogical) { - CmpInst::Predicate Pred = IsAnd ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; - if (LHS->getPredicate() != Pred || RHS->getPredicate() != Pred) - return nullptr; - - if (!match(LHS->getOperand(1), m_Zero()) || - !match(RHS->getOperand(1), m_Zero())) - return nullptr; - - Value *L1, *L2, *R1, *R2; - if (match(LHS->getOperand(0), m_And(m_Value(L1), m_Value(L2))) && - match(RHS->getOperand(0), m_And(m_Value(R1), m_Value(R2)))) { - if (L1 == R2 || L2 == R2) - std::swap(R1, R2); - if (L2 == R1) - std::swap(L1, L2); - - if (L1 == R1 && - isKnownToBeAPowerOfTwo(L2, false, 0, CxtI) && - isKnownToBeAPowerOfTwo(R2, false, 0, CxtI)) { - // If this is a logical and/or, then we must prevent propagation of a - // poison value from the RHS by inserting freeze. - if (IsLogical) - R2 = Builder.CreateFreeze(R2); - Value *Mask = Builder.CreateOr(L2, R2); - Value *Masked = Builder.CreateAnd(L1, Mask); - auto NewPred = IsAnd ? 
CmpInst::ICMP_EQ : CmpInst::ICMP_NE; - return Builder.CreateICmp(NewPred, Masked, Mask); - } - } - - return nullptr; -} - /// General pattern: /// X & Y /// @@ -1429,12 +1404,8 @@ static Value *matchIsFiniteTest(InstCombiner::BuilderTy &Builder, FCmpInst *LHS, !matchUnorderedInfCompare(PredR, RHS0, RHS1)) return nullptr; - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - FastMathFlags FMF = LHS->getFastMathFlags(); - FMF &= RHS->getFastMathFlags(); - Builder.setFastMathFlags(FMF); - - return Builder.CreateFCmp(FCmpInst::getOrderedPredicate(PredR), RHS0, RHS1); + return Builder.CreateFCmpFMF(FCmpInst::getOrderedPredicate(PredR), RHS0, RHS1, + FMFSource::intersect(LHS, RHS)); } Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, @@ -1470,12 +1441,8 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, // Intersect the fast math flags. // TODO: We can union the fast math flags unless this is a logical select. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - FastMathFlags FMF = LHS->getFastMathFlags(); - FMF &= RHS->getFastMathFlags(); - Builder.setFastMathFlags(FMF); - - return getFCmpValue(NewPred, LHS0, LHS1, Builder); + return getFCmpValue(NewPred, LHS0, LHS1, Builder, + FMFSource::intersect(LHS, RHS)); } // This transform is not valid for a logical select. @@ -1492,10 +1459,8 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, // Ignore the constants because they are obviously not NANs: // (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y) // (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y) - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(LHS->getFastMathFlags() & - RHS->getFastMathFlags()); - return Builder.CreateFCmp(PredL, LHS0, RHS0); + return Builder.CreateFCmpFMF(PredL, LHS0, RHS0, + FMFSource::intersect(LHS, RHS)); } } @@ -1557,15 +1522,14 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, std::swap(PredL, PredR); } if (IsLessThanOrLessEqual(IsAnd ? PredL : PredR)) { - BuilderTy::FastMathFlagGuard Guard(Builder); FastMathFlags NewFlag = LHS->getFastMathFlags(); if (!IsLogicalSelect) NewFlag |= RHS->getFastMathFlags(); - Builder.setFastMathFlags(NewFlag); - Value *FAbs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, LHS0); - return Builder.CreateFCmp(PredL, FAbs, - ConstantFP::get(LHS0->getType(), *LHSC)); + Value *FAbs = + Builder.CreateUnaryIntrinsic(Intrinsic::fabs, LHS0, NewFlag); + return Builder.CreateFCmpFMF( + PredL, FAbs, ConstantFP::get(LHS0->getType(), *LHSC), NewFlag); } } @@ -2372,6 +2336,26 @@ static Value *simplifyAndOrWithOpReplaced(Value *V, Value *Op, Value *RepOp, return IC.Builder.CreateBinOp(I->getOpcode(), NewOp0, NewOp1); } +/// Reassociate and/or expressions to see if we can fold the inner and/or ops. +/// TODO: Make this recursive; it's a little tricky because an arbitrary +/// number of and/or instructions might have to be created. +Value *InstCombinerImpl::reassociateBooleanAndOr(Value *LHS, Value *X, Value *Y, + Instruction &I, bool IsAnd, + bool RHSIsLogical) { + Instruction::BinaryOps Opcode = IsAnd ? Instruction::And : Instruction::Or; + // LHS bop (X lop Y) --> (LHS bop X) lop Y + // LHS bop (X bop Y) --> (LHS bop X) bop Y + if (Value *Res = foldBooleanAndOr(LHS, X, I, IsAnd, /*IsLogical=*/false)) + return RHSIsLogical ? 
Builder.CreateLogicalOp(Opcode, Res, Y) + : Builder.CreateBinOp(Opcode, Res, Y); + // LHS bop (X bop Y) --> X bop (LHS bop Y) + // LHS bop (X lop Y) --> X lop (LHS bop Y) + if (Value *Res = foldBooleanAndOr(LHS, Y, I, IsAnd, /*IsLogical=*/false)) + return RHSIsLogical ? Builder.CreateLogicalOp(Opcode, X, Res) + : Builder.CreateBinOp(Opcode, X, Res); + return nullptr; +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. @@ -2755,31 +2739,17 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/true, /*IsLogical=*/false)) return replaceInstUsesWith(I, Res); - // TODO: Make this recursive; it's a little tricky because an arbitrary - // number of 'and' instructions might have to be created. if (match(Op1, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { bool IsLogical = isa<SelectInst>(Op1); - // Op0 & (X && Y) --> (Op0 && X) && Y - if (Value *Res = foldBooleanAndOr(Op0, X, I, /* IsAnd */ true, IsLogical)) - return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(Res, Y) - : Builder.CreateAnd(Res, Y)); - // Op0 & (X && Y) --> X && (Op0 & Y) - if (Value *Res = foldBooleanAndOr(Op0, Y, I, /* IsAnd */ true, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(X, Res) - : Builder.CreateAnd(X, Res)); + if (auto *V = reassociateBooleanAndOr(Op0, X, Y, I, /*IsAnd=*/true, + /*RHSIsLogical=*/IsLogical)) + return replaceInstUsesWith(I, V); } if (match(Op0, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { bool IsLogical = isa<SelectInst>(Op0); - // (X && Y) & Op1 --> (X && Op1) && Y - if (Value *Res = foldBooleanAndOr(X, Op1, I, /* IsAnd */ true, IsLogical)) - return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(Res, Y) - : Builder.CreateAnd(Res, Y)); - // (X && Y) & Op1 --> X && (Y & Op1) - if (Value *Res = foldBooleanAndOr(Y, Op1, I, /* IsAnd */ true, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(X, Res) - : Builder.CreateAnd(X, Res)); + if (auto *V = reassociateBooleanAndOr(Op1, X, Y, I, /*IsAnd=*/true, + /*RHSIsLogical=*/IsLogical)) + return replaceInstUsesWith(I, V); } if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) @@ -3330,12 +3300,6 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsLogical) { const SimplifyQuery Q = SQ.getWithInstruction(&I); - // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) - // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) - // if K1 and K2 are a one-bit mask. 
- if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, &I, IsAnd, IsLogical)) - return V; - ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0); Value *LHS1 = LHS->getOperand(1), *RHS1 = RHS->getOperand(1); @@ -3362,7 +3326,7 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, // handle (roughly): // (icmp ne (A & B), C) | (icmp ne (A & D), E) // (icmp eq (A & B), C) & (icmp eq (A & D), E) - if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder)) + if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder, Q)) return V; if (Value *V = @@ -3840,31 +3804,17 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/false, /*IsLogical=*/false)) return replaceInstUsesWith(I, Res); - // TODO: Make this recursive; it's a little tricky because an arbitrary - // number of 'or' instructions might have to be created. if (match(Op1, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) { bool IsLogical = isa<SelectInst>(Op1); - // Op0 | (X || Y) --> (Op0 || X) || Y - if (Value *Res = foldBooleanAndOr(Op0, X, I, /* IsAnd */ false, IsLogical)) - return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(Res, Y) - : Builder.CreateOr(Res, Y)); - // Op0 | (X || Y) --> X || (Op0 | Y) - if (Value *Res = foldBooleanAndOr(Op0, Y, I, /* IsAnd */ false, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(X, Res) - : Builder.CreateOr(X, Res)); + if (auto *V = reassociateBooleanAndOr(Op0, X, Y, I, /*IsAnd=*/false, + /*RHSIsLogical=*/IsLogical)) + return replaceInstUsesWith(I, V); } if (match(Op0, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) { bool IsLogical = isa<SelectInst>(Op0); - // (X || Y) | Op1 --> (X || Op1) || Y - if (Value *Res = foldBooleanAndOr(X, Op1, I, /* IsAnd */ false, IsLogical)) - return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(Res, Y) - : Builder.CreateOr(Res, Y)); - // (X || Y) | Op1 --> X || (Y | Op1) - if (Value *Res = foldBooleanAndOr(Y, Op1, I, /* IsAnd */ false, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(X, Res) - : Builder.CreateOr(X, Res)); + if (auto *V = reassociateBooleanAndOr(Op1, X, Y, I, /*IsAnd=*/false, + /*RHSIsLogical=*/IsLogical)) + return replaceInstUsesWith(I, V); } if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) @@ -4981,8 +4931,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { // (A & B) ^ (A | C) --> A ? ~B : C -- There are 4 commuted variants. 
if (I.getType()->isIntOrIntVectorTy(1) && - match(Op0, m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B)))) && - match(Op1, m_OneUse(m_LogicalOr(m_Value(C), m_Value(D))))) { + match(&I, m_c_Xor(m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B))), + m_OneUse(m_LogicalOr(m_Value(C), m_Value(D)))))) { bool NeedFreeze = isa<SelectInst>(Op0) && isa<SelectInst>(Op1) && B == D; if (B == C || B == D) std::swap(A, B); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index fd38738..c55c40c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -839,6 +839,35 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(), WO->getRHS(), *WO, OperationResult, OverflowResult)) return createOverflowTuple(WO, OperationResult, OverflowResult); + + // See whether we can optimize the overflow check with assumption information. + for (User *U : WO->users()) { + if (!match(U, m_ExtractValue<1>(m_Value()))) + continue; + + for (auto &AssumeVH : AC.assumptionsFor(U)) { + if (!AssumeVH) + continue; + CallInst *I = cast<CallInst>(AssumeVH); + if (!match(I->getArgOperand(0), m_Not(m_Specific(U)))) + continue; + if (!isValidAssumeForContext(I, II, /*DT=*/nullptr, + /*AllowEphemerals=*/true)) + continue; + Value *Result = + Builder.CreateBinOp(WO->getBinaryOp(), WO->getLHS(), WO->getRHS()); + Result->takeName(WO); + if (auto *Inst = dyn_cast<Instruction>(Result)) { + if (WO->isSigned()) + Inst->setHasNoSignedWrap(); + else + Inst->setHasNoUnsignedWrap(); + } + return createOverflowTuple(WO, Result, + ConstantInt::getFalse(U->getType())); + } + } + return nullptr; } @@ -2644,8 +2673,11 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Propagate sign argument through nested calls: // copysign Mag, (copysign ?, X) --> copysign Mag, X Value *X; - if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X)))) - return replaceOperand(*II, 1, X); + if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X)))) { + Value *CopySign = + Builder.CreateCopySign(Mag, X, FMFSource::intersect(II, Sign)); + return replaceInstUsesWith(*II, CopySign); + } // Clear sign-bit of constant magnitude: // copysign -MagC, X --> copysign MagC, X diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 0b93799..4ec1af3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1852,15 +1852,13 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { Value *X; Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0)); if (Op && Op->hasOneUse()) { - IRBuilder<>::FastMathFlagGuard FMFG(Builder); FastMathFlags FMF = FPT.getFastMathFlags(); if (auto *FPMO = dyn_cast<FPMathOperator>(Op)) FMF &= FPMO->getFastMathFlags(); - Builder.setFastMathFlags(FMF); if (match(Op, m_FNeg(m_Value(X)))) { - Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty); - Value *Neg = Builder.CreateFNeg(InnerTrunc); + Value *InnerTrunc = Builder.CreateFPTruncFMF(X, Ty, FMF); + Value *Neg = Builder.CreateFNegFMF(InnerTrunc, FMF); return replaceInstUsesWith(FPT, Neg); } @@ -1870,15 +1868,17 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { if (match(Op, m_Select(m_Value(Cond), m_FPExt(m_Value(X)), m_Value(Y))) && X->getType() == Ty) { // fptrunc (select Cond, 
(fpext X), Y --> select Cond, X, (fptrunc Y) - Value *NarrowY = Builder.CreateFPTrunc(Y, Ty); - Value *Sel = Builder.CreateSelect(Cond, X, NarrowY, "narrow.sel", Op); + Value *NarrowY = Builder.CreateFPTruncFMF(Y, Ty, FMF); + Value *Sel = + Builder.CreateSelectFMF(Cond, X, NarrowY, FMF, "narrow.sel", Op); return replaceInstUsesWith(FPT, Sel); } if (match(Op, m_Select(m_Value(Cond), m_Value(Y), m_FPExt(m_Value(X)))) && X->getType() == Ty) { // fptrunc (select Cond, Y, (fpext X) --> select Cond, (fptrunc Y), X - Value *NarrowY = Builder.CreateFPTrunc(Y, Ty); - Value *Sel = Builder.CreateSelect(Cond, NarrowY, X, "narrow.sel", Op); + Value *NarrowY = Builder.CreateFPTruncFMF(Y, Ty, FMF); + Value *Sel = + Builder.CreateSelectFMF(Cond, NarrowY, X, FMF, "narrow.sel", Op); return replaceInstUsesWith(FPT, Sel); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index d6fdade..2e45725 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -747,6 +747,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ConstantExpr::getPointerBitCastOrAddrSpaceCast( cast<Constant>(RHS), Base->getType())); } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) { + GEPNoWrapFlags NW = GEPLHS->getNoWrapFlags() & GEPRHS->getNoWrapFlags(); + // If the base pointers are different, but the indices are the same, just // compare the base pointer. if (PtrBase != GEPRHS->getOperand(0)) { @@ -764,7 +766,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // If all indices are the same, just compare the base pointers. Type *BaseType = GEPLHS->getOperand(0)->getType(); - if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType()) + if (IndicesTheSame && + CmpInst::makeCmpResultType(BaseType) == I.getType() && CanFold(NW)) return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0)); // If we're comparing GEPs with two base pointers that only differ in type @@ -804,7 +807,6 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, return transformToIndexedCompare(GEPLHS, RHS, Cond, DL, *this); } - GEPNoWrapFlags NW = GEPLHS->getNoWrapFlags() & GEPRHS->getNoWrapFlags(); if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands() && GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType()) { // If the GEPs only differ by one index, compare it. @@ -2483,9 +2485,8 @@ Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp, // icmp ule i64 (shl X, 32), 8589934592 -> // icmp ule i32 (trunc X, i32), 2 -> // icmp ult i32 (trunc X, i32), 3 - if (auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant( - Pred, ConstantInt::get(ShType->getContext(), C))) { + if (auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant( + Pred, ConstantInt::get(ShType->getContext(), C))) { CmpPred = FlippedStrictness->first; RHSC = cast<ConstantInt>(FlippedStrictness->second)->getValue(); } @@ -3089,12 +3090,12 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp, unsigned BW = C.getBitWidth(); std::bitset<4> Table; auto ComputeTable = [&](bool Op0Val, bool Op1Val) { - int Res = 0; + APInt Res(BW, 0); if (Op0Val) - Res += isa<ZExtInst>(Ext0) ? 1 : -1; + Res += APInt(BW, isa<ZExtInst>(Ext0) ? 1 : -1, /*isSigned=*/true); if (Op1Val) - Res += isa<ZExtInst>(Ext1) ? 
1 : -1; - return ICmpInst::compare(APInt(BW, Res, true), C, Pred); + Res += APInt(BW, isa<ZExtInst>(Ext1) ? 1 : -1, /*isSigned=*/true); + return ICmpInst::compare(Res, C, Pred); }; Table[0] = ComputeTable(false, false); @@ -3278,8 +3279,7 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) { // x sgt C-1 <--> x sge C <--> not(x slt C) auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant( - PredB, cast<Constant>(RHS2)); + getFlippedStrictnessPredicateAndConstant(PredB, cast<Constant>(RHS2)); if (!FlippedStrictness) return false; assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && @@ -6906,79 +6906,6 @@ Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) { return nullptr; } -std::optional<std::pair<CmpPredicate, Constant *>> -InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred, - Constant *C) { - assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && - "Only for relational integer predicates."); - - Type *Type = C->getType(); - bool IsSigned = ICmpInst::isSigned(Pred); - - CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred); - bool WillIncrement = - UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT; - - // Check if the constant operand can be safely incremented/decremented - // without overflowing/underflowing. - auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) { - return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned); - }; - - Constant *SafeReplacementConstant = nullptr; - if (auto *CI = dyn_cast<ConstantInt>(C)) { - // Bail out if the constant can't be safely incremented/decremented. - if (!ConstantIsOk(CI)) - return std::nullopt; - } else if (auto *FVTy = dyn_cast<FixedVectorType>(Type)) { - unsigned NumElts = FVTy->getNumElements(); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = C->getAggregateElement(i); - if (!Elt) - return std::nullopt; - - if (isa<UndefValue>(Elt)) - continue; - - // Bail out if we can't determine if this constant is min/max or if we - // know that this constant is min/max. - auto *CI = dyn_cast<ConstantInt>(Elt); - if (!CI || !ConstantIsOk(CI)) - return std::nullopt; - - if (!SafeReplacementConstant) - SafeReplacementConstant = CI; - } - } else if (isa<VectorType>(C->getType())) { - // Handle scalable splat - Value *SplatC = C->getSplatValue(); - auto *CI = dyn_cast_or_null<ConstantInt>(SplatC); - // Bail out if the constant can't be safely incremented/decremented. - if (!CI || !ConstantIsOk(CI)) - return std::nullopt; - } else { - // ConstantExpr? - return std::nullopt; - } - - // It may not be safe to change a compare predicate in the presence of - // undefined elements, so replace those elements with the first safe constant - // that we found. - // TODO: in case of poison, it is safe; let's replace undefs only. - if (C->containsUndefOrPoisonElement()) { - assert(SafeReplacementConstant && "Replacement constant not set"); - C = Constant::replaceUndefsWith(C, SafeReplacementConstant); - } - - CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred); - - // Increment or decrement the constant. - Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 
1 : -1, true); - Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne); - - return std::make_pair(NewPred, NewC); -} - /// If we have an icmp le or icmp ge instruction with a constant operand, turn /// it into the appropriate icmp lt or icmp gt instruction. This transform /// allows them to be folded in visitICmpInst. @@ -6994,8 +6921,7 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { if (!Op1C) return nullptr; - auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, Op1C); + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C); if (!FlippedStrictness) return nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 3a074ee..f6992119 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -429,12 +429,12 @@ private: Value *foldBooleanAndOr(Value *LHS, Value *RHS, Instruction &I, bool IsAnd, bool IsLogical); + Value *reassociateBooleanAndOr(Value *LHS, Value *X, Value *Y, Instruction &I, + bool IsAnd, bool RHSIsLogical); + Instruction * canonicalizeConditionalNegationViaMathToSelect(BinaryOperator &i); - Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS, - Instruction *CxtI, bool IsAnd, - bool IsLogical = false); Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D, bool InvertFalseVal = false); Value *getSelectCondition(Value *A, Value *B, bool ABIsTheSame); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index f85a3c9..0c34cf0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -121,21 +121,17 @@ static Value *foldMulSelectToNegate(BinaryOperator &I, // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0), m_SpecificFP(-1.0))), - m_Value(OtherOp)))) { - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); - Builder.setFastMathFlags(I.getFastMathFlags()); - return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp)); - } + m_Value(OtherOp)))) + return Builder.CreateSelectFMF(Cond, OtherOp, + Builder.CreateFNegFMF(OtherOp, &I), &I); // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0), m_SpecificFP(1.0))), - m_Value(OtherOp)))) { - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); - Builder.setFastMathFlags(I.getFastMathFlags()); - return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp); - } + m_Value(OtherOp)))) + return Builder.CreateSelectFMF(Cond, Builder.CreateFNegFMF(OtherOp, &I), + OtherOp, &I); return nullptr; } @@ -590,11 +586,9 @@ Instruction *InstCombinerImpl::foldFPSignBitOps(BinaryOperator &I) { // fabs(X) / fabs(Y) --> fabs(X / Y) if (match(Op0, m_FAbs(m_Value(X))) && match(Op1, m_FAbs(m_Value(Y))) && (Op0->hasOneUse() || Op1->hasOneUse())) { - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); - Builder.setFastMathFlags(I.getFastMathFlags()); - Value *XY = Builder.CreateBinOp(Opcode, X, Y); - Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, XY); - Fabs->takeName(&I); + Value *XY = Builder.CreateBinOpFMF(Opcode, X, Y, &I); + Value *Fabs = + 
Builder.CreateUnaryIntrinsic(Intrinsic::fabs, XY, &I, I.getName()); return replaceInstUsesWith(I, Fabs); } @@ -685,8 +679,6 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { match(Op0, m_AllowReassoc(m_BinOp(Op0BinOp)))) { // Everything in this scope folds I with Op0, intersecting their FMF. FastMathFlags FMF = I.getFastMathFlags() & Op0BinOp->getFastMathFlags(); - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); - Builder.setFastMathFlags(FMF); Constant *C1; if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) { // (C1 / X) * C --> (C * C1) / X @@ -718,7 +710,7 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { // (X + C1) * C --> (X * C) + (C * C1) if (Constant *CC1 = ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL)) { - Value *XC = Builder.CreateFMul(X, C); + Value *XC = Builder.CreateFMulFMF(X, C, FMF); return BinaryOperator::CreateFAddFMF(XC, CC1, FMF); } } @@ -726,7 +718,7 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { // (C1 - X) * C --> (C * C1) - (X * C) if (Constant *CC1 = ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL)) { - Value *XC = Builder.CreateFMul(X, C); + Value *XC = Builder.CreateFMulFMF(X, C, FMF); return BinaryOperator::CreateFSubFMF(CC1, XC, FMF); } } @@ -740,9 +732,7 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { FastMathFlags FMF = I.getFastMathFlags() & DivOp->getFastMathFlags(); if (FMF.allowReassoc()) { // Sink division: (X / Y) * Z --> (X * Z) / Y - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); - Builder.setFastMathFlags(FMF); - auto *NewFMul = Builder.CreateFMul(X, Z); + auto *NewFMul = Builder.CreateFMulFMF(X, Z, FMF); return BinaryOperator::CreateFDivFMF(NewFMul, Y, FMF); } } @@ -2066,14 +2056,18 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, bool ShiftByX = false; // If V is not nullptr, it will be matched using m_Specific. - auto MatchShiftOrMulXC = [](Value *Op, Value *&V, APInt &C) -> bool { + auto MatchShiftOrMulXC = [](Value *Op, Value *&V, APInt &C, + bool &PreserveNSW) -> bool { const APInt *Tmp = nullptr; if ((!V && match(Op, m_Mul(m_Value(V), m_APInt(Tmp)))) || (V && match(Op, m_Mul(m_Specific(V), m_APInt(Tmp))))) C = *Tmp; else if ((!V && match(Op, m_Shl(m_Value(V), m_APInt(Tmp)))) || - (V && match(Op, m_Shl(m_Specific(V), m_APInt(Tmp))))) + (V && match(Op, m_Shl(m_Specific(V), m_APInt(Tmp))))) { C = APInt(Tmp->getBitWidth(), 1) << *Tmp; + // We cannot preserve NSW when shifting by BW - 1. + PreserveNSW = Tmp->ult(Tmp->getBitWidth() - 1); + } if (Tmp != nullptr) return true; @@ -2095,7 +2089,9 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, return false; }; - if (MatchShiftOrMulXC(Op0, X, Y) && MatchShiftOrMulXC(Op1, X, Z)) { + bool Op0PreserveNSW = true, Op1PreserveNSW = true; + if (MatchShiftOrMulXC(Op0, X, Y, Op0PreserveNSW) && + MatchShiftOrMulXC(Op1, X, Z, Op1PreserveNSW)) { // pass } else if (MatchShiftCX(Op0, Y, X) && MatchShiftCX(Op1, Z, X)) { ShiftByX = true; @@ -2108,7 +2104,7 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, OverflowingBinaryOperator *BO0 = cast<OverflowingBinaryOperator>(Op0); // TODO: We may be able to deduce more about nsw/nuw of BO0/BO1 based on Y >= // Z or Z >= Y. - bool BO0HasNSW = BO0->hasNoSignedWrap(); + bool BO0HasNSW = Op0PreserveNSW && BO0->hasNoSignedWrap(); bool BO0HasNUW = BO0->hasNoUnsignedWrap(); bool BO0NoWrap = IsSRem ? 
BO0HasNSW : BO0HasNUW; @@ -2131,7 +2127,7 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, }; OverflowingBinaryOperator *BO1 = cast<OverflowingBinaryOperator>(Op1); - bool BO1HasNSW = BO1->hasNoSignedWrap(); + bool BO1HasNSW = Op1PreserveNSW && BO1->hasNoSignedWrap(); bool BO1HasNUW = BO1->hasNoUnsignedWrap(); bool BO1NoWrap = IsSRem ? BO1HasNSW : BO1HasNUW; // (rem (mul X, Y), (mul nuw/nsw X, Z)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 1fcf1c5..80308bf 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -765,30 +765,14 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); LoadInst *NewLI = new LoadInst(FirstLI->getType(), NewPN, "", IsVolatile, LoadAlignment); - - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_range, - LLVMContext::MD_invariant_load, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_nonnull, - LLVMContext::MD_align, - LLVMContext::MD_dereferenceable, - LLVMContext::MD_dereferenceable_or_null, - LLVMContext::MD_access_group, - LLVMContext::MD_noundef, - }; - - for (unsigned ID : KnownIDs) - NewLI->setMetadata(ID, FirstLI->getMetadata(ID)); + NewLI->copyMetadata(*FirstLI); // Add all operands to the new PHI and combine TBAA metadata. for (auto Incoming : drop_begin(zip(PN.blocks(), PN.incoming_values()))) { BasicBlock *BB = std::get<0>(Incoming); Value *V = std::get<1>(Incoming); LoadInst *LI = cast<LoadInst>(V); - combineMetadata(NewLI, LI, KnownIDs, true); + combineMetadataForCSE(NewLI, LI, true); Value *NewInVal = LI->getOperand(0); if (NewInVal != InVal) InVal = nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 3d251d6..1eca177 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1225,8 +1225,12 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, // zext/trunc) have one use (ending at the select), the cttz/ctlz result will // not be used if the input is zero. Relax to 'zero is poison' for that case. if (II->hasOneUse() && SelectArg->hasOneUse() && - !match(II->getArgOperand(1), m_One())) + !match(II->getArgOperand(1), m_One())) { II->setArgOperand(1, ConstantInt::getTrue(II->getContext())); + // noundef attribute on the intrinsic may no longer be valid. + II->dropUBImplyingAttrsAndMetadata(); + IC.addToWorklist(II); + } return nullptr; } @@ -1685,8 +1689,7 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, return nullptr; // Check the constant we'd have with flipped-strictness predicate. 
- auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, C0); + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0); if (!FlippedStrictness) return nullptr; @@ -1966,8 +1969,7 @@ static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal, Value *RHS; SelectPatternFlavor SPF; const DataLayout &DL = BOp->getDataLayout(); - auto Flipped = - InstCombiner::getFlippedStrictnessPredicateAndConstant(Predicate, C1); + auto Flipped = getFlippedStrictnessPredicateAndConstant(Predicate, C1); if (C3 == ConstantFoldBinaryOpOperands(Opcode, C1, C2, DL)) { SPF = getSelectPattern(Predicate).Flavor; @@ -2819,9 +2821,9 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC, // %cnd = icmp slt i32 %rem, 0 // %add = add i32 %rem, %n // %sel = select i1 %cnd, i32 %add, i32 %rem - if (match(TrueVal, m_Add(m_Specific(RemRes), m_Value(Remainder))) && + if (match(TrueVal, m_c_Add(m_Specific(RemRes), m_Value(Remainder))) && match(RemRes, m_SRem(m_Value(Op), m_Specific(Remainder))) && - IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero*/ true) && + IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero=*/true) && FalseVal == RemRes) return FoldToBitwiseAnd(Remainder); @@ -3769,22 +3771,9 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI, if (!SIFOp || !SIFOp->hasNoSignedZeros() || !SIFOp->hasNoNaNs()) return nullptr; - // select((fcmp Pred, X, 0), (fadd X, C), C) - // => fadd((select (fcmp Pred, X, 0), X, 0), C) - // - // Pred := OGT, OGE, OLT, OLE, UGT, UGE, ULT, and ULE - Instruction *FAdd; - Constant *C; - Value *X, *Z; - CmpPredicate Pred; - - // Note: OneUse check for `Cmp` is necessary because it makes sure that other - // InstCombine folds don't undo this transformation and cause an infinite - // loop. Furthermore, it could also increase the operation count. - if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), - m_OneUse(m_Instruction(FAdd)), m_Constant(C))) || - match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), - m_Constant(C), m_OneUse(m_Instruction(FAdd))))) { + auto TryFoldIntoAddConstant = + [&Builder, &SI](CmpInst::Predicate Pred, Value *X, Value *Z, + Instruction *FAdd, Constant *C, bool Swapped) -> Value * { // Only these relational predicates can be transformed into maxnum/minnum // intrinsic. if (!CmpInst::isRelational(Pred) || !match(Z, m_AnyZeroFP())) @@ -3793,7 +3782,8 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI, if (!match(FAdd, m_FAdd(m_Specific(X), m_Specific(C)))) return nullptr; - Value *NewSelect = Builder.CreateSelect(SI.getCondition(), X, Z, "", &SI); + Value *NewSelect = Builder.CreateSelect(SI.getCondition(), Swapped ? Z : X, + Swapped ? X : Z, "", &SI); NewSelect->takeName(&SI); Value *NewFAdd = Builder.CreateFAdd(NewSelect, C); @@ -3808,7 +3798,27 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI, cast<Instruction>(NewSelect)->setFastMathFlags(NewFMF); return NewFAdd; - } + }; + + // select((fcmp Pred, X, 0), (fadd X, C), C) + // => fadd((select (fcmp Pred, X, 0), X, 0), C) + // + // Pred := OGT, OGE, OLT, OLE, UGT, UGE, ULT, and ULE + Instruction *FAdd; + Constant *C; + Value *X, *Z; + CmpPredicate Pred; + + // Note: OneUse check for `Cmp` is necessary because it makes sure that other + // InstCombine folds don't undo this transformation and cause an infinite + // loop. Furthermore, it could also increase the operation count. 
+ if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), + m_OneUse(m_Instruction(FAdd)), m_Constant(C)))) + return TryFoldIntoAddConstant(Pred, X, Z, FAdd, C, /*Swapped=*/false); + + if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))), + m_Constant(C), m_OneUse(m_Instruction(FAdd))))) + return TryFoldIntoAddConstant(Pred, X, Z, FAdd, C, /*Swapped=*/true); return nullptr; } @@ -3902,12 +3912,11 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X if (FCmp->hasOneUse() && FCmpInst::isUnordered(Pred)) { FCmpInst::Predicate InvPred = FCmp->getInversePredicate(); - IRBuilder<>::FastMathFlagGuard FMFG(Builder); // FIXME: The FMF should propagate from the select, not the fcmp. - Builder.setFastMathFlags(FCmp->getFastMathFlags()); - Value *NewCond = Builder.CreateFCmp(InvPred, Cmp0, Cmp1, - FCmp->getName() + ".inv"); - Value *NewSel = Builder.CreateSelect(NewCond, FalseVal, TrueVal); + Value *NewCond = Builder.CreateFCmpFMF(InvPred, Cmp0, Cmp1, FCmp, + FCmp->getName() + ".inv"); + Value *NewSel = + Builder.CreateSelectFMF(NewCond, FalseVal, TrueVal, FCmp); return replaceInstUsesWith(SI, NewSel); } } @@ -4072,15 +4081,11 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { CmpInst::Predicate MinMaxPred = getMinMaxPred(SPF, SPR.Ordered); Value *Cmp; - if (CmpInst::isIntPredicate(MinMaxPred)) { + if (CmpInst::isIntPredicate(MinMaxPred)) Cmp = Builder.CreateICmp(MinMaxPred, LHS, RHS); - } else { - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - auto FMF = - cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); - Builder.setFastMathFlags(FMF); - Cmp = Builder.CreateFCmp(MinMaxPred, LHS, RHS); - } + else + Cmp = Builder.CreateFCmpFMF(MinMaxPred, LHS, RHS, + cast<Instruction>(SI.getCondition())); Value *NewSI = Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI); if (!IsCastNeeded) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 934156f..2fb60ef 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -939,12 +939,11 @@ Instruction *InstCombinerImpl::foldBinOpShiftWithShift(BinaryOperator &I) { m_OneUse(m_Shift(m_Value(Y), m_Value(Shift))))) return nullptr; if (!match(I.getOperand(1 - ShOpnum), - m_BinOp(m_Value(ShiftedX), m_Value(Mask)))) + m_c_BinOp(m_CombineAnd( + m_OneUse(m_Shift(m_Value(X), m_Specific(Shift))), + m_Value(ShiftedX)), + m_Value(Mask)))) return nullptr; - - if (!match(ShiftedX, m_OneUse(m_Shift(m_Value(X), m_Specific(Shift))))) - return nullptr; - // Make sure we are matching instruction shifts and not ConstantExpr auto *IY = dyn_cast<Instruction>(I.getOperand(ShOpnum)); auto *IX = dyn_cast<Instruction>(ShiftedX); @@ -1822,12 +1821,29 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN, continue; } - // If the only use of phi is comparing it with a constant then we can - // put this comparison in the incoming BB directly after a ucmp/scmp call - // because we know that it will simplify to a single icmp. - const APInt *Ignored; - if (isa<CmpIntrinsic>(InVal) && InVal->hasOneUser() && - match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) { + // Handle some cases that can't be fully simplified, but where we know that + // the two instructions will fold into one. 
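Editorial aside: the two "will fold into one instruction" cases handled by the new lambda below can be sanity-checked in ordinary C++. The three-way compare models llvm.scmp returning -1/0/1; this is purely illustrative and not part of the patch:

#include <cassert>

// Model of llvm.scmp: returns -1, 0 or 1.
static int scmp(int A, int B) { return (A > B) - (A < B); }

int main() {
  // (1) icmp of scmp/ucmp with a constant folds to a single icmp,
  //     e.g. (scmp(a, b) == -1) is just (a < b).
  for (int A : {-2, 0, 3})
    for (int B : {-1, 0, 3})
      assert((scmp(A, B) == -1) == (A < B));
  // (2) icmp eq (zext i1 b), 0 folds to !b.
  for (bool B : {false, true})
    assert((static_cast<unsigned>(B) == 0u) == !B);
  return 0;
}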
+ auto WillFold = [&]() { + if (!InVal->hasOneUser()) + return false; + + // icmp of ucmp/scmp with constant will fold to icmp. + const APInt *Ignored; + if (isa<CmpIntrinsic>(InVal) && + match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) + return true; + + // icmp eq zext(bool), 0 will fold to !bool. + if (isa<ZExtInst>(InVal) && + cast<ZExtInst>(InVal)->getSrcTy()->isIntOrIntVectorTy(1) && + match(&I, + m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(PN), m_Zero()))) + return true; + + return false; + }; + + if (WillFold()) { OpsToMoveUseToIncomingBB.push_back(i); NewPhiValues.push_back(nullptr); continue; @@ -2782,6 +2798,7 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN, // loop iteration). if (Op1 == &GEP) return nullptr; + GEPNoWrapFlags NW = Op1->getNoWrapFlags(); int DI = -1; @@ -2838,6 +2855,8 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN, } } } + + NW &= Op2->getNoWrapFlags(); } // If not all GEPs are identical we'll have to create a new PHI node. @@ -2847,6 +2866,8 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN, return nullptr; auto *NewGEP = cast<GetElementPtrInst>(Op1->clone()); + NewGEP->setNoWrapFlags(NW); + if (DI == -1) { // All the GEPs feeding the PHI are identical. Clone one down into our // BB so that it can be merged with the current GEP. diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index f9be7f9..6e86ffd 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -61,7 +61,7 @@ enum : uint32_t { }; static cl::opt<std::string> DefaultGCOVVersion("default-gcov-version", - cl::init("408*"), cl::Hidden, + cl::init("0000"), cl::Hidden, cl::ValueRequired); static cl::opt<bool> AtomicCounter("gcov-atomic-counter", cl::Hidden, @@ -154,6 +154,7 @@ private: GCOVOptions Options; llvm::endianness Endian; raw_ostream *os; + int Version = 0; // Checksum, produced by hash of EdgeDestinations SmallVector<uint32_t, 4> FileChecksums; @@ -334,12 +335,9 @@ namespace { : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident), Version(Version), EntryBlock(P, 0), ReturnBlock(P, 1) { LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); - bool ExitBlockBeforeBody = Version >= 48; - uint32_t i = ExitBlockBeforeBody ? 2 : 1; + uint32_t i = 2; for (BasicBlock &BB : *F) Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++))); - if (!ExitBlockBeforeBody) - ReturnBlock.Number = i; std::string FunctionNameAndLine; raw_string_ostream FNLOS(FunctionNameAndLine); @@ -363,44 +361,28 @@ namespace { void writeOut(uint32_t CfgChecksum) { write(GCOV_TAG_FUNCTION); SmallString<128> Filename = getFilename(SP); - uint32_t BlockLen = - 2 + (Version >= 47) + wordsOfString(getFunctionName(SP)); - if (Version < 80) - BlockLen += wordsOfString(Filename) + 1; - else - BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90); + uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP)); + BlockLen += 1 + wordsOfString(Filename) + 4; write(BlockLen); write(Ident); write(FuncChecksum); - if (Version >= 47) - write(CfgChecksum); + write(CfgChecksum); writeString(getFunctionName(SP)); - if (Version < 80) { - writeString(Filename); - write(SP->getLine()); - } else { - write(SP->isArtificial()); // artificial - writeString(Filename); - write(SP->getLine()); // start_line - write(0); // start_column - // EndLine is the last line with !dbg. 
It is not the } line as in GCC, - // but good enough. - write(EndLine); - if (Version >= 90) - write(0); // end_column - } + + write(SP->isArtificial()); // artificial + writeString(Filename); + write(SP->getLine()); // start_line + write(0); // start_column + // EndLine is the last line with !dbg. It is not the } line as in GCC, + // but good enough. + write(EndLine); + write(0); // end_column // Emit count of blocks. write(GCOV_TAG_BLOCKS); - if (Version < 80) { - write(Blocks.size() + 2); - for (int i = Blocks.size() + 2; i; --i) - write(0); - } else { - write(1); - write(Blocks.size() + 2); - } + write(1); + write(Blocks.size() + 2); LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n"); // Emit edges between blocks. @@ -767,7 +749,6 @@ bool GCOVProfiler::emitProfileNotes( function_ref<BlockFrequencyInfo *(Function &F)> GetBFI, function_ref<BranchProbabilityInfo *(Function &F)> GetBPI, function_ref<const TargetLibraryInfo &(Function &F)> GetTLI) { - int Version; { uint8_t c3 = Options.Version[0]; uint8_t c2 = Options.Version[1]; @@ -775,6 +756,11 @@ bool GCOVProfiler::emitProfileNotes( Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0' : (c3 - '0') * 10 + c1 - '0'; } + // Emit .gcno files that are compatible with GCC 11.1. + if (Version < 111) { + Version = 111; + memcpy(Options.Version, "B11*", 4); + } bool EmitGCDA = Options.EmitData; for (unsigned i = 0, e = CUNode->getNumOperands(); i != e; ++i) { @@ -973,10 +959,8 @@ bool GCOVProfiler::emitProfileNotes( out.write(Tmp, 4); } write(Stamp); - if (Version >= 90) - writeString(""); // unuseful current_working_directory - if (Version >= 80) - write(0); // unuseful has_unexecuted_blocks + writeString("."); // unuseful current_working_directory + write(0); // unuseful has_unexecuted_blocks for (auto &Func : Funcs) Func->writeOut(Stamp); diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 530061e..2031728 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -192,7 +192,7 @@ static cl::opt<bool> cl::Hidden); static cl::opt<int> ClHotPercentileCutoff("hwasan-percentile-cutoff-hot", - cl::desc("Hot percentile cuttoff.")); + cl::desc("Hot percentile cutoff.")); static cl::opt<float> ClRandomSkipRate("hwasan-random-rate", diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp index 2418030..f27798c 100644 --- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp +++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp @@ -30,7 +30,7 @@ using namespace llvm; static cl::opt<int> HotPercentileCutoff("lower-allow-check-percentile-cutoff-hot", - cl::desc("Hot percentile cuttoff.")); + cl::desc("Hot percentile cutoff.")); static cl::opt<float> RandomRate("lower-allow-check-random-rate", diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 471086c..db4d62e 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -158,11 +158,11 @@ STATISTIC(NumCoveredBlocks, "Number of basic blocks that were executed"); // Command line option to specify the file to read profile from. This is // mainly used for testing. 
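Editorial aside on the GCOVProfiler hunk above: the four-character version string is decoded by the expression shown there, so "B11*" maps to 111 (GCC 11.1, the new minimum) while a digit-leading string such as "408*" decodes to 48. A minimal reimplementation, for illustration only:

#include <cassert>

static int decodeGCOVVersion(const char *V) {
  char c3 = V[0], c2 = V[1], c1 = V[2];
  return c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0'
                   : (c3 - '0') * 10 + c1 - '0';
}

int main() {
  assert(decodeGCOVVersion("B11*") == 111); // letter form, GCC 11.1
  assert(decodeGCOVVersion("408*") == 48);  // digit-leading form
  return 0;
}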
-static cl::opt<std::string> - PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden, - cl::value_desc("filename"), - cl::desc("Specify the path of profile data file. This is" - "mainly for test purpose.")); +static cl::opt<std::string> PGOTestProfileFile( + "pgo-test-profile-file", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path of profile data file. This is " + "mainly for test purpose.")); static cl::opt<std::string> PGOTestProfileRemappingFile( "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden, cl::value_desc("filename"), @@ -186,7 +186,7 @@ static cl::opt<unsigned> MaxNumAnnotations( // to write to the metadata for a single memop intrinsic. static cl::opt<unsigned> MaxNumMemOPAnnotations( "memop-max-annotations", cl::init(4), cl::Hidden, - cl::desc("Max number of preicise value annotations for a single memop" + cl::desc("Max number of precise value annotations for a single memop" "intrinsic")); // Command line option to control appending FunctionHash to the name of a COMDAT @@ -291,13 +291,13 @@ static cl::opt<bool> PGOVerifyHotBFI( cl::desc("Print out the non-match BFI count if a hot raw profile count " "becomes non-hot, or a cold raw profile count becomes hot. " "The print is enabled under -Rpass-analysis=pgo, or " - "internal option -pass-remakrs-analysis=pgo.")); + "internal option -pass-remarks-analysis=pgo.")); static cl::opt<bool> PGOVerifyBFI( "pgo-verify-bfi", cl::init(false), cl::Hidden, cl::desc("Print out mismatched BFI counts after setting profile metadata " "The print is enabled under -Rpass-analysis=pgo, or " - "internal option -pass-remakrs-analysis=pgo.")); + "internal option -pass-remarks-analysis=pgo.")); static cl::opt<unsigned> PGOVerifyBFIRatio( "pgo-verify-bfi-ratio", cl::init(2), cl::Hidden, diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 1961095..9cd81f3 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -70,7 +70,7 @@ namespace { /// violations. struct TypeSanitizer { TypeSanitizer(Module &M); - bool run(Function &F, const TargetLibraryInfo &TLI); + bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI); void instrumentGlobals(Module &M); private: @@ -510,7 +510,8 @@ void collectMemAccessInfo( } } -bool TypeSanitizer::run(Function &F, const TargetLibraryInfo &TLI) { +bool TypeSanitizer::sanitizeFunction(Function &F, + const TargetLibraryInfo &TLI) { // This is required to prevent instrumenting call to __tysan_init from within // the module constructor. 
if (&F == TysanCtorFunction.getCallee() || &F == TysanGlobalsSetTypeFunction) @@ -876,15 +877,8 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, return true; } -PreservedAnalyses TypeSanitizerPass::run(Function &F, - FunctionAnalysisManager &FAM) { - TypeSanitizer TySan(*F.getParent()); - TySan.run(F, FAM.getResult<TargetLibraryAnalysis>(F)); - return PreservedAnalyses::none(); -} - -PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M, - ModuleAnalysisManager &AM) { +PreservedAnalyses TypeSanitizerPass::run(Module &M, + ModuleAnalysisManager &MAM) { Function *TysanCtorFunction; std::tie(TysanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(M, kTysanModuleCtorName, @@ -894,5 +888,12 @@ PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M, TypeSanitizer TySan(M); TySan.instrumentGlobals(M); appendToGlobalCtors(M, TysanCtorFunction, 0); + + auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + for (Function &F : M) { + const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F); + TySan.sanitizeFunction(F, TLI); + } + return PreservedAnalyses::none(); } diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index ead07ed..91a3c3f 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -216,7 +216,7 @@ struct StackEntry { StackEntry(unsigned NumIn, unsigned NumOut, bool IsSigned, SmallVector<Value *, 2> ValuesToRelease) : NumIn(NumIn), NumOut(NumOut), IsSigned(IsSigned), - ValuesToRelease(ValuesToRelease) {} + ValuesToRelease(std::move(ValuesToRelease)) {} }; struct ConstraintTy { @@ -726,8 +726,8 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, } for (const auto &KV : VariablesB) { - if (SubOverflow(R[GetOrAddIndex(KV.Variable)], KV.Coefficient, - R[GetOrAddIndex(KV.Variable)])) + auto &Coeff = R[GetOrAddIndex(KV.Variable)]; + if (SubOverflow(Coeff, KV.Coefficient, Coeff)) return {}; auto I = KnownNonNegativeVariables.insert({KV.Variable, KV.IsKnownNonNegative}); @@ -759,9 +759,9 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, if (!KV.second || (!Value2Index.contains(KV.first) && !NewIndexMap.contains(KV.first))) continue; - SmallVector<int64_t, 8> C(Value2Index.size() + NewVariables.size() + 1, 0); + auto &C = Res.ExtraInfo.emplace_back( + Value2Index.size() + NewVariables.size() + 1, 0); C[GetOrAddIndex(KV.first)] = -1; - Res.ExtraInfo.push_back(C); } return Res; } @@ -1591,53 +1591,52 @@ void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B, LLVM_DEBUG(dbgs() << "Adding '"; dumpUnpackedICmp(dbgs(), Pred, A, B); dbgs() << "'\n"); - bool Added = false; auto &CSToUse = getCS(R.IsSigned); if (R.Coefficients.empty()) return; - Added |= CSToUse.addVariableRowFill(R.Coefficients); + bool Added = CSToUse.addVariableRowFill(R.Coefficients); + if (!Added) + return; // If R has been added to the system, add the new variables and queue it for // removal once it goes out-of-scope. 
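Editorial aside: the coefficient accumulation above bails out with "return {}" when SubOverflow reports that the in-place subtraction wrapped. A plain-C++ equivalent of that guarded subtraction, sketched with the GCC/Clang builtin; the helper name is illustrative:

#include <cassert>
#include <cstdint>
#include <limits>
#include <optional>

static std::optional<int64_t> checkedSub(int64_t A, int64_t B) {
  int64_t R;
  if (__builtin_sub_overflow(A, B, &R)) // true means the result wrapped
    return std::nullopt;                // mirrors the "return {}" bail-out
  return R;
}

int main() {
  assert(checkedSub(10, 3) == 7);
  assert(!checkedSub(std::numeric_limits<int64_t>::min(), 1).has_value());
  return 0;
}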
- if (Added) { - SmallVector<Value *, 2> ValuesToRelease; - auto &Value2Index = getValue2Index(R.IsSigned); - for (Value *V : NewVariables) { - Value2Index.insert({V, Value2Index.size() + 1}); - ValuesToRelease.push_back(V); - } - - LLVM_DEBUG({ - dbgs() << " constraint: "; - dumpConstraint(R.Coefficients, getValue2Index(R.IsSigned)); - dbgs() << "\n"; - }); + SmallVector<Value *, 2> ValuesToRelease; + auto &Value2Index = getValue2Index(R.IsSigned); + for (Value *V : NewVariables) { + Value2Index.insert({V, Value2Index.size() + 1}); + ValuesToRelease.push_back(V); + } - DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, - std::move(ValuesToRelease)); - - if (!R.IsSigned) { - for (Value *V : NewVariables) { - ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0), - false, false, false); - VarPos.Coefficients[Value2Index[V]] = -1; - CSToUse.addVariableRow(VarPos.Coefficients); - DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, - SmallVector<Value *, 2>()); - } - } + LLVM_DEBUG({ + dbgs() << " constraint: "; + dumpConstraint(R.Coefficients, getValue2Index(R.IsSigned)); + dbgs() << "\n"; + }); - if (R.isEq()) { - // Also add the inverted constraint for equality constraints. - for (auto &Coeff : R.Coefficients) - Coeff *= -1; - CSToUse.addVariableRowFill(R.Coefficients); + DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, + std::move(ValuesToRelease)); + if (!R.IsSigned) { + for (Value *V : NewVariables) { + ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0), + false, false, false); + VarPos.Coefficients[Value2Index[V]] = -1; + CSToUse.addVariableRow(VarPos.Coefficients); DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, SmallVector<Value *, 2>()); } } + + if (R.isEq()) { + // Also add the inverted constraint for equality constraints. 
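Editorial aside: the inverted-constraint step here adds a row and its negation, which together encode an equality as a pair of inequalities (A = B exactly when A - B <= 0 and B - A <= 0). A trivial scalar check of that equivalence, illustrative only:

#include <cassert>

int main() {
  const int B = 5;
  for (int A : {3, 5, 7}) {
    bool Eq = (A == B);
    bool TwoIneqs = (A - B <= 0) && (B - A <= 0);
    assert(Eq == TwoIneqs);
  }
  return 0;
}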
+ for (auto &Coeff : R.Coefficients) + Coeff *= -1; + CSToUse.addVariableRowFill(R.Coefficients); + + DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, + SmallVector<Value *, 2>()); + } } static bool replaceSubOverflowUses(IntrinsicInst *II, Value *A, Value *B, diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ba1c224..3c82eed 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -128,7 +128,7 @@ static cl::opt<bool, true> static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", - cl::desc("Use loop idiom recognition code size heuristics when compiling" + cl::desc("Use loop idiom recognition code size heuristics when compiling " "with -Os/-Oz"), cl::init(true), cl::Hidden); diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 260cc72..0903488 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -104,7 +104,7 @@ static cl::opt<unsigned> UnrollMaxPercentThresholdBoost( static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze( "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, - cl::desc("Don't allow loop unrolling to simulate more than this number of" + cl::desc("Don't allow loop unrolling to simulate more than this number of " "iterations when checking full unroll profitability")); static cl::opt<unsigned> UnrollCount( diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index f58dcb5..6e91c4f 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -95,7 +95,7 @@ static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable"; /// invariant instructions in a loop. static cl::opt<float> LVInvarThreshold("licm-versioning-invariant-threshold", - cl::desc("LoopVersioningLICM's minimum allowed percentage" + cl::desc("LoopVersioningLICM's minimum allowed percentage " "of possible invariant instructions per loop"), cl::init(25), cl::Hidden); diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index bb98b3d..5f7cb92 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -345,10 +345,14 @@ static bool writtenBetween(MemorySSA *MSSA, BatchAAResults &AA, static void combineAAMetadata(Instruction *ReplInst, Instruction *I) { // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be // handled here, but combineMetadata doesn't support them yet - unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_invariant_group, - LLVMContext::MD_access_group}; + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_invariant_group, + LLVMContext::MD_access_group, LLVMContext::MD_prof, + LLVMContext::MD_memprof, LLVMContext::MD_callsite}; + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Use custom AA metadata combining handling instead of combineMetadata, which + // is meant for CSE and will drop any metadata not in the KnownIDs list. 
combineMetadata(ReplInst, I, KnownIDs, true); } diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index 1d4f561..b499ef8 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -28,8 +28,8 @@ using namespace llvm; namespace llvm { cl::opt<bool> ShouldPreserveAllAttributes( "assume-preserve-all", cl::init(false), cl::Hidden, - cl::desc("enable preservation of all attrbitues. even those that are " - "unlikely to be usefull")); + cl::desc("enable preservation of all attributes. even those that are " + "unlikely to be useful")); cl::opt<bool> EnableKnowledgeRetention( "enable-knowledge-retention", cl::init(false), cl::Hidden, diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index 47bb319..d47f1b4 100644 --- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -48,6 +48,21 @@ static void insertCall(Function &CurFn, StringRef Func, /*isVarArg=*/false)), {GV}, "", InsertionPt); Call->setDebugLoc(DL); + } else if (TargetTriple.isRISCV() || TargetTriple.isAArch64() || + TargetTriple.isLoongArch()) { + // On RISC-V, AArch64, and LoongArch, the `_mcount` function takes + // `__builtin_return_address(0)` as an argument since + // `__builtin_return_address(1)` is not available on these platforms. + Instruction *RetAddr = CallInst::Create( + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress), + ConstantInt::get(Type::getInt32Ty(C), 0), "", InsertionPt); + RetAddr->setDebugLoc(DL); + + FunctionCallee Fn = M.getOrInsertFunction( + Func, FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C), + false)); + CallInst *Call = CallInst::Create(Fn, RetAddr, "", InsertionPt); + Call->setDebugLoc(DL); } else { FunctionCallee Fn = M.getOrInsertFunction(Func, Type::getVoidTy(C)); CallInst *Call = CallInst::Create(Fn, "", InsertionPt); @@ -88,6 +103,12 @@ static bool runOnFunction(Function &F, bool PostInlining) { if (F.hasFnAttribute(Attribute::Naked)) return false; + // available_externally functions may not have definitions external to the + // module (e.g. gnu::always_inline). Instrumenting them might lead to linker + // errors if they are optimized out. Skip them like GCC. + if (F.hasAvailableExternallyLinkage()) + return false; + StringRef EntryAttr = PostInlining ? 
"instrument-function-entry-inlined" : "instrument-function-entry"; diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 766c750..ae1af943 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -331,15 +331,12 @@ void FunctionImportGlobalProcessing::processGlobalsForThinLTO() { } } -bool FunctionImportGlobalProcessing::run() { - processGlobalsForThinLTO(); - return false; -} +void FunctionImportGlobalProcessing::run() { processGlobalsForThinLTO(); } -bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index, +void llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index, bool ClearDSOLocalOnDeclarations, SetVector<GlobalValue *> *GlobalsToImport) { FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport, ClearDSOLocalOnDeclarations); - return ThinLTOProcessing.run(); + ThinLTOProcessing.run(); } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index a3af96d..1e4061c 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3308,6 +3308,9 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, return Changed; } +// FIXME: https://github.com/llvm/llvm-project/issues/121495 +// Once external callers of this function are removed, either inline into +// combineMetadataForCSE, or internalize and remove KnownIDs parameter. void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsigned> KnownIDs, bool DoesKMove) { SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; @@ -3320,6 +3323,10 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, switch (Kind) { default: + // FIXME: https://github.com/llvm/llvm-project/issues/121495 + // Change to removing only explicitly listed other metadata, and assert + // on unknown metadata, to avoid inadvertently dropping newly added + // metadata types. K->setMetadata(Kind, nullptr); // Remove unknown metadata break; case LLVMContext::MD_dbg: @@ -3379,6 +3386,12 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; + case LLVMContext::MD_memprof: + K->setMetadata(Kind, MDNode::getMergedMemProfMetadata(KMD, JMD)); + break; + case LLVMContext::MD_callsite: + K->setMetadata(Kind, MDNode::getMergedCallsiteMetadata(KMD, JMD)); + break; case LLVMContext::MD_preserve_access_index: // Preserve !preserve.access.index in K. break; @@ -3442,7 +3455,9 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J, LLVMContext::MD_nontemporal, LLVMContext::MD_noundef, LLVMContext::MD_mmra, - LLVMContext::MD_noalias_addrspace}; + LLVMContext::MD_noalias_addrspace, + LLVMContext::MD_memprof, + LLVMContext::MD_callsite}; combineMetadata(K, J, KnownIDs, KDominatesJ); } diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index d829864..b3f9f76 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -778,7 +778,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", - false, true) + false, false) // Publicly exposed interface to pass... 
char &llvm::LoopSimplifyID = LoopSimplify::ID; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index febc568..e367b01 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -96,8 +96,9 @@ using namespace PatternMatch; cl::opt<bool> llvm::RequireAndPreserveDomTree( "simplifycfg-require-and-preserve-domtree", cl::Hidden, - cl::desc("Temorary development switch used to gradually uplift SimplifyCFG " - "into preserving DomTree,")); + cl::desc( + "Temporary development switch used to gradually uplift SimplifyCFG " + "into preserving DomTree,")); // Chosen as 2 so as to be cheap, but still to have enough power to fold // a select, so the "clamp" idiom (of a min followed by a max) will be caught. @@ -126,7 +127,7 @@ static cl::opt<bool> HoistLoadsStoresWithCondFaulting( static cl::opt<unsigned> HoistLoadsStoresWithCondFaultingThreshold( "hoist-loads-stores-with-cond-faulting-threshold", cl::Hidden, cl::init(6), - cl::desc("Control the maximal conditonal load/store that we are willing " + cl::desc("Control the maximal conditional load/store that we are willing " "to speculatively execute to eliminate conditional branch " "(default = 6)")); @@ -2153,12 +2154,9 @@ bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf( SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; if (!SI) { // Propagate fast-math-flags from phi node to its replacement select. - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); - if (isa<FPMathOperator>(PN)) - Builder.setFastMathFlags(PN.getFastMathFlags()); - - SI = cast<SelectInst>(Builder.CreateSelect( + SI = cast<SelectInst>(Builder.CreateSelectFMF( BI->getCondition(), BB1V, BB2V, + isa<FPMathOperator>(PN) ? &PN : nullptr, BB1V->getName() + "." + BB2V->getName(), BI)); } @@ -3898,16 +3896,14 @@ static bool foldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, IRBuilder<NoFolder> Builder(DomBI); // Propagate fast-math-flags from phi nodes to replacement selects. - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { - if (isa<FPMathOperator>(PN)) - Builder.setFastMathFlags(PN->getFastMathFlags()); - // Change the PHI node into a select instruction. Value *TrueVal = PN->getIncomingValueForBlock(IfTrue); Value *FalseVal = PN->getIncomingValueForBlock(IfFalse); - Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", DomBI); + Value *Sel = Builder.CreateSelectFMF(IfCond, TrueVal, FalseVal, + isa<FPMathOperator>(PN) ? PN : nullptr, + "", DomBI); PN->replaceAllUsesWith(Sel); Sel->takeName(PN); PN->eraseFromParent(); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 737818b..2b2b467 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -2005,28 +2005,21 @@ Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) { AbsOp = Real; } - if (AbsOp) { - IRBuilderBase::FastMathFlagGuard Guard(B); - B.setFastMathFlags(CI->getFastMathFlags()); - + if (AbsOp) return copyFlags( - *CI, B.CreateUnaryIntrinsic(Intrinsic::fabs, AbsOp, nullptr, "cabs")); - } + *CI, B.CreateUnaryIntrinsic(Intrinsic::fabs, AbsOp, CI, "cabs")); if (!CI->isFast()) return nullptr; } // Propagate fast-math flags from the existing call to new instructions. 
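Editorial aside: the cabs lowering that continues just below computes sqrt(re*re + im*im). A quick standalone check against the standard library, using an exact 3-4-5 case; illustrative only:

#include <cassert>
#include <cmath>
#include <complex>

int main() {
  double Re = 3.0, Im = -4.0;
  double Lowered = std::sqrt(Re * Re + Im * Im);
  assert(Lowered == 5.0); // exact: sqrt of a perfect square
  assert(std::abs(Lowered - std::abs(std::complex<double>(Re, Im))) < 1e-12);
  return 0;
}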
- IRBuilderBase::FastMathFlagGuard Guard(B); - B.setFastMathFlags(CI->getFastMathFlags()); - - Value *RealReal = B.CreateFMul(Real, Real); - Value *ImagImag = B.CreateFMul(Imag, Imag); - - return copyFlags(*CI, B.CreateUnaryIntrinsic(Intrinsic::sqrt, - B.CreateFAdd(RealReal, ImagImag), - nullptr, "cabs")); + Value *RealReal = B.CreateFMulFMF(Real, Real, CI); + Value *ImagImag = B.CreateFMulFMF(Imag, Imag, CI); + return copyFlags( + *CI, B.CreateUnaryIntrinsic(Intrinsic::sqrt, + B.CreateFAddFMF(RealReal, ImagImag, CI), CI, + "cabs")); } // Return a properly extended integer (DstWidth bits wide) if the operation is @@ -2480,15 +2473,13 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) { // "Ideally, fmax would be sensitive to the sign of zero, for example // fmax(-0.0, +0.0) would return +0; however, implementation in software // might be impractical." - IRBuilderBase::FastMathFlagGuard Guard(B); FastMathFlags FMF = CI->getFastMathFlags(); FMF.setNoSignedZeros(); - B.setFastMathFlags(FMF); Intrinsic::ID IID = Callee->getName().starts_with("fmin") ? Intrinsic::minnum : Intrinsic::maxnum; return copyFlags(*CI, B.CreateBinaryIntrinsic(IID, CI->getArgOperand(0), - CI->getArgOperand(1))); + CI->getArgOperand(1), FMF)); } Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { @@ -2783,20 +2774,18 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { // Fast math flags for any created instructions should match the sqrt // and multiply. - IRBuilderBase::FastMathFlagGuard Guard(B); - B.setFastMathFlags(I->getFastMathFlags()); // If we found a repeated factor, hoist it out of the square root and // replace it with the fabs of that factor. Value *FabsCall = - B.CreateUnaryIntrinsic(Intrinsic::fabs, RepeatOp, nullptr, "fabs"); + B.CreateUnaryIntrinsic(Intrinsic::fabs, RepeatOp, I, "fabs"); if (OtherOp) { // If we found a non-repeated factor, we still need to get its square // root. We then multiply that by the value that was simplified out // of the square root calculation. 
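Editorial aside: the repeated-factor rewrite described above turns sqrt(X*X*Y) into fabs(X)*sqrt(Y). A standalone numeric check under the fast-math assumptions the transform requires (finite values, non-negative remaining radicand); illustrative only:

#include <cassert>
#include <cmath>

int main() {
  double X = -3.0, Y = 2.0;
  double Before = std::sqrt(X * X * Y);
  double After = std::fabs(X) * std::sqrt(Y);
  assert(std::fabs(Before - After) < 1e-12); // equal up to rounding
  return 0;
}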
Value *SqrtCall = - B.CreateUnaryIntrinsic(Intrinsic::sqrt, OtherOp, nullptr, "sqrt"); - return copyFlags(*CI, B.CreateFMul(FabsCall, SqrtCall)); + B.CreateUnaryIntrinsic(Intrinsic::sqrt, OtherOp, I, "sqrt"); + return copyFlags(*CI, B.CreateFMulFMF(FabsCall, SqrtCall, I)); } return copyFlags(*CI, FabsCall); } @@ -2951,26 +2940,23 @@ static Value *optimizeSymmetricCall(CallInst *CI, bool IsEven, Value *Src = CI->getArgOperand(0); if (match(Src, m_OneUse(m_FNeg(m_Value(X))))) { - IRBuilderBase::FastMathFlagGuard Guard(B); - B.setFastMathFlags(CI->getFastMathFlags()); - - auto *CallInst = copyFlags(*CI, B.CreateCall(CI->getCalledFunction(), {X})); + auto *Call = B.CreateCall(CI->getCalledFunction(), {X}); + Call->copyFastMathFlags(CI); + auto *CallInst = copyFlags(*CI, Call); if (IsEven) { // Even function: f(-x) = f(x) return CallInst; } // Odd function: f(-x) = -f(x) - return B.CreateFNeg(CallInst); + return B.CreateFNegFMF(CallInst, CI); } // Even function: f(abs(x)) = f(x), f(copysign(x, y)) = f(x) if (IsEven && (match(Src, m_FAbs(m_Value(X))) || match(Src, m_CopySign(m_Value(X), m_Value())))) { - IRBuilderBase::FastMathFlagGuard Guard(B); - B.setFastMathFlags(CI->getFastMathFlags()); - - auto *CallInst = copyFlags(*CI, B.CreateCall(CI->getCalledFunction(), {X})); - return CallInst; + auto *Call = B.CreateCall(CI->getCalledFunction(), {X}); + Call->copyFastMathFlags(CI); + return copyFlags(*CI, Call); } return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 02ec1d5..9e81573 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -324,6 +324,11 @@ private: Instruction *ChainElem, Instruction *ChainBegin, const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets); + /// Merges the equivalence classes if they have underlying objects that differ + /// by one level of indirection (i.e., one is a getelementptr and the other is + /// the base pointer in that getelementptr). + void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const; + /// Collects loads and stores grouped by "equivalence class", where: /// - all elements in an eq class are a load or all are a store, /// - they all load/store the same element size (it's OK to have e.g. i8 and @@ -1305,6 +1310,119 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects( return std::nullopt; } +void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const { + if (EQClasses.size() < 2) // There is nothing to merge. + return; + + // The reduced key has all elements of the ECClassKey except the underlying + // object. Check that EqClassKey has 4 elements and define the reduced key. + static_assert(std::tuple_size_v<EqClassKey> == 4, + "EqClassKey has changed - EqClassReducedKey needs changes too"); + using EqClassReducedKey = + std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */, + std::tuple_element_t<2, EqClassKey> /* Element size */, + std::tuple_element_t<3, EqClassKey> /* IsLoad; */>; + using ECReducedKeyToUnderlyingObjectMap = + MapVector<EqClassReducedKey, + SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>; + + // Form a map from the reduced key (without the underlying object) to the + // underlying objects: 1 reduced key to many underlying objects, to form + // groups of potentially merge-able equivalence classes. 
+ ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap; + bool FoundPotentiallyOptimizableEC = false; + for (const auto &EC : EQClasses) { + const auto &Key = EC.first; + EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key), + std::get<3>(Key)}; + RedKeyToUOMap[RedKey].insert(std::get<0>(Key)); + if (RedKeyToUOMap[RedKey].size() > 1) + FoundPotentiallyOptimizableEC = true; + } + if (!FoundPotentiallyOptimizableEC) + return; + + LLVM_DEBUG({ + dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n"; + for (const auto &EC : EQClasses) { + dbgs() << " Key: {" << EC.first << "}\n"; + for (const auto &Inst : EC.second) + dbgs() << " Inst: " << *Inst << '\n'; + } + }); + LLVM_DEBUG({ + dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n"; + for (const auto &RedKeyToUO : RedKeyToUOMap) { + dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", " + << std::get<1>(RedKeyToUO.first) << ", " + << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> " + << RedKeyToUO.second.size() << " underlying objects:\n"; + for (auto UObject : RedKeyToUO.second) + dbgs() << " " << *UObject << '\n'; + } + }); + + using UObjectToUObjectMap = DenseMap<const Value *, const Value *>; + + // Compute the ultimate targets for a set of underlying objects. + auto GetUltimateTargets = + [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap { + UObjectToUObjectMap IndirectionMap; + for (const auto *UObject : UObjects) { + const unsigned MaxLookupDepth = 1; // look for 1-level indirections only + const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth); + if (UltimateTarget != UObject) + IndirectionMap[UObject] = UltimateTarget; + } + UObjectToUObjectMap UltimateTargetsMap; + for (const auto *UObject : UObjects) { + auto Target = UObject; + auto It = IndirectionMap.find(Target); + for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target)) + Target = It->second; + UltimateTargetsMap[UObject] = Target; + } + return UltimateTargetsMap; + }; + + // For each item in RedKeyToUOMap, if it has more than one underlying object, + // try to merge the equivalence classes. + for (auto &[RedKey, UObjects] : RedKeyToUOMap) { + if (UObjects.size() < 2) + continue; + auto UTMap = GetUltimateTargets(UObjects); + for (const auto &[UObject, UltimateTarget] : UTMap) { + if (UObject == UltimateTarget) + continue; + + EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey), + std::get<2>(RedKey)}; + EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey), + std::get<2>(RedKey)}; + // The entry for KeyFrom is guarantted to exist, unlike KeyTo. Thus, + // request the reference to the instructions vector for KeyTo first. 
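Editorial aside: the GetUltimateTargets lambda above resolves each underlying object to a fixed point by repeatedly following the recorded one-level indirections. A plain-STL sketch of that chase, with strings standing in for Value pointers; names and container choices are illustrative:

#include <cassert>
#include <string>
#include <unordered_map>

static std::string ultimateTarget(
    const std::unordered_map<std::string, std::string> &Indirection,
    std::string Obj) {
  // Follow Obj -> target links until no further indirection is recorded.
  for (auto It = Indirection.find(Obj); It != Indirection.end();
       It = Indirection.find(Obj))
    Obj = It->second;
  return Obj;
}

int main() {
  // gep2 is based on gep1, which is based on base: both resolve to base.
  std::unordered_map<std::string, std::string> Indirection = {
      {"gep1", "base"}, {"gep2", "gep1"}};
  assert(ultimateTarget(Indirection, "gep2") == "base");
  assert(ultimateTarget(Indirection, "base") == "base");
  return 0;
}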
+ const auto &VecTo = EQClasses[KeyTo]; + const auto &VecFrom = EQClasses[KeyFrom]; + SmallVector<Instruction *, 8> MergedVec; + std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(), + std::back_inserter(MergedVec), + [](Instruction *A, Instruction *B) { + return A && B && A->comesBefore(B); + }); + EQClasses[KeyTo] = std::move(MergedVec); + EQClasses.erase(KeyFrom); + } + } + LLVM_DEBUG({ + dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n"; + for (const auto &EC : EQClasses) { + dbgs() << " Key: {" << EC.first << "}\n"; + for (const auto &Inst : EC.second) + dbgs() << " Inst: " << *Inst << '\n'; + } + }); +} + EquivalenceClassMap Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, BasicBlock::iterator End) { @@ -1377,6 +1495,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, .emplace_back(&I); } + mergeEquivalenceClasses(Ret); return Ret; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 650a485..bc44ec1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -231,17 +231,21 @@ public: new VPInstruction(Ptr, Offset, GEPNoWrapFlags::inBounds(), DL, Name)); } + /// Convert the input value \p Current to the corresponding value of an + /// induction with \p Start and \p Step values, using \p Start + \p Current * + /// \p Step. VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, - VPCanonicalIVPHIRecipe *CanonicalIV, - VPValue *Step, const Twine &Name = "") { + VPValue *Current, VPValue *Step, + const Twine &Name = "") { return tryInsertInstruction( - new VPDerivedIVRecipe(Kind, FPBinOp, Start, CanonicalIV, Step, Name)); + new VPDerivedIVRecipe(Kind, FPBinOp, Start, Current, Step, Name)); } VPScalarCastRecipe *createScalarCast(Instruction::CastOps Opcode, VPValue *Op, - Type *ResultTy) { - return tryInsertInstruction(new VPScalarCastRecipe(Opcode, Op, ResultTy)); + Type *ResultTy, DebugLoc DL) { + return tryInsertInstruction( + new VPScalarCastRecipe(Opcode, Op, ResultTy, DL)); } VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index af6fce4..47866da 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -479,7 +479,8 @@ public: AC(AC), ORE(ORE), VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), - PSI(PSI), RTChecks(RTChecks), Plan(Plan) { + PSI(PSI), RTChecks(RTChecks), Plan(Plan), + VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -517,22 +518,6 @@ public: /// Fix the non-induction PHIs in \p Plan. void fixNonInductionPHIs(VPTransformState &State); - /// Create a ResumePHI VPInstruction for the induction \p InductionPhiIRI to - /// resume iteration count in the scalar epilogue from where the vectorized - /// loop left off, and add it to the scalar preheader of VPlan. Also creates - /// the induction resume value, and the value for the bypass block, if needed. 
- /// \p Step is the SCEV-expanded induction step to use. In cases where the - /// loop skeleton is more complicated (i.e., epilogue vectorization) and the - /// resume values can come from an additional bypass block, - /// \p MainVectorTripCount provides the trip count of the main vector loop, - /// used to compute the resume value reaching the scalar loop preheader - /// directly from this additional bypass block. - void createInductionResumeVPValue(VPIRInstruction *InductionPhiIRI, - const InductionDescriptor &ID, Value *Step, - ArrayRef<BasicBlock *> BypassBlocks, - VPBuilder &ScalarPHBuilder, - Value *MainVectorTripCount = nullptr); - /// Returns the original loop trip count. Value *getTripCount() const { return TripCount; } @@ -588,23 +573,21 @@ protected: /// vector loop preheader, middle block and scalar preheader. void createVectorLoopSkeleton(StringRef Prefix); - /// Create new phi nodes for the induction variables to resume iteration count - /// in the scalar epilogue, from where the vectorized loop left off. - /// In cases where the loop skeleton is more complicated (i.e. epilogue - /// vectorization), \p MainVectorTripCount provides the trip count of the main - /// loop, used to compute these resume values. If \p IVSubset is provided, it - /// contains the phi nodes for which resume values are needed, because they - /// will generate wide induction phis in the epilogue loop. - void - createInductionResumeVPValues(const SCEV2ValueTy &ExpandedSCEVs, - Value *MainVectorTripCount = nullptr, - SmallPtrSetImpl<PHINode *> *IVSubset = nullptr); + /// Create and record the values for induction variables to resume coming from + /// the additional bypass block. + void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs, + Value *MainVectorTripCount); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. virtual void printDebugTracesAtStart() {} virtual void printDebugTracesAtEnd() {} + /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the + /// vector preheader and its predecessor, also connecting the new block to the + /// scalar preheader. + void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); + /// The original loop. Loop *OrigLoop; @@ -699,6 +682,10 @@ protected: BasicBlock *AdditionalBypassBlock = nullptr; VPlan &Plan; + + /// The vector preheader block of \p Plan, used as target for check blocks + /// introduced during skeleton creation. + VPBlockBase *VectorPHVPB; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -1744,7 +1731,8 @@ private: bool needsExtract(Value *V, ElementCount VF) const { Instruction *I = dyn_cast<Instruction>(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || - TheLoop->isLoopInvariant(I)) + TheLoop->isLoopInvariant(I) || + getWideningDecision(I, VF) == CM_Scalarize) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -2406,12 +2394,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, // End if-block. VPRegionBlock *Parent = RepRecipe->getParent()->getParent(); bool IfPredicateInstr = Parent ? 
Parent->isReplicator() : false; - assert((Parent || all_of(RepRecipe->operands(), - [](VPValue *Op) { - return Op->isDefinedOutsideLoopRegions(); - })) && - "Expected a recipe is either within a region or all of its operands " - "are defined outside the vectorized region."); + assert( + (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || + all_of(RepRecipe->operands(), + [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && + "Expected a recipe is either within a region or all of its operands " + "are defined outside the vectorized region."); if (IfPredicateInstr) PredicatedInstructions.push_back(Cloned); } @@ -2466,19 +2454,15 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { return VectorTripCount; } -/// Introduces a new VPIRBasicBlock for \p CheckIRBB to \p Plan between the -/// vector preheader and its predecessor, also connecting the new block to the -/// scalar preheader. -static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) { +void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { VPBlockBase *ScalarPH = Plan.getScalarPreheader(); - VPBlockBase *VectorPH = Plan.getVectorPreheader(); - VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor(); + VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor(); if (PreVectorPH->getNumSuccessors() != 1) { assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor"); - VPIRBasicBlock *CheckVPIRBB = VPIRBasicBlock::fromBasicBlock(CheckIRBB); - VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckVPIRBB); + VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB); + VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB); PreVectorPH = CheckVPIRBB; } VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); @@ -2567,7 +2551,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { LoopBypassBlocks.push_back(TCCheckBlock); // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. - introduceCheckBlockInVPlan(Plan, TCCheckBlock); + introduceCheckBlockInVPlan(TCCheckBlock); } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { @@ -2585,7 +2569,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; - introduceCheckBlockInVPlan(Plan, SCEVCheckBlock); + introduceCheckBlockInVPlan(SCEVCheckBlock); return SCEVCheckBlock; } @@ -2622,10 +2606,25 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { AddedSafetyChecks = true; - introduceCheckBlockInVPlan(Plan, MemCheckBlock); + introduceCheckBlockInVPlan(MemCheckBlock); return MemCheckBlock; } +/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p +/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must +/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All +/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. +static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { + VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); + for (auto &R : make_early_inc_range(*VPBB)) { + assert(!R.isPhi() && "Tried to move phi recipe to end of block"); + R.moveBefore(*IRVPBB, IRVPBB->end()); + } + + VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); + // VPBB is now dead and will be cleaned up when the plan gets destroyed. 
+} + void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); @@ -2636,64 +2635,11 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "middle.block"); + replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock); LoopScalarPreHeader = SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "scalar.ph"); -} - -void InnerLoopVectorizer::createInductionResumeVPValue( - VPIRInstruction *InductionPhiRI, const InductionDescriptor &II, Value *Step, - ArrayRef<BasicBlock *> BypassBlocks, VPBuilder &ScalarPHBuilder, - Value *MainVectorTripCount) { - // TODO: Move to LVP or general VPlan construction, once no IR values are - // generated. - auto *OrigPhi = cast<PHINode>(&InductionPhiRI->getInstruction()); - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); - assert(VectorTripCount && "Expected valid arguments"); - - Instruction *OldInduction = Legal->getPrimaryInduction(); - // For the primary induction the end values are known. - Value *EndValue = VectorTripCount; - Value *EndValueFromAdditionalBypass = MainVectorTripCount; - // Otherwise compute them accordingly. - if (OrigPhi != OldInduction) { - IRBuilder<> B(LoopVectorPreHeader->getTerminator()); - - // Fast-math-flags propagate from the original induction instruction. - if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp())) - B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - - EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), - Step, II.getKind(), II.getInductionBinOp()); - EndValue->setName("ind.end"); - - // Compute the end value for the additional bypass (if applicable). - if (MainVectorTripCount) { - B.SetInsertPoint(getAdditionalBypassBlock(), - getAdditionalBypassBlock()->getFirstInsertionPt()); - EndValueFromAdditionalBypass = - emitTransformedIndex(B, MainVectorTripCount, II.getStartValue(), Step, - II.getKind(), II.getInductionBinOp()); - EndValueFromAdditionalBypass->setName("ind.end"); - } - } - - auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( - VPInstruction::ResumePhi, - {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())}, - OrigPhi->getDebugLoc(), "bc.resume.val"); - assert(InductionPhiRI->getNumOperands() == 0 && - "InductionPhiRI should not have any operands"); - InductionPhiRI->addOperand(ResumePhiRecipe); - - if (EndValueFromAdditionalBypass) { - // Store the bypass value here, as it needs to be added as operand to its - // scalar preheader phi node after the epilogue skeleton has been created. - // TODO: Directly add as extra operand to the VPResumePHI recipe. - assert(!Induction2AdditionalBypassValue.contains(OrigPhi) && - "entry for OrigPhi already exits"); - Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass; - } + replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); } /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV @@ -2733,46 +2679,40 @@ static void addFullyUnrolledInstructionsToIgnore( } } -void InnerLoopVectorizer::createInductionResumeVPValues( - const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount, - SmallPtrSetImpl<PHINode *> *IVSubset) { - // We are going to resume the execution of the scalar loop. 
- // Go over all of the induction variable PHIs of the scalar loop header and - // fix their starting values, which depend on the counter of the last - // iteration of the vectorized loop. If we come from one of the - // LoopBypassBlocks then we need to start from the original start value. - // Otherwise we provide the trip count from the main vector loop. - VPBasicBlock *ScalarPHVPBB = Plan.getScalarPreheader(); - VPBuilder ScalarPHBuilder(ScalarPHVPBB, ScalarPHVPBB->begin()); - bool HasCanonical = false; - for (VPRecipeBase &R : *Plan.getScalarHeader()) { - auto *PhiR = cast<VPIRInstruction>(&R); - auto *Phi = dyn_cast<PHINode>(&PhiR->getInstruction()); - if (!Phi) - break; - if (!Legal->getInductionVars().contains(Phi) || - (IVSubset && !IVSubset->contains(Phi))) - continue; - const InductionDescriptor &II = Legal->getInductionVars().find(Phi)->second; - createInductionResumeVPValue(PhiR, II, getExpandedStep(II, ExpandedSCEVs), - LoopBypassBlocks, ScalarPHBuilder, - MainVectorTripCount); - auto *ConstStart = dyn_cast<ConstantInt>(II.getStartValue()); - auto *ConstStep = II.getConstIntStepValue(); - if (Phi->getType() == VectorTripCount->getType() && ConstStart && - ConstStart->isZero() && ConstStep && ConstStep->isOne()) - HasCanonical = true; - } - - if (!IVSubset || HasCanonical) - return; - // When vectorizing the epilogue, create a resume phi for the canonical IV if - // no suitable resume phi was already created. - ScalarPHBuilder.createNaryOp( - VPInstruction::ResumePhi, - {&Plan.getVectorTripCount(), - Plan.getOrAddLiveIn(ConstantInt::get(VectorTripCount->getType(), 0))}, - {}, "vec.epilog.resume.val"); +void InnerLoopVectorizer::createInductionAdditionalBypassValues( + const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) { + assert(MainVectorTripCount && "Must have bypass information"); + + Instruction *OldInduction = Legal->getPrimaryInduction(); + IRBuilder<> BypassBuilder(getAdditionalBypassBlock(), + getAdditionalBypassBlock()->getFirstInsertionPt()); + for (const auto &InductionEntry : Legal->getInductionVars()) { + PHINode *OrigPhi = InductionEntry.first; + const InductionDescriptor &II = InductionEntry.second; + Value *Step = getExpandedStep(II, ExpandedSCEVs); + // For the primary induction the additional bypass end value is known. + // Otherwise it is computed. + Value *EndValueFromAdditionalBypass = MainVectorTripCount; + if (OrigPhi != OldInduction) { + auto *BinOp = II.getInductionBinOp(); + // Fast-math-flags propagate from the original induction instruction. + if (isa_and_nonnull<FPMathOperator>(BinOp)) + BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags()); + + // Compute the end value for the additional bypass. + EndValueFromAdditionalBypass = + emitTransformedIndex(BypassBuilder, MainVectorTripCount, + II.getStartValue(), Step, II.getKind(), BinOp); + EndValueFromAdditionalBypass->setName("ind.end"); + } + + // Store the bypass value here, as it needs to be added as operand to its + // scalar preheader phi node after the epilogue skeleton has been created. + // TODO: Directly add as extra operand to the VPResumePHI recipe. + assert(!Induction2AdditionalBypassValue.contains(OrigPhi) && + "entry for OrigPhi already exits"); + Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass; + } } BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton( @@ -2832,9 +2772,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton( // faster. 
emitMemRuntimeChecks(LoopScalarPreHeader); - // Emit phis for the new starting index of the scalar loop. - createInductionResumeVPValues(ExpandedSCEVs); - return LoopVectorPreHeader; } @@ -3048,22 +2985,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { PSE.getSE()->forgetLoop(OrigLoop); PSE.getSE()->forgetBlockAndLoopDispositions(); - // When dealing with uncountable early exits we create middle.split blocks - // between the vector loop region and the exit block. These blocks need - // adding to any outer loop. - VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); - Loop *OuterLoop = OrigLoop->getParentLoop(); - if (Legal->hasUncountableEarlyExit() && OuterLoop) { - VPBasicBlock *MiddleVPBB = State.Plan->getMiddleBlock(); - VPBlockBase *PredVPBB = MiddleVPBB->getSinglePredecessor(); - while (PredVPBB && PredVPBB != VectorRegion) { - BasicBlock *MiddleSplitBB = - State.CFG.VPBB2IRBB[cast<VPBasicBlock>(PredVPBB)]; - OuterLoop->addBasicBlockToLoop(MiddleSplitBB, *LI); - PredVPBB = PredVPBB->getSinglePredecessor(); - } - } - // After vectorization, the exit blocks of the original loop will have // additional predecessors. Invalidate SCEVs for the exit phis in case SE // looked through single-entry phis. @@ -3091,9 +3012,15 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State); } + // Don't apply optimizations below when no vector region remains, as they all + // require a vector loop at the moment. + if (!State.Plan->getVectorLoopRegion()) + return; + for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); + VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; @@ -3576,10 +3503,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; - // For scalable vectors, the only interleave factor currently supported - // must be power of 2 since we require the (de)interleave2 intrinsics - // instead of shufflevectors. - if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor)) + // We currently only know how to emit interleave/deinterleave with + // Factor=2 for scalable vectors. This is purely an implementation + // limit. + if (VF.isScalable() && InterleaveFactor != 2) return false; // If the group involves a non-integral pointer, we may not be able to @@ -4768,7 +4695,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); - LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); return ChosenFactor; } #endif @@ -7697,6 +7623,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { "when vectorizing, the scalar cost must be computed."); #endif + LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n"); return BestFactor; } @@ -7802,7 +7729,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // Perform the actual loop transformation. 
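Editorial aside: the additional-bypass end values built in createInductionAdditionalBypassValues above come from emitTransformedIndex, which for a plain integer induction amounts to Start + Count * Step. A standalone check that this closed form matches stepping the IV directly; illustrative only:

#include <cassert>
#include <cstdint>

static int64_t transformedIndex(int64_t Start, int64_t Step, int64_t Count) {
  return Start + Count * Step;
}

int main() {
  const int64_t Start = 7, Step = 3, Count = 5;
  int64_t IV = Start;
  for (int64_t I = 0; I < Count; ++I) // simulate Count scalar iterations
    IV += Step;
  assert(IV == transformedIndex(Start, Step, Count));
  return 0;
}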
VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV, - &BestVPlan, Legal->getWidestInductionType()); + &BestVPlan, OrigLoop->getParentLoop(), + Legal->getWidestInductionType()); #ifdef EXPENSIVE_CHECKS assert(DT->verify(DominatorTree::VerificationLevel::Fast)); @@ -7810,11 +7738,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // 0. Generate SCEV-dependent code in the entry, including TripCount, before // making any changes to the CFG. - if (!BestVPlan.getEntry()->empty()) { - State.CFG.PrevBB = OrigLoop->getLoopPreheader(); - State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); + if (!BestVPlan.getEntry()->empty()) BestVPlan.getEntry()->execute(&State); - } + if (!ILV.getTripCount()) ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0))); else @@ -7823,6 +7749,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. + VPBasicBlock *VectorPH = + cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor()); State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton( ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs); if (VectorizingEpilogue) @@ -7860,19 +7788,20 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( BestVPlan.prepareToExecute( ILV.getTripCount(), ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); + replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); BestVPlan.execute(&State); - auto *ExitVPBB = BestVPlan.getMiddleBlock(); + auto *MiddleVPBB = BestVPlan.getMiddleBlock(); // 2.5 When vectorizing the epilogue, fix reduction and induction resume // values from the additional bypass block. if (VectorizingEpilogue) { assert(!ILV.Legal->hasUncountableEarlyExit() && "Epilogue vectorisation not yet supported with early exits"); BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); - for (VPRecipeBase &R : *ExitVPBB) { + for (VPRecipeBase &R : *MiddleVPBB) { fixReductionScalarResumeWhenVectorizingEpilog( - &R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock); + &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock); } BasicBlock *PH = OrigLoop->getLoopPreheader(); for (const auto &[IVPhi, _] : Legal->getInductionVars()) { @@ -7885,30 +7814,31 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). - MDNode *OrigLoopID = OrigLoop->getLoopID(); + if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) { + MDNode *OrigLoopID = OrigLoop->getLoopID(); - std::optional<MDNode *> VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); - - VPBasicBlock *HeaderVPBB = - BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); - Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); - if (VectorizedLoopID) - L->setLoopID(*VectorizedLoopID); - else { - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). 
- if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE); - Hints.setAlreadyVectorized(); + std::optional<MDNode *> VectorizedLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized}); + + VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); + Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); + if (VectorizedLoopID) { + L->setLoopID(*VectorizedLoopID); + } else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + if (MDNode *LID = OrigLoop->getLoopID()) + L->setLoopID(LID); + + LoopVectorizeHints Hints(L, true, *ORE); + Hints.setAlreadyVectorized(); + } + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) + addRuntimeUnrollDisableMetaData(L); } - TargetTransformInfo::UnrollingPreferences UP; - TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); - if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) - addRuntimeUnrollDisableMetaData(L); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7917,15 +7847,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - auto *MiddleTerm = - cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); - if (MiddleTerm->isConditional() && - hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // Assume that `Count % VectorTripCount` is equally distributed. - unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + if (BestVPlan.getVectorLoopRegion()) { + auto *MiddleVPBB = BestVPlan.getMiddleBlock(); + auto *MiddleTerm = + cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); + if (MiddleTerm->isConditional() && + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + } } return State.ExpandedSCEVs; @@ -7968,17 +7901,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( // Generate the induction variable. EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); - // Generate VPValues and ResumePhi recipes for wide inductions in the epilogue - // plan only. Other inductions only need a resume value for the canonical - // induction, which will get created during epilogue skeleton construction. 
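The reworked branch-weight hunk above keeps the existing heuristic: the middle-block branch gets weights {1, VF*UF - 1}, reflecting the assumption that the remainder TC % (VF*UF) is uniformly distributed, so it is zero roughly once in VF*UF executions. A small sketch of that computation (plain C++ rather than the profile-metadata helpers):

#include <array>
#include <cstdint>

// Sketch: middle-block branch weights for a given VF and UF. With VF=4, UF=2
// the result is {1, 7}, i.e. the remainder is expected to be zero about one
// time in eight.
static std::array<uint32_t, 2> middleBlockBranchWeights(unsigned VF,
                                                        unsigned UF) {
  unsigned VFxUF = VF * UF;
  return {1u, VFxUF - 1u};
}
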
- SmallPtrSet<PHINode *, 4> WideIVs; - for (VPRecipeBase &H : - EPI.EpiloguePlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - if (auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&H)) - WideIVs.insert(WideIV->getPHINode()); - } - createInductionResumeVPValues(ExpandedSCEVs, nullptr, &WideIVs); - return LoopVectorPreHeader; } @@ -8048,7 +7970,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); - introduceCheckBlockInVPlan(Plan, TCCheckBlock); + introduceCheckBlockInVPlan(TCCheckBlock); return TCCheckBlock; } @@ -8128,14 +8050,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( Phi->removeIncomingValue(EPI.MemSafetyCheck); } - // Generate induction resume values. These variables save the new starting - // indexes for the scalar loop. They are used to test if there are any tail - // iterations left once the vector loop has completed. - // Note that when the vectorized epilogue is skipped due to iteration count - // check, then the resume value for the induction variable comes from - // the trip count of the main vector loop, passed as the second argument. - createInductionResumeVPValues(ExpandedSCEVs, EPI.VectorTripCount); - + // Generate bypass values from the additional bypass block. Note that when the + // vectorized epilogue is skipped due to iteration count check, then the + // resume value for the induction variable comes from the trip count of the + // main vector loop, passed as the second argument. + createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount); return LoopVectorPreHeader; } @@ -8185,13 +8104,13 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( // A new entry block has been created for the epilogue VPlan. Hook it in, as // otherwise we would try to modify the entry to the main vector loop. - VPIRBasicBlock *NewEntry = VPIRBasicBlock::fromBasicBlock(Insert); + VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert); VPBasicBlock *OldEntry = Plan.getEntry(); VPBlockUtils::reassociateBlocks(OldEntry, NewEntry); Plan.setEntry(NewEntry); - delete OldEntry; + // OldEntry is now dead and will be cleaned up when the plan gets destroyed. - introduceCheckBlockInVPlan(Plan, Insert); + introduceCheckBlockInVPlan(Insert); return Insert; } @@ -8435,17 +8354,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, auto *GEP = dyn_cast<GetElementPtrInst>( Ptr->getUnderlyingValue()->stripPointerCasts()); VPSingleDefRecipe *VectorPtr; - if (Reverse) + if (Reverse) { + // When folding the tail, we may compute an address that we don't in the + // original scalar loop and it may not be inbounds. Drop Inbounds in that + // case. + GEPNoWrapFlags Flags = + (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) + ? GEPNoWrapFlags::none() + : GEPNoWrapFlags::inBounds(); VectorPtr = new VPReverseVectorPointerRecipe( - Ptr, &Plan.getVF(), getLoadStoreType(I), - GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds() - : GEPNoWrapFlags::none(), - I->getDebugLoc()); - else + Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc()); + } else { VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), GEP ? 
GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), I->getDebugLoc()); + } Builder.getInsertBlock()->appendRecipe(VectorPtr); Ptr = VectorPtr; } @@ -8955,14 +8879,56 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } -/// Create resume phis in the scalar preheader for first-order recurrences and -/// reductions and update the VPIRInstructions wrapping the original phis in the -/// scalar header. +/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the +/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute +/// the end value of the induction. +static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, + VPBuilder &VectorPHBuilder, + VPBuilder &ScalarPHBuilder, + VPTypeAnalysis &TypeInfo, + VPValue *VectorTC) { + auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV); + // Truncated wide inductions resume from the last lane of their vector value + // in the last vector iteration which is handled elsewhere. + if (WideIntOrFp && WideIntOrFp->getTruncInst()) + return nullptr; + + VPValue *Start = WideIV->getStartValue(); + VPValue *Step = WideIV->getStepValue(); + const InductionDescriptor &ID = WideIV->getInductionDescriptor(); + VPValue *EndValue = VectorTC; + if (!WideIntOrFp || !WideIntOrFp->isCanonical()) { + EndValue = VectorPHBuilder.createDerivedIV( + ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), + Start, VectorTC, Step); + } + + // EndValue is derived from the vector trip count (which has the same type as + // the widest induction) and thus may be wider than the induction here. + Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV); + if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) { + EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue, + ScalarTypeOfWideIV, + WideIV->getDebugLoc()); + } + + auto *ResumePhiRecipe = + ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start}, + WideIV->getDebugLoc(), "bc.resume.val"); + return ResumePhiRecipe; +} + +/// Create resume phis in the scalar preheader for first-order recurrences, +/// reductions and inductions, and update the VPIRInstructions wrapping the +/// original phis in the scalar header. 
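The new addResumePhiRecipeForInduction helper above feeds the scalar preheader a resume value that is either the vector trip count (canonical IV) or a derived end value, truncated when the trip count is wider than the induction. A rough scalar model of that computation, with hypothetical names and integer add-inductions only:

#include <cstdint>

// Sketch: the "bc.resume.val" fed to the scalar loop for a wide integer
// add-induction. Canonical IVs (start 0, step 1) resume directly from the
// vector trip count; otherwise a derived end value is computed, and the
// result is truncated to the induction's width when the trip count is wider.
static uint64_t scalarResumeValue(int64_t Start, int64_t Step,
                                  uint64_t VectorTripCount, unsigned IVBits) {
  uint64_t End =
      (Start == 0 && Step == 1)
          ? VectorTripCount
          : static_cast<uint64_t>(Start +
                                  Step * static_cast<int64_t>(VectorTripCount));
  if (IVBits < 64)
    End &= (uint64_t(1) << IVBits) - 1; // truncate to the IV type
  return End;
}
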
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); auto *ScalarPH = Plan.getScalarPreheader(); auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor()); - VPBuilder ScalarPHBuilder(ScalarPH); + VPBuilder VectorPHBuilder( + cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor())); VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); + VPBuilder ScalarPHBuilder(ScalarPH); VPValue *OneVPV = Plan.getOrAddLiveIn( ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) { @@ -8970,9 +8936,23 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction()); if (!ScalarPhiI) break; + auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI)); - if (!isa<VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe>(VectorPhiR)) + if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) { + if (VPValue *ResumePhi = addResumePhiRecipeForInduction( + WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, + &Plan.getVectorTripCount())) { + ScalarPhiIRI->addOperand(ResumePhi); + continue; + } + // TODO: Also handle truncated inductions here. Computing end-values + // separately should be done as VPlan-to-VPlan optimization, after + // legalizing all resume values to use the last lane from the loop. + assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() && + "should only skip truncated wide inductions"); continue; + } + // The backedge value provides the value to resume coming out of a loop, // which for FORs is a vector whose last element needs to be extracted. The // start value provides the value if the loop is bypassed. @@ -8990,14 +8970,73 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { } } +/// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is +/// either an untruncated wide induction, or if it increments a wide induction +/// by its step. +static bool isOptimizableIVOrUse(VPValue *VPV) { + VPRecipeBase *Def = VPV->getDefiningRecipe(); + if (!Def) + return false; + auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def); + if (WideIV) { + // VPV itself is a wide induction, separately compute the end value for exit + // users if it is not a truncated IV. + return isa<VPWidenPointerInductionRecipe>(WideIV) || + !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst(); + } + + // Check if VPV is an optimizable induction increment. + if (Def->getNumOperands() != 2) + return false; + WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0)); + if (!WideIV) + WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1)); + if (!WideIV) + return false; + + using namespace VPlanPatternMatch; + auto &ID = WideIV->getInductionDescriptor(); + + // Check if VPV increments the induction by the induction step. + VPValue *IVStep = WideIV->getStepValue(); + switch (ID.getInductionOpcode()) { + case Instruction::Add: + return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV), + m_Specific(IVStep))); + case Instruction::FAdd: + return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV), + m_Specific(IVStep))); + case Instruction::FSub: + return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV), + m_Specific(IVStep))); + case Instruction::Sub: { + // IVStep will be the negated step of the subtraction. 
Check if Step == -1 * + // IVStep. + VPValue *Step; + if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) || + !Step->isLiveIn() || !IVStep->isLiveIn()) + return false; + auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); + auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue()); + return StepCI && IVStepCI && + StepCI->getValue() == (-1 * IVStepCI->getValue()); + } + default: + return ID.getKind() == InductionDescriptor::IK_PtrInduction && + match(VPV, m_GetElementPtr(m_Specific(WideIV), + m_Specific(WideIV->getStepValue()))); + } + llvm_unreachable("should have been covered by switch above"); +} + // Collect VPIRInstructions for phis in the exit blocks that are modeled // in VPlan and add the exiting VPValue as operand. Some exiting values are not // modeled explicitly yet and won't be included. Those are un-truncated // VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction // increments. -static SetVector<VPIRInstruction *> collectUsersInExitBlocks( - Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, - const MapVector<PHINode *, InductionDescriptor> &Inductions) { +static SetVector<VPIRInstruction *> +collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, + VPlan &Plan) { auto *MiddleVPBB = Plan.getMiddleBlock(); SetVector<VPIRInstruction *> ExitUsersToFix; for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { @@ -9022,18 +9061,9 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks( // Exit values for inductions are computed and updated outside of VPlan // and independent of induction recipes. // TODO: Compute induction exit values in VPlan. - if ((isa<VPWidenIntOrFpInductionRecipe>(V) && - !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) || - isa<VPWidenPointerInductionRecipe>(V) || - (isa<Instruction>(IncomingValue) && - OrigLoop->contains(cast<Instruction>(IncomingValue)) && - any_of(IncomingValue->users(), [&Inductions](User *U) { - auto *P = dyn_cast<PHINode>(U); - return P && Inductions.contains(P); - }))) { - if (ExitVPBB->getSinglePredecessor() == MiddleVPBB) - continue; - } + if (isOptimizableIVOrUse(V) && + ExitVPBB->getSinglePredecessor() == MiddleVPBB) + continue; ExitUsersToFix.insert(ExitIRI); ExitIRI->addOperand(V); } @@ -9239,9 +9269,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); // For scalable vectors, the only interleave factor currently supported - // must be power of 2 since we require the (de)interleave2 intrinsics - // instead of shufflevectors. - assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) && + // is 2 since we require the (de)interleave2 intrinsics instead of + // shufflevectors. + assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && "Unsupported interleave factor for scalable vectors"); return Result; }; @@ -9335,7 +9365,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPBB->appendRecipe(Recipe); } - VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); + VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB); VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); } @@ -9348,14 +9378,28 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); + // Update wide induction increments to use the same step as the corresponding + // wide induction. 
This enables detecting induction increments directly in + // VPlan and removes redundant splats. + for (const auto &[Phi, ID] : Legal->getInductionVars()) { + auto *IVInc = cast<Instruction>( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); + if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add) + continue; + VPWidenInductionRecipe *WideIV = + cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi)); + VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc); + R->setOperand(1, WideIV->getStepValue()); + } + if (auto *UncountableExitingBlock = Legal->getUncountableEarlyExitingBlock()) { VPlanTransforms::handleUncountableEarlyExit( *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); } addScalarResumePhis(RecipeBuilder, *Plan); - SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks( - OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); + SetVector<VPIRInstruction *> ExitUsersToFix = + collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan); addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) { reportVectorizationFailure( @@ -9474,6 +9518,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { bool HasNUW = true; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DebugLoc()); + + // Collect mapping of IR header phis to header phi recipes, to be used in + // addScalarResumePhis. + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); + for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + if (isa<VPCanonicalIVPHIRecipe>(&R)) + continue; + auto *HeaderR = cast<VPHeaderPHIRecipe>(&R); + RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR); + } + addScalarResumePhis(RecipeBuilder, *Plan); + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -9762,13 +9818,18 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); Value *Step = State.get(getStepValue(), VPLane(0)); - Value *CanonicalIV = State.get(getOperand(1), VPLane(0)); + Value *Index = State.get(getOperand(1), VPLane(0)); Value *DerivedIV = emitTransformedIndex( - State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, - Kind, cast_if_present<BinaryOperator>(FPBinOp)); + State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind, + cast_if_present<BinaryOperator>(FPBinOp)); DerivedIV->setName(Name); - assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); - + // If index is the vector trip count, the concrete value will only be set in + // prepareToExecute, leading to missed simplifications, e.g. if it is 0. + // TODO: Remove the special case for the vector trip count once it is computed + // in VPlan and can be used during VPlan simplification. + assert((DerivedIV != Index || + getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && + "IV didn't need transforming?"); State.set(this, DerivedIV, VPLane(0)); } @@ -10078,6 +10139,57 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || !EnableLoopVectorization) {} +/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue +/// vectorization. Remove ResumePhis from \p MainPlan for inductions that +/// don't have a corresponding wide induction in \p EpiPlan. 
+static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { + // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those + // will need their resume-values computed in the main vector loop. Others + // can be removed from the main VPlan. + SmallPtrSet<PHINode *, 2> EpiWidenedPhis; + for (VPRecipeBase &R : + EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + if (isa<VPCanonicalIVPHIRecipe>(&R)) + continue; + EpiWidenedPhis.insert( + cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue())); + } + for (VPRecipeBase &R : make_early_inc_range( + *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) { + auto *VPIRInst = cast<VPIRInstruction>(&R); + auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction()); + if (!IRI) + break; + if (EpiWidenedPhis.contains(IRI)) + continue; + // There is no corresponding wide induction in the epilogue plan that would + // need a resume value. Remove the VPIRInst wrapping the scalar header phi + // together with the corresponding ResumePhi. The resume values for the + // scalar loop will be created during execution of EpiPlan. + VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe(); + VPIRInst->eraseFromParent(); + ResumePhi->eraseFromParent(); + } + VPlanTransforms::removeDeadRecipes(MainPlan); + + using namespace VPlanPatternMatch; + VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader(); + VPValue *VectorTC = &MainPlan.getVectorTripCount(); + // If there is a suitable resume value for the canonical induction in the + // scalar (which will become vector) epilogue loop we are done. Otherwise + // create it below. + if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) { + return match(&R, m_VPInstruction<VPInstruction::ResumePhi>( + m_Specific(VectorTC), m_SpecificInt(0))); + })) + return; + VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin()); + ScalarPHBuilder.createNaryOp( + VPInstruction::ResumePhi, + {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {}, + "vec.epilog.resume.val"); +} + /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. static void @@ -10542,12 +10654,12 @@ bool LoopVectorizePass::processLoop(Loop *L) { // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. 
VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); + preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, BestEpiPlan); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks, *BestMainPlan); - auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, false); ++LoopsVectorized; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f52ddfd..36fed89 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -104,6 +104,7 @@ using namespace llvm; using namespace llvm::PatternMatch; using namespace slpvectorizer; +using namespace std::placeholders; #define SV_NAME "slp-vectorizer" #define DEBUG_TYPE "SLP" @@ -816,27 +817,34 @@ class InstructionsState { Instruction *AltOp = nullptr; public: - Instruction *getMainOp() const { return MainOp; } + Instruction *getMainOp() const { + assert(valid() && "InstructionsState is invalid."); + return MainOp; + } - Instruction *getAltOp() const { return AltOp; } + Instruction *getAltOp() const { + assert(valid() && "InstructionsState is invalid."); + return AltOp; + } /// The main/alternate opcodes for the list of instructions. - unsigned getOpcode() const { - return MainOp ? MainOp->getOpcode() : 0; - } + unsigned getOpcode() const { return getMainOp()->getOpcode(); } - unsigned getAltOpcode() const { - return AltOp ? AltOp->getOpcode() : 0; - } + unsigned getAltOpcode() const { return getAltOp()->getOpcode(); } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return AltOp != MainOp; } + bool isAltShuffle() const { return getMainOp() != getAltOp(); } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; } + /// Checks if the current state is valid, i.e. has non-null MainOp + bool valid() const { return MainOp && AltOp; } + + explicit operator bool() const { return valid(); } + InstructionsState() = delete; InstructionsState(Instruction *MainOp, Instruction *AltOp) : MainOp(MainOp), AltOp(AltOp) {} @@ -869,8 +877,8 @@ static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) && !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) || BaseOp0 == Op0 || BaseOp1 == Op1 || - getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() || - getSameOpcode({BaseOp1, Op1}, TLI).getOpcode(); + getSameOpcode({BaseOp0, Op0}, TLI) || + getSameOpcode({BaseOp1, Op1}, TLI); } /// \returns true if a compare instruction \p CI has similar "look" and @@ -1847,7 +1855,7 @@ public: InstructionsState S = getSameOpcode(Ops, TLI); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. - if (S.getOpcode() && + if (S && (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() || !S.isAltShuffle()) && all_of(Ops, [&S](Value *V) { @@ -2382,7 +2390,7 @@ public: // Use Boyer-Moore majority voting for finding the majority opcode and // the number of times it occurs. if (auto *I = dyn_cast<Instruction>(OpData.V)) { - if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() || + if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) || I->getParent() != Parent) { if (NumOpsWithSameOpcodeParent == 0) { NumOpsWithSameOpcodeParent = 1; @@ -2501,8 +2509,7 @@ public: // 2.1. 
If we have only 2 lanes, need to check that value in the // next lane does not build same opcode sequence. (Lns == 2 && - !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) - .getOpcode() && + !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) && isa<Constant>(Data.V)))) || // 3. The operand in the current lane is loop invariant (can be // hoisted out) and another operand is also a loop invariant @@ -2511,7 +2518,7 @@ public: // FIXME: need to teach the cost model about this case for better // estimation. (IsInvariant && !isa<Constant>(Data.V) && - !getSameOpcode({Op, Data.V}, TLI).getOpcode() && + !getSameOpcode({Op, Data.V}, TLI) && L->isLoopInvariant(Data.V))) { FoundCandidate = true; Data.IsUsed = Data.V == Op; @@ -2541,7 +2548,7 @@ public: return true; Value *OpILn = getValue(OpI, Ln); return (L && L->isLoopInvariant(OpILn)) || - (getSameOpcode({Op, OpILn}, TLI).getOpcode() && + (getSameOpcode({Op, OpILn}, TLI) && allSameBlock({Op, OpILn})); })) return true; @@ -2698,7 +2705,7 @@ public: OperandData &AltOp = getData(OpIdx, Lane); InstructionsState OpS = getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI); - if (OpS.getOpcode() && OpS.isAltShuffle()) + if (OpS && OpS.isAltShuffle()) MainAltOps[OpIdx].push_back(AltOp.V); } } @@ -3400,6 +3407,7 @@ private: } void setOperations(const InstructionsState &S) { + assert(S && "InstructionsState is invalid."); MainOp = S.getMainOp(); AltOp = S.getAltOp(); } @@ -3600,7 +3608,7 @@ private: "Need to vectorize gather entry?"); // Gathered loads still gathered? Do not create entry, use the original one. if (GatheredLoadsEntriesFirst.has_value() && - EntryState == TreeEntry::NeedToGather && + EntryState == TreeEntry::NeedToGather && S && S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX && !UserTreeIdx.UserTE) return nullptr; @@ -3618,7 +3626,8 @@ private: ReuseShuffleIndices.end()); if (ReorderIndices.empty()) { Last->Scalars.assign(VL.begin(), VL.end()); - Last->setOperations(S); + if (S) + Last->setOperations(S); } else { // Reorder scalars and build final mask. Last->Scalars.assign(VL.size(), nullptr); @@ -3629,7 +3638,8 @@ private: return VL[Idx]; }); InstructionsState S = getSameOpcode(Last->Scalars, *TLI); - Last->setOperations(S); + if (S) + Last->setOperations(S); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } if (!Last->isGather()) { @@ -4774,8 +4784,7 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, (!GEP2 || isConstant(GEP2->getOperand(1)))) || !CompareOpcodes || (GEP1 && GEP2 && - getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI) - .getOpcode())); + getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI))); } /// Calculates minimal alignment as a common alignment. @@ -4947,6 +4956,37 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); } +/// Correctly creates insert_subvector, checking that the index is multiple of +/// the subvectors length. Otherwise, generates shuffle using \p Generator or +/// using default shuffle. 
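createInsertVector, introduced below this comment, falls back to a shufflevector when the insertion index is not a multiple of the subvector length. The mask it builds keeps the destination's leading lanes, maps the inserted lanes to the second shuffle operand, and leaves the tail poison. A standalone sketch of that mask construction (plain C++, with -1 standing in for PoisonMaskElem):

#include <cassert>
#include <numeric>
#include <vector>

// Sketch: shuffle mask for inserting a SubVecVF-wide vector V at lane Index of
// a VecVF-wide vector Vec, mirroring the fallback path of createInsertVector.
// In a two-operand shuffle, lanes 0..VecVF-1 name Vec's elements and lanes
// VecVF.. name V's elements; unset trailing lanes stay poison (-1), as in the
// helper.
static std::vector<int> insertSubvectorMask(unsigned VecVF, unsigned SubVecVF,
                                            unsigned Index) {
  assert(Index + SubVecVF <= VecVF && "subvector must fit");
  std::vector<int> Mask(VecVF, -1);
  std::iota(Mask.begin(), Mask.begin() + Index, 0);  // keep Vec's prefix
  for (unsigned I = 0; I < SubVecVF; ++I)
    Mask[Index + I] = static_cast<int>(VecVF + I);   // take lanes from V
  return Mask;
}
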
+static Value *createInsertVector( + IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, + function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) { + const unsigned SubVecVF = getNumElements(V->getType()); + if (Index % SubVecVF == 0) { + Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, + Builder.getInt64(Index)); + } else { + // Create shuffle, insertvector requires that index is multiple of + // the subvector length. + const unsigned VecVF = getNumElements(Vec->getType()); + SmallVector<int> Mask(VecVF, PoisonMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0); + for (unsigned I : seq<unsigned>(SubVecVF)) + Mask[I + Index] = I + VecVF; + if (Generator) { + Vec = Generator(Vec, V, Mask); + } else { + // 1. Resize V to the size of Vec. + SmallVector<int> ResizeMask(VecVF, PoisonMaskElem); + std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0); + V = Builder.CreateShuffleVector(V, ResizeMask); + Vec = Builder.CreateShuffleVector(Vec, V, Mask); + } + } + return Vec; +} + BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order, @@ -5347,11 +5387,10 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, SmallPtrSet<Value *, 13> SecondPointers; Value *P1 = Ptr1; Value *P2 = Ptr2; - if (P1 == P2) - return false; unsigned Depth = 0; - while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1) && - Depth <= RecursionMaxDepth) { + while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) { + if (P1 == P2 || Depth > RecursionMaxDepth) + return false; FirstPointers.insert(P1); SecondPointers.insert(P2); P1 = getUnderlyingObject(P1, /*MaxLookup=*/1); @@ -7500,7 +7539,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, [&](ArrayRef<Value *> Op) { if (allConstant(Op) || (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) && - getSameOpcode(Op, *TLI).getMainOp())) + getSameOpcode(Op, *TLI))) return false; DenseMap<Value *, unsigned> Uniques; for (Value *V : Op) { @@ -8071,15 +8110,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Don't go into catchswitch blocks, which can happen with PHIs. // Such blocks can only have PHIs and the catchswitch. There is no // place to insert a shuffle if we need to, so just avoid that issue. - if (S.getMainOp() && - isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) { + if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) { LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; } // Check if this is a duplicate of another entry. - if (S.getOpcode()) { + if (S) { if (TreeEntry *E = getTreeEntry(S.getMainOp())) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n"); @@ -8140,13 +8178,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // a load), in which case peek through to include it in the tree, without // ballooning over-budget. 
if (Depth >= RecursionMaxDepth && - !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 && + !(S && !S.isAltShuffle() && VL.size() >= 4 && (match(S.getMainOp(), m_Load(m_Value())) || all_of(VL, [&S](const Value *I) { return match(I, m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) && - cast<Instruction>(I)->getOpcode() == - S.getMainOp()->getOpcode(); + cast<Instruction>(I)->getOpcode() == S.getOpcode(); })))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); if (TryToFindDuplicates(S)) @@ -8156,7 +8193,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // Don't handle scalable vectors - if (S.getOpcode() == Instruction::ExtractElement && + if (S && S.getOpcode() == Instruction::ExtractElement && isa<ScalableVectorType>( cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); @@ -8180,7 +8217,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // vectorize. auto &&NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) { - if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2) + if (!S || !S.isAltShuffle() || VL.size() > 2) return false; if (VectorizableTree.size() < MinTreeSize) return false; @@ -8235,7 +8272,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, bool IsScatterVectorizeUserTE = UserTreeIdx.UserTE && UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; - bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL); + bool AreAllSameBlock = S && allSameBlock(VL); bool AreScatterAllGEPSameBlock = (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() && VL.size() > 2 && @@ -8252,8 +8289,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, SortedIndices)); bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock; - if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) || - (isa_and_present<InsertElementInst, ExtractValueInst, ExtractElementInst>( + if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) || + (S && + isa<InsertElementInst, ExtractValueInst, ExtractElementInst>( S.getMainOp()) && !all_of(VL, isVectorLikeInstWithConstOps)) || NotProfitableForVectorization(VL)) { @@ -8265,7 +8303,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // Don't vectorize ephemeral values. - if (S.getOpcode() && !EphValues.empty()) { + if (S && !EphValues.empty()) { for (Value *V : VL) { if (EphValues.count(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V @@ -8324,7 +8362,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Instruction *VL0 = S.getMainOp(); BB = VL0->getParent(); - if (S.getMainOp() && + if (S && (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) || !DT->isReachableFromEntry(BB))) { // Don't go into unreachable blocks. They may contain instructions with @@ -8378,8 +8416,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = + S.isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : S.getOpcode(); auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) { // Postpone PHI nodes creation SmallVector<unsigned> PHIOps; @@ -8388,7 +8426,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Op.empty()) continue; InstructionsState S = getSameOpcode(Op, *TLI); - if (S.getOpcode() != Instruction::PHI || S.isAltShuffle()) + if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle()) buildTree_rec(Op, Depth + 1, {TE, I}); else PHIOps.push_back(I); @@ -9771,7 +9809,7 @@ void BoUpSLP::transformNodes() { if (IsSplat) continue; InstructionsState S = getSameOpcode(Slice, *TLI); - if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) || + if (!S || S.isAltShuffle() || !allSameBlock(Slice) || (S.getOpcode() == Instruction::Load && areKnownNonVectorizableLoads(Slice)) || (S.getOpcode() != Instruction::Load && !has_single_bit(VF))) @@ -11086,7 +11124,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, if (const TreeEntry *OpTE = getTreeEntry(V)) return getCastContextHint(*OpTE); InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI); - if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle()) + if (SrcState && SrcState.getOpcode() == Instruction::Load && + !SrcState.isAltShuffle()) return TTI::CastContextHint::GatherScatter; return TTI::CastContextHint::None; }; @@ -13265,7 +13304,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( Value *In1 = PHI1->getIncomingValue(I); if (isConstant(In) && isConstant(In1)) continue; - if (!getSameOpcode({In, In1}, *TLI).getOpcode()) + if (!getSameOpcode({In, In1}, *TLI)) return false; if (cast<Instruction>(In)->getParent() != cast<Instruction>(In1)->getParent()) @@ -13293,7 +13332,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( if (It != UsedValuesEntry.end()) UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second; return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE && - getSameOpcode({V, V1}, *TLI).getOpcode() && + getSameOpcode({V, V1}, *TLI) && cast<Instruction>(V)->getParent() == cast<Instruction>(V1)->getParent() && (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1)); @@ -13876,9 +13915,8 @@ Value *BoUpSLP::gather( Instruction *InsElt; if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) { assert(SLPReVec && "FixedVectorType is not expected."); - Vec = InsElt = Builder.CreateInsertVector( - Vec->getType(), Vec, Scalar, - Builder.getInt64(Pos * VecTy->getNumElements())); + Vec = InsElt = cast<Instruction>(createInsertVector( + Builder, Vec, Scalar, Pos * getNumElements(VecTy))); auto *II = dyn_cast<IntrinsicInst>(InsElt); if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) return Vec; @@ -14478,23 +14516,10 @@ public: V, SimplifyQuery(*R.DL)); })); unsigned InsertionIndex = Idx * ScalarTyNumElements; - const unsigned SubVecVF = - cast<FixedVectorType>(V->getType())->getNumElements(); - if (InsertionIndex % SubVecVF == 0) { - Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, - Builder.getInt64(InsertionIndex)); - } else { - // Create shuffle, insertvector requires that index is multiple of - // the subvectors length. 
- const unsigned VecVF = - cast<FixedVectorType>(Vec->getType())->getNumElements(); - SmallVector<int> Mask(VecVF, PoisonMaskElem); - std::iota(Mask.begin(), Mask.end(), 0); - for (unsigned I : seq<unsigned>( - InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements)) - Mask[I] = I - Idx + VecVF; - Vec = createShuffle(Vec, V, Mask); - } + Vec = createInsertVector( + Builder, Vec, V, InsertionIndex, + std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2, + _3)); if (!CommonMask.empty()) { std::iota( std::next(CommonMask.begin(), InsertionIndex), @@ -14560,12 +14585,12 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, ArrayRef<Value *> VL = E->getOperand(NodeIdx); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. - if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { + if (!S && VL.front()->getType()->isPointerTy()) { const auto *It = find_if(VL, IsaPred<GetElementPtrInst>); if (It != VL.end()) S = getSameOpcode(*It, *TLI); } - if (!S.getOpcode()) + if (!S) return nullptr; auto CheckSameVE = [&](const TreeEntry *VE) { return VE->isSame(VL) && @@ -17740,7 +17765,6 @@ bool BoUpSLP::collectValuesToDemote( BitWidth = std::max(BitWidth, BitWidth1); return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2); }; - using namespace std::placeholders; auto FinalAnalysis = [&]() { if (!IsProfitableToDemote) return false; @@ -18546,8 +18570,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(), ValOps.size()) || (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1)); - if ((!IsAllowedSize && S.getOpcode() && - S.getOpcode() != Instruction::Load && + if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load && (!S.getMainOp()->isSafeToRemove() || any_of(ValOps.getArrayRef(), [&](Value *V) { @@ -18557,8 +18580,8 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, return !Stores.contains(U); })); }))) || - (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) { - Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2; + (ValOps.size() > Chain.size() / 2 && !S)) { + Size = (!IsAllowedSize && S) ? 1 : 2; return false; } } @@ -18581,7 +18604,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, R.computeMinimumValueSizes(); Size = R.getCanonicalGraphSize(); - if (S.getOpcode() == Instruction::Load) + if (S && S.getOpcode() == Instruction::Load) Size = 2; // cut off masked gather small trees InstructionCost Cost = R.getTreeCost(); @@ -19082,7 +19105,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // Check that all of the parts are instructions of the same type, // we permit an alternate opcode via InstructionsState. InstructionsState S = getSameOpcode(VL, *TLI); - if (!S.getOpcode()) + if (!S) return false; Instruction *I0 = S.getMainOp(); @@ -19906,16 +19929,16 @@ public: // Also check if the instruction was folded to constant/other value. auto *Inst = dyn_cast<Instruction>(RdxVal); if ((Inst && isVectorLikeInstWithConstOps(Inst) && - (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) || - (S.getOpcode() && !Inst)) + (!S || !S.isOpcodeOrAlt(Inst))) || + (S && !Inst)) continue; Candidates.push_back(RdxVal); TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); } bool ShuffledExtracts = false; // Try to handle shuffled extractelements. 
- if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && - I + 1 < E) { + if (S && S.getOpcode() == Instruction::ExtractElement && + !S.isAltShuffle() && I + 1 < E) { SmallVector<Value *> CommonCandidates(Candidates); for (Value *RV : ReducedVals[I + 1]) { Value *RdxVal = TrackedVals.at(RV); @@ -21310,7 +21333,7 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); } InstructionsState S = getSameOpcode({I1, I2}, TLI); - if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle())) + if (S && (IsCompatibility || !S.isAltShuffle())) continue; if (IsCompatibility) return false; @@ -21468,7 +21491,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode() && !S.isAltShuffle()) + if (S && !S.isAltShuffle()) continue; return I1->getOpcode() < I2->getOpcode(); } @@ -21531,8 +21554,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return false; if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S.getOpcode()) + if (getSameOpcode({I1, I2}, *TLI)) continue; return false; } @@ -21904,8 +21926,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) { if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}, *TLI); - return S.getOpcode() > 0; + return getSameOpcode({I1, I2}, *TLI).valid(); } if (isa<Constant>(V1->getValueOperand()) && isa<Constant>(V2->getValueOperand())) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 9a08292..e804f81 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -205,11 +205,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { return Parent->getEnclosingBlockWithPredecessors(); } -void VPBlockBase::deleteCFG(VPBlockBase *Entry) { - for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Entry))) - delete Block; -} - VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { iterator It = begin(); while (It != end() && It->isPhi()) @@ -221,9 +216,10 @@ VPTransformState::VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, InnerLoopVectorizer *ILV, VPlan *Plan, - Type *CanonicalIVTy) + Loop *CurrentParentLoop, Type *CanonicalIVTy) : TTI(TTI), VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan), - LVer(nullptr), TypeAnalysis(CanonicalIVTy) {} + CurrentParentLoop(CurrentParentLoop), LVer(nullptr), + TypeAnalysis(CanonicalIVTy) {} Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) { if (Def->isLiveIn()) @@ -474,6 +470,13 @@ void VPIRBasicBlock::execute(VPTransformState *State) { connectToPredecessors(State->CFG); } +VPIRBasicBlock *VPIRBasicBlock::clone() { + auto *NewBlock = getPlan()->createEmptyVPIRBasicBlock(IRBB); + for (VPRecipeBase &R : Recipes) + NewBlock->appendRecipe(R.clone()); + return NewBlock; +} + void VPBasicBlock::execute(VPTransformState *State) { bool Replica = bool(State->Lane); BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. @@ -484,11 +487,9 @@ void VPBasicBlock::execute(VPTransformState *State) { }; // 1. Create an IR basic block. 
- if (this == getPlan()->getVectorPreheader() || - (Replica && this == getParent()->getEntry()) || + if ((Replica && this == getParent()->getEntry()) || IsReplicateRegion(getSingleHierarchicalPredecessor())) { // Reuse the previous basic block if the current VPBB is either - // * the vector preheader, // * the entry to a replicate region, or // * the exit of a replicate region. State->CFG.VPBB2IRBB[this] = NewBB; @@ -500,8 +501,8 @@ void VPBasicBlock::execute(VPTransformState *State) { UnreachableInst *Terminator = State->Builder.CreateUnreachable(); // Register NewBB in its loop. In innermost loops its the same for all // BB's. - if (State->CurrentVectorLoop) - State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI); + if (State->CurrentParentLoop) + State->CurrentParentLoop->addBasicBlockToLoop(NewBB, *State->LI); State->Builder.SetInsertPoint(Terminator); State->CFG.PrevBB = NewBB; @@ -513,14 +514,11 @@ void VPBasicBlock::execute(VPTransformState *State) { executeRecipes(State, NewBB); } -void VPBasicBlock::dropAllReferences(VPValue *NewValue) { - for (VPRecipeBase &R : Recipes) { - for (auto *Def : R.definedValues()) - Def->replaceAllUsesWith(NewValue); - - for (unsigned I = 0, E = R.getNumOperands(); I != E; I++) - R.setOperand(I, NewValue); - } +VPBasicBlock *VPBasicBlock::clone() { + auto *NewBlock = getPlan()->createVPBasicBlock(getName()); + for (VPRecipeBase &R : *this) + NewBlock->appendRecipe(R.clone()); + return NewBlock; } void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) { @@ -541,7 +539,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { SmallVector<VPBlockBase *, 2> Succs(successors()); // Create new empty block after the block to split. - auto *SplitBlock = new VPBasicBlock(getName() + ".split"); + auto *SplitBlock = getPlan()->createVPBasicBlock(getName() + ".split"); VPBlockUtils::insertBlockAfter(SplitBlock, this); // Finally, move the recipes starting at SplitAt to new block. @@ -557,7 +555,9 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) { if (P && P->isReplicator()) { P = P->getParent(); - assert(!cast<VPRegionBlock>(P)->isReplicator() && + // Multiple loop regions can be nested, but replicate regions can only be + // nested inside a loop region or must be outside any other region. + assert((!P || !cast<VPRegionBlock>(P)->isReplicator()) && "unexpected nested replicate regions"); } return P; @@ -701,37 +701,30 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) { VPRegionBlock *VPRegionBlock::clone() { const auto &[NewEntry, NewExiting] = cloneFrom(getEntry()); - auto *NewRegion = - new VPRegionBlock(NewEntry, NewExiting, getName(), isReplicator()); + auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting, + getName(), isReplicator()); for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry)) Block->setParent(NewRegion); return NewRegion; } -void VPRegionBlock::dropAllReferences(VPValue *NewValue) { - for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) - // Drop all references in VPBasicBlocks and replace all uses with - // DummyValue. - Block->dropAllReferences(NewValue); -} - void VPRegionBlock::execute(VPTransformState *State) { ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(Entry); if (!isReplicator()) { // Create and register the new vector loop. 
- Loop *PrevLoop = State->CurrentVectorLoop; - State->CurrentVectorLoop = State->LI->AllocateLoop(); + Loop *PrevLoop = State->CurrentParentLoop; + State->CurrentParentLoop = State->LI->AllocateLoop(); BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()]; Loop *ParentLoop = State->LI->getLoopFor(VectorPH); // Insert the new loop into the loop nest and register the new basic blocks // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) - ParentLoop->addChildLoop(State->CurrentVectorLoop); + ParentLoop->addChildLoop(State->CurrentParentLoop); else - State->LI->addTopLevelLoop(State->CurrentVectorLoop); + State->LI->addTopLevelLoop(State->CurrentParentLoop); // Visit the VPBlocks connected to "this", starting from it. for (VPBlockBase *Block : RPOT) { @@ -739,7 +732,7 @@ void VPRegionBlock::execute(VPTransformState *State) { Block->execute(State); } - State->CurrentVectorLoop = PrevLoop; + State->CurrentParentLoop = PrevLoop; return; } @@ -822,17 +815,26 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, #endif VPlan::VPlan(Loop *L) { - setEntry(VPIRBasicBlock::fromBasicBlock(L->getLoopPreheader())); - ScalarHeader = VPIRBasicBlock::fromBasicBlock(L->getHeader()); + setEntry(createVPIRBasicBlock(L->getLoopPreheader())); + ScalarHeader = createVPIRBasicBlock(L->getHeader()); } VPlan::~VPlan() { - if (Entry) { - VPValue DummyValue; - for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) - Block->dropAllReferences(&DummyValue); - - VPBlockBase::deleteCFG(Entry); + VPValue DummyValue; + + for (auto *VPB : CreatedBlocks) { + if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB)) { + // Replace all operands of recipes and all VPValues defined in VPBB with + // DummyValue so the block can be deleted. + for (VPRecipeBase &R : *VPBB) { + for (auto *Def : R.definedValues()) + Def->replaceAllUsesWith(&DummyValue); + + for (unsigned I = 0, E = R.getNumOperands(); I != E; I++) + R.setOperand(I, &DummyValue); + } + } + delete VPB; } for (VPValue *VPV : VPLiveInsToFree) delete VPV; @@ -840,14 +842,6 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } -VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) { - auto *VPIRBB = new VPIRBasicBlock(IRBB); - for (Instruction &I : - make_range(IRBB->begin(), IRBB->getTerminator()->getIterator())) - VPIRBB->appendRecipe(new VPIRInstruction(I)); - return VPIRBB; -} - VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, @@ -861,7 +855,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // an epilogue vector loop, the original entry block here will be replaced by // a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after // generating code for the main vector loop. - VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); + VPBasicBlock *VecPreheader = Plan->createVPBasicBlock("vector.ph"); VPBlockUtils::connectBlocks(Plan->getEntry(), VecPreheader); // Create SCEV and VPValue for the trip count. @@ -878,17 +872,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // Create VPRegionBlock, with empty header and latch blocks, to be filled // during processing later. 
- VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); - VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); + VPBasicBlock *HeaderVPBB = Plan->createVPBasicBlock("vector.body"); + VPBasicBlock *LatchVPBB = Plan->createVPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); - auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop", - false /*isReplicator*/); + auto *TopRegion = Plan->createVPRegionBlock( + HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/); VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader); - VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); + VPBasicBlock *MiddleVPBB = Plan->createVPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); - VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); + VPBasicBlock *ScalarPH = Plan->createVPBasicBlock("scalar.ph"); VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader); if (!RequiresScalarEpilogueCheck) { VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); @@ -904,7 +898,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // we unconditionally branch to the scalar preheader. Do nothing. // 3) Otherwise, construct a runtime check. BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock(); - auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); + auto *VPExitBlock = Plan->createVPIRBasicBlock(IRExitBlock); // The connection order corresponds to the operands of the conditional branch. VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); @@ -942,7 +936,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); // FIXME: Model VF * UF computation completely in VPlan. - assert(VFxUF.getNumUsers() && "VFxUF expected to always have users"); + assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) && + "VFxUF expected to always have users"); unsigned UF = getUF(); if (VF.getNumUsers()) { Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF); @@ -955,22 +950,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, } } -/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p -/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must -/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All -/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. -static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { - VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB); - for (auto &R : make_early_inc_range(*VPBB)) { - assert(!R.isPhi() && "Tried to move phi recipe to end of block"); - R.moveBefore(*IRVPBB, IRVPBB->end()); - } - - VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); - - delete VPBB; -} - /// Generate the code inside the preheader and body of the vectorized loop. /// Assumes a single pre-header basic-block was created for this. Introduce /// additional basic-blocks as needed, and fill them all. @@ -978,25 +957,13 @@ void VPlan::execute(VPTransformState *State) { // Initialize CFG state. State->CFG.PrevVPBB = nullptr; State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor(); - BasicBlock *VectorPreHeader = State->CFG.PrevBB; - State->Builder.SetInsertPoint(VectorPreHeader->getTerminator()); // Disconnect VectorPreHeader from ExitBB in both the CFG and DT. 
+ BasicBlock *VectorPreHeader = State->CFG.PrevBB; cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr); State->CFG.DTU.applyUpdates( {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}}); - // Replace regular VPBB's for the vector preheader, middle and scalar - // preheader blocks with VPIRBasicBlocks wrapping their IR blocks. The IR - // blocks are created during skeleton creation, so we can only create the - // VPIRBasicBlocks now during VPlan execution rather than earlier during VPlan - // construction. - BasicBlock *MiddleBB = State->CFG.ExitBB; - BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); - replaceVPBBWithIRVPBB(getVectorPreheader(), VectorPreHeader); - replaceVPBBWithIRVPBB(getMiddleBlock(), MiddleBB); - replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh); - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF << ", UF=" << getUF() << '\n'); setName("Final VPlan"); @@ -1005,6 +972,8 @@ void VPlan::execute(VPTransformState *State) { // Disconnect the middle block from its single successor (the scalar loop // header) in both the CFG and DT. The branch will be recreated during VPlan // execution. + BasicBlock *MiddleBB = State->CFG.ExitBB; + BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); auto *BrInst = new UnreachableInst(MiddleBB->getContext()); BrInst->insertBefore(MiddleBB->getTerminator()); MiddleBB->getTerminator()->eraseFromParent(); @@ -1022,12 +991,18 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : RPOT) Block->execute(State); - VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); + State->CFG.DTU.flush(); + + auto *LoopRegion = getVectorLoopRegion(); + if (!LoopRegion) + return; + + VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; // Fix the latch value of canonical, reduction and first-order recurrences // phis in the vector loop. - VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { // Skip phi-like recipes that generate their backedege values themselves. if (isa<VPWidenPHIRecipe>(&R)) @@ -1066,8 +1041,6 @@ void VPlan::execute(VPTransformState *State) { Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB); } - - State->CFG.DTU.flush(); } InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { @@ -1080,14 +1053,14 @@ VPRegionBlock *VPlan::getVectorLoopRegion() { // TODO: Cache if possible. for (VPBlockBase *B : vp_depth_first_shallow(getEntry())) if (auto *R = dyn_cast<VPRegionBlock>(B)) - return R; + return R->isReplicator() ? nullptr : R; return nullptr; } const VPRegionBlock *VPlan::getVectorLoopRegion() const { for (const VPBlockBase *B : vp_depth_first_shallow(getEntry())) if (auto *R = dyn_cast<VPRegionBlock>(B)) - return R; + return R->isReplicator() ? nullptr : R; return nullptr; } @@ -1217,6 +1190,7 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry, } VPlan *VPlan::duplicate() { + unsigned NumBlocksBeforeCloning = CreatedBlocks.size(); // Clone blocks. 
const auto &[NewEntry, __] = cloneFrom(Entry); @@ -1257,9 +1231,32 @@ VPlan *VPlan::duplicate() { assert(Old2NewVPValues.contains(TripCount) && "TripCount must have been added to Old2NewVPValues"); NewPlan->TripCount = Old2NewVPValues[TripCount]; + + // Transfer all cloned blocks (the second half of all current blocks) from + // current to new VPlan. + unsigned NumBlocksAfterCloning = CreatedBlocks.size(); + for (unsigned I : + seq<unsigned>(NumBlocksBeforeCloning, NumBlocksAfterCloning)) + NewPlan->CreatedBlocks.push_back(this->CreatedBlocks[I]); + CreatedBlocks.truncate(NumBlocksBeforeCloning); + return NewPlan; } +VPIRBasicBlock *VPlan::createEmptyVPIRBasicBlock(BasicBlock *IRBB) { + auto *VPIRBB = new VPIRBasicBlock(IRBB); + CreatedBlocks.push_back(VPIRBB); + return VPIRBB; +} + +VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) { + auto *VPIRBB = createEmptyVPIRBasicBlock(IRBB); + for (Instruction &I : + make_range(IRBB->begin(), IRBB->getTerminator()->getIterator())) + VPIRBB->appendRecipe(new VPIRInstruction(I)); + return VPIRBB; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) Twine VPlanPrinter::getUID(const VPBlockBase *Block) { @@ -1409,11 +1406,17 @@ void VPlanIngredient::print(raw_ostream &O) const { #endif -bool VPValue::isDefinedOutsideLoopRegions() const { - return !hasDefiningRecipe() || - !getDefiningRecipe()->getParent()->getEnclosingLoopRegion(); +/// Returns true if there is a vector loop region and \p VPV is defined in a +/// loop region. +static bool isDefinedInsideLoopRegions(const VPValue *VPV) { + const VPRecipeBase *DefR = VPV->getDefiningRecipe(); + return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() || + DefR->getParent()->getEnclosingLoopRegion()); } +bool VPValue::isDefinedOutsideLoopRegions() const { + return !isDefinedInsideLoopRegions(this); +} void VPValue::replaceAllUsesWith(VPValue *New) { replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 404202b..cfbb4ad 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -236,7 +236,8 @@ public: struct VPTransformState { VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan, Type *CanonicalIVTy); + InnerLoopVectorizer *ILV, VPlan *Plan, + Loop *CurrentParentLoop, Type *CanonicalIVTy); /// Target Transform Info. const TargetTransformInfo *TTI; @@ -373,8 +374,8 @@ struct VPTransformState { /// Pointer to the VPlan code is generated for. VPlan *Plan; - /// The loop object for the current parent region, or nullptr. - Loop *CurrentVectorLoop = nullptr; + /// The parent loop object for the current scope, or nullptr. + Loop *CurrentParentLoop = nullptr; /// LoopVersioning. It's only set up (non-null) if memchecks were /// used. @@ -636,9 +637,6 @@ public: /// Return the cost of the block. virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0; - /// Delete all blocks reachable from a given VPBlockBase, inclusive. - static void deleteCFG(VPBlockBase *Entry); - /// Return true if it is legal to hoist instructions into this block. 
bool isLegalToHoistInto() { // There are currently no constraints that prevent an instruction to be @@ -646,10 +644,6 @@ public: return true; } - /// Replace all operands of VPUsers in the block with \p NewValue and also - /// replaces all uses of VPValues defined in the block with NewValue. - virtual void dropAllReferences(VPValue *NewValue) = 0; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printAsOperand(raw_ostream &OS, bool PrintType = false) const { OS << getName(); @@ -1357,6 +1351,9 @@ public: } } + /// Returns true if the underlying opcode may read from or write to memory. + bool opcodeMayReadOrWriteFromMemory() const; + /// Returns true if the recipe only uses the first lane of operand \p Op. bool onlyFirstLaneUsed(const VPValue *Op) const override; @@ -1586,14 +1583,16 @@ class VPScalarCastRecipe : public VPSingleDefRecipe { Value *generate(VPTransformState &State); public: - VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) - : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}), Opcode(Opcode), + VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, + DebugLoc DL) + : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}, DL), Opcode(Opcode), ResultTy(ResultTy) {} ~VPScalarCastRecipe() override = default; VPScalarCastRecipe *clone() override { - return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy); + return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy, + getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPScalarCastSC) @@ -2101,6 +2100,15 @@ public: R->getVPDefID() == VPDef::VPWidenPointerInductionSC; } + static inline bool classof(const VPValue *V) { + auto *R = V->getDefiningRecipe(); + return R && classof(R); + } + + static inline bool classof(const VPHeaderPHIRecipe *R) { + return classof(static_cast<const VPRecipeBase *>(R)); + } + virtual void execute(VPTransformState &State) override = 0; /// Returns the step value of the induction. @@ -3556,8 +3564,6 @@ public: return make_range(begin(), getFirstNonPhi()); } - void dropAllReferences(VPValue *NewValue) override; - /// Split current block at \p SplitAt by inserting a new block between the /// current block and its successors and moving all recipes starting at /// SplitAt to the new block. Returns the new block. @@ -3587,12 +3593,7 @@ public: /// Clone the current block and it's recipes, without updating the operands of /// the cloned recipes. - VPBasicBlock *clone() override { - auto *NewBlock = new VPBasicBlock(getName()); - for (VPRecipeBase &R : *this) - NewBlock->appendRecipe(R.clone()); - return NewBlock; - } + VPBasicBlock *clone() override; protected: /// Execute the recipes in the IR basic block \p BB. @@ -3628,20 +3629,11 @@ public: return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC; } - /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all - /// instructions in \p IRBB, except its terminator which is managed in VPlan. - static VPIRBasicBlock *fromBasicBlock(BasicBlock *IRBB); - /// The method which generates the output IR instructions that correspond to /// this VPBasicBlock, thereby "executing" the VPlan. 
void execute(VPTransformState *State) override; - VPIRBasicBlock *clone() override { - auto *NewBlock = new VPIRBasicBlock(IRBB); - for (VPRecipeBase &R : Recipes) - NewBlock->appendRecipe(R.clone()); - return NewBlock; - } + VPIRBasicBlock *clone() override; BasicBlock *getIRBasicBlock() const { return IRBB; } }; @@ -3680,13 +3672,7 @@ public: : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr), IsReplicator(IsReplicator) {} - ~VPRegionBlock() override { - if (Entry) { - VPValue DummyValue; - Entry->dropAllReferences(&DummyValue); - deleteCFG(Entry); - } - } + ~VPRegionBlock() override {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPBlockBase *V) { @@ -3734,8 +3720,6 @@ public: // Return the cost of this region. InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override; - void dropAllReferences(VPValue *NewValue) override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using @@ -3812,6 +3796,10 @@ class VPlan { /// been modeled in VPlan directly. DenseMap<const SCEV *, VPValue *> SCEVToExpansion; + /// Blocks allocated and owned by the VPlan. They will be deleted once the + /// VPlan is destroyed. + SmallVector<VPBlockBase *> CreatedBlocks; + /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader /// wrapping the original header of the scalar loop. VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader) @@ -3830,8 +3818,8 @@ public: /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock /// wrapping \p ScalarHeaderBB and a trip count of \p TC. VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) { - setEntry(new VPBasicBlock("preheader")); - ScalarHeader = VPIRBasicBlock::fromBasicBlock(ScalarHeaderBB); + setEntry(createVPBasicBlock("preheader")); + ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB); TripCount = TC; } @@ -3870,9 +3858,13 @@ public: VPBasicBlock *getEntry() { return Entry; } const VPBasicBlock *getEntry() const { return Entry; } - /// Returns the preheader of the vector loop region. + /// Returns the preheader of the vector loop region, if one exists, or null + /// otherwise. VPBasicBlock *getVectorPreheader() { - return cast<VPBasicBlock>(getVectorLoopRegion()->getSinglePredecessor()); + VPRegionBlock *VectorRegion = getVectorLoopRegion(); + return VectorRegion + ? cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()) + : nullptr; } /// Returns the VPRegionBlock of the vector loop. @@ -4029,6 +4021,49 @@ public: /// Clone the current VPlan, update all VPValues of the new VPlan and cloned /// recipes to refer to the clones, and return it. VPlan *duplicate(); + + /// Create a new VPBasicBlock with \p Name and containing \p Recipe if + /// present. The returned block is owned by the VPlan and deleted once the + /// VPlan is destroyed. + VPBasicBlock *createVPBasicBlock(const Twine &Name, + VPRecipeBase *Recipe = nullptr) { + auto *VPB = new VPBasicBlock(Name, Recipe); + CreatedBlocks.push_back(VPB); + return VPB; + } + + /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p + /// IsReplicator is true, the region is a replicate region. The returned block + /// is owned by the VPlan and deleted once the VPlan is destroyed. 
+ VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, + const std::string &Name = "", + bool IsReplicator = false) { + auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator); + CreatedBlocks.push_back(VPB); + return VPB; + } + + /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set + /// to nullptr. If \p IsReplicator is true, the region is a replicate region. + /// The returned block is owned by the VPlan and deleted once the VPlan is + /// destroyed. + VPRegionBlock *createVPRegionBlock(const std::string &Name = "", + bool IsReplicator = false) { + auto *VPB = new VPRegionBlock(Name, IsReplicator); + CreatedBlocks.push_back(VPB); + return VPB; + } + + /// Create a VPIRBasicBlock wrapping \p IRBB, but do not create + /// VPIRInstructions wrapping the instructions in t\p IRBB. The returned + /// block is owned by the VPlan and deleted once the VPlan is destroyed. + VPIRBasicBlock *createEmptyVPIRBasicBlock(BasicBlock *IRBB); + + /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all + /// instructions in \p IRBB, except its terminator which is managed by the + /// successors of the block in VPlan. The returned block is owned by the VPlan + /// and deleted once the VPlan is destroyed. + VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB); }; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 6e63373..76ed578 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -182,7 +182,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { // Create new VPBB. StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName(); LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n"); - VPBasicBlock *VPBB = new VPBasicBlock(Name); + VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name); BB2VPBB[BB] = VPBB; // Get or create a region for the loop containing BB. @@ -204,7 +204,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { if (LoopOfBB == TheLoop) { RegionOfVPBB = Plan.getVectorLoopRegion(); } else { - RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/); + RegionOfVPBB = Plan.createVPRegionBlock(Name.str(), false /*isReplicator*/); RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]); } RegionOfVPBB->setEntry(VPBB); @@ -357,12 +357,10 @@ void PlainCFGBuilder::buildPlainCFG() { BB2VPBB[TheLoop->getHeader()] = VectorHeaderVPBB; VectorHeaderVPBB->clearSuccessors(); VectorLatchVPBB->clearPredecessors(); - if (TheLoop->getHeader() != TheLoop->getLoopLatch()) { + if (TheLoop->getHeader() != TheLoop->getLoopLatch()) BB2VPBB[TheLoop->getLoopLatch()] = VectorLatchVPBB; - } else { + else TheRegion->setExiting(VectorHeaderVPBB); - delete VectorLatchVPBB; - } // 1. Scan the body of the loop in a topological order to visit each basic // block after having visited its predecessor basic blocks. 
Create a VPBB for diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index ec3c203..4866426 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -139,7 +139,8 @@ struct MatchRecipeAndOpcode<Opcode, RecipeTy> { if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value || std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value || std::is_same<RecipeTy, VPWidenSelectRecipe>::value || - std::is_same<RecipeTy, VPDerivedIVRecipe>::value) + std::is_same<RecipeTy, VPDerivedIVRecipe>::value || + std::is_same<RecipeTy, VPWidenGEPRecipe>::value) return DefR; else return DefR && DefR->getOpcode() == Opcode; @@ -309,6 +310,12 @@ m_Binary(const Op0_t &Op0, const Op1_t &Op1) { return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>(Op0, Op1); } +template <unsigned Opcode, typename Op0_t, typename Op1_t> +inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true> +m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) { + return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>(Op0, Op1); +} + template <typename Op0_t, typename Op1_t> inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul> m_Mul(const Op0_t &Op0, const Op1_t &Op1) { @@ -339,6 +346,18 @@ m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) { return m_BinaryOr<Op0_t, Op1_t, /*Commutative*/ true>(Op0, Op1); } +template <typename Op0_t, typename Op1_t> +using GEPLikeRecipe_match = + BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false, + VPWidenRecipe, VPReplicateRecipe, VPWidenGEPRecipe, + VPInstruction>; + +template <typename Op0_t, typename Op1_t> +inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0, + const Op1_t &Op1) { + return GEPLikeRecipe_match<Op0_t, Op1_t>(Op0, Op1); +} + template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode> using AllTernaryRecipe_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, Opcode, false, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7038e52..e54df8bd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -51,24 +51,7 @@ extern cl::opt<unsigned> ForceTargetInstructionCost; bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPInstructionSC: - if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode())) - return false; - switch (cast<VPInstruction>(this)->getOpcode()) { - case Instruction::Or: - case Instruction::ICmp: - case Instruction::Select: - case VPInstruction::AnyOf: - case VPInstruction::Not: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::ExtractFromEnd: - case VPInstruction::FirstOrderRecurrenceSplice: - case VPInstruction::LogicalAnd: - case VPInstruction::PtrAdd: - return false; - default: - return true; - } + return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); case VPInterleaveSC: return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: @@ -115,6 +98,8 @@ bool VPRecipeBase::mayWriteToMemory() const { bool VPRecipeBase::mayReadFromMemory() const { switch (getVPDefID()) { + case VPInstructionSC: + return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); case VPWidenLoadEVLSC: case VPWidenLoadSC: return true; @@ -707,6 +692,26 @@ void VPInstruction::execute(VPTransformState &State) { /*IsScalar*/ 
GeneratesPerFirstLaneOnly); } +bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { + if (Instruction::isBinaryOp(getOpcode())) + return false; + switch (getOpcode()) { + case Instruction::ICmp: + case Instruction::Select: + case VPInstruction::AnyOf: + case VPInstruction::CalculateTripCountMinusVF: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::ExtractFromEnd: + case VPInstruction::FirstOrderRecurrenceSplice: + case VPInstruction::LogicalAnd: + case VPInstruction::Not: + case VPInstruction::PtrAdd: + return false; + default: + return true; + } +} + bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode())) @@ -1352,10 +1357,9 @@ void VPWidenRecipe::execute(VPTransformState &State) { Value *C = nullptr; if (FCmp) { // Propagate fast math flags. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue())) - Builder.setFastMathFlags(I->getFastMathFlags()); - C = Builder.CreateFCmp(getPredicate(), A, B); + C = Builder.CreateFCmpFMF( + getPredicate(), A, B, + dyn_cast_or_null<Instruction>(getUnderlyingValue())); } else { C = Builder.CreateICmp(getPredicate(), A, B); } @@ -2328,6 +2332,7 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, #endif Value *VPScalarCastRecipe ::generate(VPTransformState &State) { + State.setDebugLocFrom(getDebugLoc()); assert(vputils::onlyFirstLaneUsed(this) && "Codegen only implemented for first lane."); switch (Opcode) { @@ -2789,21 +2794,10 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for " - "scalable vectors, must be power of 2"); - SmallVector<Value *> InterleavingValues(Vals); - // When interleaving, the number of values will be shrunk until we have the - // single final interleaved value. - auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType()); - for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { - InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); - for (unsigned I = 0; I < Midpoint; ++I) - InterleavingValues[I] = Builder.CreateIntrinsic( - InterleaveTy, Intrinsic::vector_interleave2, - {InterleavingValues[I], InterleavingValues[Midpoint + I]}, - /*FMFSource=*/nullptr, Name); - } - return InterleavingValues[0]; + VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); + return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, + Vals, + /*FMFSource=*/nullptr, Name); } // Fixed length. Start by concatenating all vectors into a wide vector. 
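// Editorial sketch (not part of the patch): the scalable path above now handles
// interleave factor 2 only, so interleaving reduces to a single call to the
// llvm.vector.interleave2 intrinsic. Below is a minimal, standalone form of that
// call; the function name interleavePair and its parameters are illustrative,
// everything else mirrors the builder calls in the hunk above.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include <cassert>

using namespace llvm;

// Interleave two equally typed vectors, e.g. 2 x <vscale x 4 x i32> into one
// <vscale x 8 x i32> whose lanes alternate between Lo and Hi.
static Value *interleavePair(IRBuilderBase &Builder, Value *Lo, Value *Hi,
                             const Twine &Name) {
  assert(Lo->getType() == Hi->getType() && "operands must have the same type");
  auto *SrcTy = cast<VectorType>(Lo->getType());
  // The intrinsic's result type has twice the (minimum) element count.
  auto *WideTy = VectorType::getDoubleElementsVectorType(SrcTy);
  return Builder.CreateIntrinsic(WideTy, Intrinsic::vector_interleave2,
                                 {Lo, Hi}, /*FMFSource=*/nullptr, Name);
}
// The same pattern builds the "interleaved.mask" in the next hunk, with the
// block-in mask passed as both operands.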
@@ -2889,11 +2883,15 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { &InterleaveFactor](Value *MaskForGaps) -> Value * { if (State.VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); - assert(isPowerOf2_32(InterleaveFactor) && + assert(InterleaveFactor == 2 && "Unsupported deinterleave factor for scalable vectors"); auto *ResBlockInMask = State.get(BlockInMask); - SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); - return interleaveVectors(State.Builder, Ops, "interleaved.mask"); + SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask}; + auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), + State.VF.getKnownMinValue() * 2, true); + return State.Builder.CreateIntrinsic( + MaskTy, Intrinsic::vector_interleave2, Ops, + /*FMFSource=*/nullptr, "interleaved.mask"); } if (!BlockInMask) @@ -2933,48 +2931,22 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { ArrayRef<VPValue *> VPDefs = definedValues(); const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); if (VecTy->isScalableTy()) { - assert(isPowerOf2_32(InterleaveFactor) && + assert(InterleaveFactor == 2 && "Unsupported deinterleave factor for scalable vectors"); - // Scalable vectors cannot use arbitrary shufflevectors (only splats), - // so must use intrinsics to deinterleave. - SmallVector<Value *> DeinterleavedValues(InterleaveFactor); - DeinterleavedValues[0] = NewLoad; - // For the case of InterleaveFactor > 2, we will have to do recursive - // deinterleaving, because the current available deinterleave intrinsic - // supports only Factor of 2, otherwise it will bailout after first - // iteration. - // When deinterleaving, the number of values will double until we - // have "InterleaveFactor". - for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; - NumVectors *= 2) { - // Deinterleave the elements within the vector - SmallVector<Value *> TempDeinterleavedValues(NumVectors); - for (unsigned I = 0; I < NumVectors; ++I) { - auto *DiTy = DeinterleavedValues[I]->getType(); - TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], - /*FMFSource=*/nullptr, "strided.vec"); - } - // Extract the deinterleaved values: - for (unsigned I = 0; I < 2; ++I) - for (unsigned J = 0; J < NumVectors; ++J) - DeinterleavedValues[NumVectors * I + J] = - State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); - } - -#ifndef NDEBUG - for (Value *Val : DeinterleavedValues) - assert(Val && "NULL Deinterleaved Value"); -#endif - for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + Value *DI = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, VecTy, NewLoad, + /*FMFSource=*/nullptr, "strided.vec"); + unsigned J = 0; + for (unsigned I = 0; I < InterleaveFactor; ++I) { Instruction *Member = Group->getMember(I); - Value *StridedVec = DeinterleavedValues[I]; - if (!Member) { - // This value is not needed as it's not used - static_cast<Instruction *>(StridedVec)->eraseFromParent(); + + if (!Member) continue; - } + + Value *StridedVec = State.Builder.CreateExtractValue(DI, I); // If this member has different type, cast the result type. 
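// Editorial sketch (not part of the patch): the mirror image of the factor-2
// load path just above, in isolation. llvm.vector.deinterleave2 takes one wide
// vector and yields a two-member aggregate (even lanes, odd lanes) that is read
// back with extractvalue; the names deinterleavePair/Even/Odd are illustrative.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include <utility>

using namespace llvm;

// Split Wide, e.g. <vscale x 8 x i32> into two <vscale x 4 x i32> values.
static std::pair<Value *, Value *> deinterleavePair(IRBuilderBase &Builder,
                                                    Value *Wide) {
  // The intrinsic is overloaded on the type of its (wide) operand; each of the
  // two results has half as many lanes.
  Type *WideTy = Wide->getType();
  Value *DI = Builder.CreateIntrinsic(Intrinsic::vector_deinterleave2, WideTy,
                                      Wide, /*FMFSource=*/nullptr,
                                      "strided.vec");
  Value *Even = Builder.CreateExtractValue(DI, 0);
  Value *Odd = Builder.CreateExtractValue(DI, 1);
  return {Even, Odd};
}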
if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); @@ -3398,7 +3370,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { : VectorType::get(StartV->getType(), State.VF); BasicBlock *HeaderBB = State.CFG.PrevBB; - assert(State.CurrentVectorLoop->getHeader() == HeaderBB && + assert(State.CurrentParentLoop->getHeader() == HeaderBB && "recipe must be in the vector loop header"); auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); Phi->insertBefore(HeaderBB->getFirstInsertionPt()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 0b809c2..3e3f5ad 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -217,7 +217,7 @@ static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) { // is connected to a successor replicate region with the same predicate by a // single, empty VPBasicBlock. static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { - SetVector<VPRegionBlock *> DeletedRegions; + SmallPtrSet<VPRegionBlock *, 4> TransformedRegions; // Collect replicate regions followed by an empty block, followed by another // replicate region with matching masks to process front. This is to avoid @@ -248,7 +248,7 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { // Move recipes from Region1 to its successor region, if both are triangles. for (VPRegionBlock *Region1 : WorkList) { - if (DeletedRegions.contains(Region1)) + if (TransformedRegions.contains(Region1)) continue; auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor()); auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor()); @@ -294,12 +294,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock); } VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock); - DeletedRegions.insert(Region1); + TransformedRegions.insert(Region1); } - for (VPRegionBlock *ToDelete : DeletedRegions) - delete ToDelete; - return !DeletedRegions.empty(); + return !TransformedRegions.empty(); } static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, @@ -310,7 +308,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, assert(Instr->getParent() && "Predicated instruction not in any basic block"); auto *BlockInMask = PredRecipe->getMask(); auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); - auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); + auto *Entry = + Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); // Replace predicated replicate recipe with a replicate recipe without a // mask but in the replicate region. 
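// Editorial sketch (not part of the patch): the Plan.createVPBasicBlock calls
// above rely on the new ownership model introduced in VPlan.h, reduced here to
// a standalone analogue. The names Plan/Block/createBlock are illustrative, and
// unique_ptr stands in for the raw CreatedBlocks vector plus ~VPlan: every block
// is allocated through a factory on the plan and stays alive until the plan is
// destroyed, so transforms can disconnect a block and simply stop using it
// instead of deleting it by hand.
#include <memory>
#include <string>
#include <utility>
#include <vector>

namespace sketch {

struct Block {
  std::string Name;
  explicit Block(std::string N) : Name(std::move(N)) {}
};

class Plan {
  // Every block ever created for this plan, owned exclusively by the plan.
  std::vector<std::unique_ptr<Block>> CreatedBlocks;

public:
  // Factory in the spirit of VPlan::createVPBasicBlock: allocate and record.
  Block *createBlock(std::string Name) {
    CreatedBlocks.push_back(std::make_unique<Block>(std::move(Name)));
    return CreatedBlocks.back().get();
  }
  // The implicit destructor releases all created blocks, whether or not they
  // are still reachable from the plan's CFG at that point.
};

} // namespace sketch

// Usage: Block *IfBB = P.createBlock("pred.if"); // never delete'd by hand.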
@@ -318,7 +317,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, PredRecipe->getUnderlyingInstr(), make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())), PredRecipe->isUniform()); - auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); + auto *Pred = + Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); VPPredInstPHIRecipe *PHIRecipe = nullptr; if (PredRecipe->getNumUsers() != 0) { @@ -328,8 +328,10 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, PHIRecipe->setOperand(0, RecipeWithoutMask); } PredRecipe->eraseFromParent(); - auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); + auto *Exiting = + Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); + VPRegionBlock *Region = + Plan.createVPRegionBlock(Entry, Exiting, RegionName, true); // Note: first set Entry as region entry and then connect successors starting // from it in order, to propagate the "parent" of each VPBasicBlock. @@ -396,7 +398,7 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) { VPBlockUtils::disconnectBlocks(VPBB, Succ); VPBlockUtils::connectBlocks(PredVPBB, Succ); } - delete VPBB; + // VPBB is now dead and will be cleaned up when the plan gets destroyed. } return !WorkList.empty(); } @@ -525,7 +527,8 @@ static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, - VPValue *StartV, VPValue *Step, VPBuilder &Builder) { + VPValue *StartV, VPValue *Step, DebugLoc DL, + VPBuilder &Builder) { VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); VPSingleDefRecipe *BaseIV = Builder.createDerivedIV( @@ -540,7 +543,7 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() && "Not truncating."); assert(ResultTy->isIntegerTy() && "Truncation requires an integer type"); - BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy); + BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL); ResultTy = TruncTy; } @@ -554,26 +557,68 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor()); VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(VecPreheader); - Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy); + Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL); } return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step); } +static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) { + SetVector<VPUser *> Users(V->user_begin(), V->user_end()); + for (unsigned I = 0; I != Users.size(); ++I) { + VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]); + if (isa<VPHeaderPHIRecipe>(Cur)) + continue; + for (VPValue *V : Cur->definedValues()) + Users.insert(V->user_begin(), V->user_end()); + } + return Users.takeVector(); +} + /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as /// VPWidenPointerInductionRecipe will generate vectors only. 
If some users /// require vectors while other require scalars, the scalar uses need to extract /// the scalars from the generated vectors (Note that this is different to how -/// int/fp inductions are handled). Also optimize VPWidenIntOrFpInductionRecipe, -/// if any of its users needs scalar values, by providing them scalar steps -/// built on the canonical scalar IV and update the original IV's users. This is -/// an optional optimization to reduce the needs of vector extracts. +/// int/fp inductions are handled). Legalize extract-from-ends using uniform +/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so +/// the correct end value is available. Also optimize +/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by +/// providing them scalar steps built on the canonical scalar IV and update the +/// original IV's users. This is an optional optimization to reduce the needs of +/// vector extracts. static void legalizeAndOptimizeInductions(VPlan &Plan) { + using namespace llvm::VPlanPatternMatch; SmallVector<VPRecipeBase *> ToRemove; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1)); VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); for (VPRecipeBase &Phi : HeaderVPBB->phis()) { + auto *PhiR = dyn_cast<VPHeaderPHIRecipe>(&Phi); + if (!PhiR) + break; + + // Check if any uniform VPReplicateRecipes using the phi recipe are used by + // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to + // ensure the final value is available. + // TODO: Remove once uniformity analysis is done on VPlan. + for (VPUser *U : collectUsersRecursively(PhiR)) { + auto *ExitIRI = dyn_cast<VPIRInstruction>(U); + VPValue *Op; + if (!ExitIRI || !match(ExitIRI->getOperand(0), + m_VPInstruction<VPInstruction::ExtractFromEnd>( + m_VPValue(Op), m_VPValue()))) + continue; + auto *RepR = dyn_cast<VPReplicateRecipe>(Op); + if (!RepR || !RepR->isUniform()) + continue; + assert(!RepR->isPredicated() && "RepR must not be predicated"); + Instruction *I = RepR->getUnderlyingInstr(); + auto *Clone = + new VPReplicateRecipe(I, RepR->operands(), /*IsUniform*/ false); + Clone->insertAfter(RepR); + RepR->replaceAllUsesWith(Clone); + } + // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) { @@ -586,7 +631,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { VPValue *StepV = PtrIV->getOperand(1); VPScalarIVStepsRecipe *Steps = createScalarIVSteps( Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr, - nullptr, StartV, StepV, Builder); + nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder); VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps, PtrIV->getDebugLoc(), "next.gep"); @@ -610,7 +655,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { Plan, ID.getKind(), ID.getInductionOpcode(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(), - Builder); + WideIV->getDebugLoc(), Builder); // Update scalar users of IV to use Step instead. if (!HasOnlyVectorVFs) @@ -660,13 +705,158 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) { } } +/// Try to simplify recipe \p R. 
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { + using namespace llvm::VPlanPatternMatch; + + if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { + // Try to remove redundant blend recipes. + SmallPtrSet<VPValue *, 4> UniqueValues; + if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) + UniqueValues.insert(Blend->getIncomingValue(0)); + for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) + if (!match(Blend->getMask(I), m_False())) + UniqueValues.insert(Blend->getIncomingValue(I)); + + if (UniqueValues.size() == 1) { + Blend->replaceAllUsesWith(*UniqueValues.begin()); + Blend->eraseFromParent(); + return; + } + + if (Blend->isNormalized()) + return; + + // Normalize the blend so its first incoming value is used as the initial + // value with the others blended into it. + + unsigned StartIndex = 0; + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + // If a value's mask is used only by the blend then is can be deadcoded. + // TODO: Find the most expensive mask that can be deadcoded, or a mask + // that's used by multiple blends where it can be removed from them all. + VPValue *Mask = Blend->getMask(I); + if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { + StartIndex = I; + break; + } + } + + SmallVector<VPValue *, 4> OperandsWithMask; + OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); + + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + if (I == StartIndex) + continue; + OperandsWithMask.push_back(Blend->getIncomingValue(I)); + OperandsWithMask.push_back(Blend->getMask(I)); + } + + auto *NewBlend = new VPBlendRecipe( + cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask); + NewBlend->insertBefore(&R); + + VPValue *DeadMask = Blend->getMask(StartIndex); + Blend->replaceAllUsesWith(NewBlend); + Blend->eraseFromParent(); + recursivelyDeleteDeadRecipes(DeadMask); + return; + } + + VPValue *A; + if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) { + VPValue *Trunc = R.getVPSingleValue(); + Type *TruncTy = TypeInfo.inferScalarType(Trunc); + Type *ATy = TypeInfo.inferScalarType(A); + if (TruncTy == ATy) { + Trunc->replaceAllUsesWith(A); + } else { + // Don't replace a scalarizing recipe with a widened cast. + if (isa<VPReplicateRecipe>(&R)) + return; + if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { + + unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) + ? Instruction::SExt + : Instruction::ZExt; + auto *VPC = + new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); + if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { + // UnderlyingExt has distinct return type, used to retain legacy cost. + VPC->setUnderlyingValue(UnderlyingExt); + } + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { + auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } + } +#ifndef NDEBUG + // Verify that the cached type info is for both A and its users is still + // accurate by comparing it to freshly computed types. 
+ VPTypeAnalysis TypeInfo2( + R.getParent()->getPlan()->getCanonicalIV()->getScalarType()); + assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); + for (VPUser *U : A->users()) { + auto *R = cast<VPRecipeBase>(U); + for (VPValue *VPV : R->definedValues()) + assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); + } +#endif + } + + // Simplify (X && Y) || (X && !Y) -> X. + // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X + // && (Y || Z) and (X || !X) into true. This requires queuing newly created + // recipes to be visited during simplification. + VPValue *X, *Y, *X1, *Y1; + if (match(&R, + m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), + m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) && + X == X1 && Y == Y1) { + R.getVPSingleValue()->replaceAllUsesWith(X); + R.eraseFromParent(); + return; + } + + if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) + return R.getVPSingleValue()->replaceAllUsesWith(A); + + if (match(&R, m_Not(m_Not(m_VPValue(A))))) + return R.getVPSingleValue()->replaceAllUsesWith(A); + + // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0. + if ((match(&R, + m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) || + match(&R, + m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) && + TypeInfo.inferScalarType(R.getOperand(1)) == + TypeInfo.inferScalarType(R.getVPSingleValue())) + return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1)); +} + +/// Try to simplify the recipes in \p Plan. Use \p CanonicalIVTy as type for all +/// un-typed live-ins in VPTypeAnalysis. +static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy) { + ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( + Plan.getEntry()); + VPTypeAnalysis TypeInfo(CanonicalIVTy); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + simplifyRecipe(R, TypeInfo); + } + } +} + void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE) { assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); - VPBasicBlock *ExitingVPBB = - Plan.getVectorLoopRegion()->getExitingBasicBlock(); + VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); auto *Term = &ExitingVPBB->back(); // Try to simplify the branch condition if TC <= VF * UF when preparing to // execute the plan for the main vector loop. We only do this if the @@ -690,16 +880,44 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) return; - LLVMContext &Ctx = SE.getContext(); - auto *BOC = new VPInstruction( - VPInstruction::BranchOnCond, - {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); + // The vector loop region only executes once. If possible, completely remove + // the region, otherwise replace the terminator controlling the latch with + // (BranchOnCond true). 
+ auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); + auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); + if (all_of( + Header->phis(), + IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) { + for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) { + auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR); + HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue()); + HeaderPhiR->eraseFromParent(); + } + + VPBlockBase *Preheader = VectorRegion->getSinglePredecessor(); + VPBlockBase *Exit = VectorRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(Preheader, VectorRegion); + VPBlockUtils::disconnectBlocks(VectorRegion, Exit); + + for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry())) + B->setParent(nullptr); + + VPBlockUtils::connectBlocks(Preheader, Header); + VPBlockUtils::connectBlocks(ExitingVPBB, Exit); + simplifyRecipes(Plan, CanIVTy); + } else { + // The vector region contains header phis for which we cannot remove the + // loop region yet. + LLVMContext &Ctx = SE.getContext(); + auto *BOC = new VPInstruction( + VPInstruction::BranchOnCond, + {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); + ExitingVPBB->appendRecipe(BOC); + } - SmallVector<VPValue *> PossiblyDead(Term->operands()); Term->eraseFromParent(); - for (VPValue *Op : PossiblyDead) - recursivelyDeleteDeadRecipes(Op); - ExitingVPBB->appendRecipe(BOC); + VPlanTransforms::removeDeadRecipes(Plan); + Plan.setVF(BestVF); Plan.setUF(BestUF); // TODO: Further simplifications are possible @@ -910,18 +1128,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, return true; } -static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) { - SetVector<VPUser *> Users(V->user_begin(), V->user_end()); - for (unsigned I = 0; I != Users.size(); ++I) { - VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]); - if (isa<VPHeaderPHIRecipe>(Cur)) - continue; - for (VPValue *V : Cur->definedValues()) - Users.insert(V->user_begin(), V->user_end()); - } - return Users.takeVector(); -} - void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { for (VPRecipeBase &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { @@ -940,138 +1146,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { } } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { - using namespace llvm::VPlanPatternMatch; - - if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { - // Try to remove redundant blend recipes. - SmallPtrSet<VPValue *, 4> UniqueValues; - if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) - UniqueValues.insert(Blend->getIncomingValue(0)); - for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) - if (!match(Blend->getMask(I), m_False())) - UniqueValues.insert(Blend->getIncomingValue(I)); - - if (UniqueValues.size() == 1) { - Blend->replaceAllUsesWith(*UniqueValues.begin()); - Blend->eraseFromParent(); - return; - } - - if (Blend->isNormalized()) - return; - - // Normalize the blend so its first incoming value is used as the initial - // value with the others blended into it. - - unsigned StartIndex = 0; - for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { - // If a value's mask is used only by the blend then is can be deadcoded. - // TODO: Find the most expensive mask that can be deadcoded, or a mask - // that's used by multiple blends where it can be removed from them all. 
- VPValue *Mask = Blend->getMask(I); - if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { - StartIndex = I; - break; - } - } - - SmallVector<VPValue *, 4> OperandsWithMask; - OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); - - for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { - if (I == StartIndex) - continue; - OperandsWithMask.push_back(Blend->getIncomingValue(I)); - OperandsWithMask.push_back(Blend->getMask(I)); - } - - auto *NewBlend = new VPBlendRecipe( - cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask); - NewBlend->insertBefore(&R); - - VPValue *DeadMask = Blend->getMask(StartIndex); - Blend->replaceAllUsesWith(NewBlend); - Blend->eraseFromParent(); - recursivelyDeleteDeadRecipes(DeadMask); - return; - } - - VPValue *A; - if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) { - VPValue *Trunc = R.getVPSingleValue(); - Type *TruncTy = TypeInfo.inferScalarType(Trunc); - Type *ATy = TypeInfo.inferScalarType(A); - if (TruncTy == ATy) { - Trunc->replaceAllUsesWith(A); - } else { - // Don't replace a scalarizing recipe with a widened cast. - if (isa<VPReplicateRecipe>(&R)) - return; - if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - - unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) - ? Instruction::SExt - : Instruction::ZExt; - auto *VPC = - new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); - if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { - // UnderlyingExt has distinct return type, used to retain legacy cost. - VPC->setUnderlyingValue(UnderlyingExt); - } - VPC->insertBefore(&R); - Trunc->replaceAllUsesWith(VPC); - } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { - auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); - VPC->insertBefore(&R); - Trunc->replaceAllUsesWith(VPC); - } - } -#ifndef NDEBUG - // Verify that the cached type info is for both A and its users is still - // accurate by comparing it to freshly computed types. - VPTypeAnalysis TypeInfo2( - R.getParent()->getPlan()->getCanonicalIV()->getScalarType()); - assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); - for (VPUser *U : A->users()) { - auto *R = cast<VPRecipeBase>(U); - for (VPValue *VPV : R->definedValues()) - assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); - } -#endif - } - - // Simplify (X && Y) || (X && !Y) -> X. - // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X - // && (Y || Z) and (X || !X) into true. This requires queuing newly created - // recipes to be visited during simplification. - VPValue *X, *Y, *X1, *Y1; - if (match(&R, - m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), - m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) && - X == X1 && Y == Y1) { - R.getVPSingleValue()->replaceAllUsesWith(X); - R.eraseFromParent(); - return; - } - - if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) - return R.getVPSingleValue()->replaceAllUsesWith(A); - - if (match(&R, m_Not(m_Not(m_VPValue(A))))) - return R.getVPSingleValue()->replaceAllUsesWith(A); - - // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0. 
- if ((match(&R, - m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) || - match(&R, - m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) && - TypeInfo.inferScalarType(R.getOperand(1)) == - TypeInfo.inferScalarType(R.getVPSingleValue())) - return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1)); -} - /// Move loop-invariant recipes out of the vector loop region in \p Plan. static void licm(VPlan &Plan) { VPBasicBlock *Preheader = Plan.getVectorPreheader(); @@ -1106,19 +1180,6 @@ static void licm(VPlan &Plan) { } } -/// Try to simplify the recipes in \p Plan. -static void simplifyRecipes(VPlan &Plan) { - ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( - Plan.getEntry()); - Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - simplifyRecipe(R, TypeInfo); - } - } -} - void VPlanTransforms::truncateToMinimalBitwidths( VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) { #ifndef NDEBUG @@ -1256,10 +1317,10 @@ void VPlanTransforms::optimize(VPlan &Plan) { removeRedundantCanonicalIVs(Plan); removeRedundantInductionCasts(Plan); - simplifyRecipes(Plan); + simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType()); legalizeAndOptimizeInductions(Plan); removeRedundantExpandSCEVRecipes(Plan); - simplifyRecipes(Plan); + simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType()); removeDeadRecipes(Plan); createAndOptimizeReplicateRegions(Plan); @@ -1496,10 +1557,13 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask, auto *CastR = cast<VPWidenCastRecipe>(CR); VPID = VPIntrinsic::getForOpcode(CastR->getOpcode()); } - assert(VPID != Intrinsic::not_intrinsic && "Expected VP intrinsic"); + + // Not all intrinsics have a corresponding VP intrinsic. + if (VPID == Intrinsic::not_intrinsic) + return nullptr; assert(VPIntrinsic::getMaskParamPos(VPID) && VPIntrinsic::getVectorLengthParamPos(VPID) && - "Expected VP intrinsic"); + "Expected VP intrinsic to have mask and EVL"); SmallVector<VPValue *> Ops(CR->operands()); Ops.push_back(&AllOneMask); @@ -1656,9 +1720,9 @@ bool VPlanTransforms::tryAddExplicitVectorLength( VPSingleDefRecipe *OpVPEVL = VPEVL; if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); IVSize != 32) { - OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc - : Instruction::ZExt, - OpVPEVL, CanonicalIVPHI->getScalarType()); + OpVPEVL = new VPScalarCastRecipe( + IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL, + CanonicalIVPHI->getScalarType(), CanonicalIVIncrement->getDebugLoc()); OpVPEVL->insertBefore(CanonicalIVIncrement); } auto *NextEVLIV = @@ -1898,7 +1962,7 @@ void VPlanTransforms::handleUncountableEarlyExit( if (OrigLoop->getUniqueExitBlock()) { VPEarlyExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]); } else { - VPEarlyExitBlock = VPIRBasicBlock::fromBasicBlock( + VPEarlyExitBlock = Plan.createVPIRBasicBlock( !OrigLoop->contains(TrueSucc) ? 
TrueSucc : FalseSucc); } @@ -1908,7 +1972,7 @@ void VPlanTransforms::handleUncountableEarlyExit( IsEarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond}); - VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split"); + VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split"); VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle); VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock); NewMiddle->swapSuccessors(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 9657770..7779442 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -49,7 +49,8 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) { return all_of(GEP->operands(), isUniformAfterVectorization); if (auto *VPI = dyn_cast<VPInstruction>(Def)) return VPI->isSingleScalar() || VPI->isVectorToScalar(); - return false; + // VPExpandSCEVRecipes must be placed in the entry and are alway uniform. + return isa<VPExpandSCEVRecipe>(Def); } /// Return true if \p V is a header mask in \p Plan. diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index ecbc13d..1a669b5 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -128,6 +128,8 @@ private: bool shrinkType(Instruction &I); void replaceValue(Value &Old, Value &New) { + LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); + LLVM_DEBUG(dbgs() << " With: " << New << '\n'); Old.replaceAllUsesWith(&New); if (auto *NewI = dyn_cast<Instruction>(&New)) { New.takeName(&Old); @@ -139,10 +141,17 @@ private: void eraseInstruction(Instruction &I) { LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n'); - for (Value *Op : I.operands()) - Worklist.pushValue(Op); + SmallVector<Value *> Ops(I.operands()); Worklist.remove(&I); I.eraseFromParent(); + + // Push remaining users of the operands and then the operand itself - allows + // further folds that were hindered by OneUse limits. + for (Value *Op : Ops) + if (auto *OpI = dyn_cast<Instruction>(Op)) { + Worklist.pushUsersToWorkList(*OpI); + Worklist.pushValue(OpI); + } } }; } // namespace @@ -696,7 +705,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { InstructionCost NewCost = TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + - TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind); + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask, + CostKind); bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; // If the lengths of the two vectors are not equal, @@ -1335,6 +1345,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) { MemoryLocation::get(SI), AA)) return false; + // Ensure we add the load back to the worklist BEFORE its users so they can + // erased in the correct order. 
+ Worklist.push(Load); + if (ScalarizableIdx.isSafeWithFreeze()) ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx)); Value *GEP = Builder.CreateInBoundsGEP( @@ -1360,8 +1374,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!match(&I, m_Load(m_Value(Ptr)))) return false; - auto *VecTy = cast<VectorType>(I.getType()); auto *LI = cast<LoadInst>(&I); + auto *VecTy = cast<VectorType>(LI->getType()); if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType())) return false; @@ -1401,7 +1415,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { LastCheckedInst = UI; } - auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT); + auto ScalarIdx = + canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT); if (ScalarIdx.isUnsafe()) return false; if (ScalarIdx.isSafeWithFreeze()) { @@ -1409,7 +1424,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { ScalarIdx.discard(); } - auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1)); + auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand()); OriginalCost += TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, Index ? Index->getZExtValue() : -1); @@ -1422,10 +1437,14 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (ScalarizedCost >= OriginalCost) return false; + // Ensure we add the load back to the worklist BEFORE its users so they can + // erased in the correct order. + Worklist.push(LI); + // Replace extracts with narrow scalar loads. for (User *U : LI->users()) { auto *EI = cast<ExtractElementInst>(U); - Value *Idx = EI->getOperand(1); + Value *Idx = EI->getIndexOperand(); // Insert 'freeze' for poison indexes. auto It = NeedFreeze.find(EI); @@ -1669,7 +1688,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { Value *X, *Y, *Z, *W; bool IsCommutative = false; - CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; + CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE; + CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE; if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) && match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) { auto *BO = cast<BinaryOperator>(LHS); @@ -1677,8 +1697,9 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem()) return false; IsCommutative = BinaryOperator::isCommutative(BO->getOpcode()); - } else if (match(LHS, m_Cmp(Pred, m_Value(X), m_Value(Y))) && - match(RHS, m_SpecificCmp(Pred, m_Value(Z), m_Value(W)))) { + } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) && + match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) && + (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) { IsCommutative = cast<CmpInst>(LHS)->isCommutative(); } else return false; @@ -1723,18 +1744,48 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I); + // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns + // where one use shuffles have gotten split across the binop/cmp. These + // often allow a major reduction in total cost that wouldn't happen as + // individual folds. 
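// Editorial sketch (not part of the patch): what the MergeInner lambda that
// follows does to the masks, shown on plain integers. Outer-mask elements in
// [Offset, Offset + NumSrcElts) select lanes of the inner shuffle's result, so
// they are redirected through InnerMask and the one-use inner shuffle can go
// away. Poison lanes use -1, as in llvm::ShuffleVectorInst masks; the concrete
// mask values in the example are assumptions for illustration.
#include <vector>

static void foldInnerMask(std::vector<int> &OuterMask,
                          const std::vector<int> &InnerMask, int Offset,
                          int NumSrcElts) {
  for (int &M : OuterMask)
    if (Offset <= M && M < Offset + NumSrcElts) {
      M = InnerMask[M - Offset]; // look through the inner shuffle
      M = 0 <= M ? M + Offset : M; // keep poison (-1) as poison
    }
}

// Example: NumSrcElts = 4, Offset = 0 (first operand of the outer shuffle),
// InnerMask = {2, 3, 0, 1}, OuterMask = {0, 1, 4, 5}. After folding,
// OuterMask = {2, 3, 4, 5}: lanes 0 and 1 now read the inner shuffle's source
// directly, and the inner shuffle becomes dead.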
+ auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask, + TTI::TargetCostKind CostKind) -> bool { + Value *InnerOp; + ArrayRef<int> InnerMask; + if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(), + m_Mask(InnerMask)))) && + InnerOp->getType() == Op->getType() && + all_of(InnerMask, + [NumSrcElts](int M) { return M < (int)NumSrcElts; })) { + for (int &M : Mask) + if (Offset <= M && M < (int)(Offset + NumSrcElts)) { + M = InnerMask[M - Offset]; + M = 0 <= M ? M + Offset : M; + } + OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind); + Op = InnerOp; + return true; + } + return false; + }; + bool ReducedInstCount = false; + ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind); + ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind); + ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind); + ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind); + InstructionCost NewCost = TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) + TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}); - if (Pred == CmpInst::BAD_ICMP_PREDICATE) { + if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) { NewCost += TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind); } else { auto *ShuffleCmpTy = FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy); NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, - ShuffleDstTy, Pred, CostKind); + ShuffleDstTy, PredLHS, CostKind); } LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I @@ -1743,17 +1794,17 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { // If either shuffle will constant fold away, then fold for the same cost as // we will reduce the instruction count. - bool ReducedInstCount = (isa<Constant>(X) && isa<Constant>(Z)) || - (isa<Constant>(Y) && isa<Constant>(W)); + ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) || + (isa<Constant>(Y) && isa<Constant>(W)); if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost)) return false; Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0); Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1); - Value *NewBO = Pred == CmpInst::BAD_ICMP_PREDICATE + Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE ? Builder.CreateBinOp( cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1) - : Builder.CreateCmp(Pred, Shuf0, Shuf1); + : Builder.CreateCmp(PredLHS, Shuf0, Shuf1); // Intersect flags from the old binops. if (auto *NewInst = dyn_cast<Instruction>(NewBO)) { @@ -1972,9 +2023,7 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { if (Match1) InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind); - InstructionCost OuterCost = TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, OuterMask, CostKind, - 0, nullptr, {OuterV0, OuterV1}, &I); + InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind); InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost; @@ -3047,12 +3096,16 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) { TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx); InstructionCost OldCost = ExtCost + InsCost; - InstructionCost NewCost = TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0, - nullptr, {DstVec, SrcVec}); + // Ignore 'free' identity insertion shuffle. + // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
+ InstructionCost NewCost = 0; + if (!ShuffleVectorInst::isIdentityMask(Mask, NumElts)) + NewCost += TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0, nullptr, + {DstVec, SrcVec}); if (!Ext->hasOneUse()) NewCost += ExtCost; - LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair : " << I + LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost << "\n"); |
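
The patched VectorCombine::eraseInstruction above captures the erased instruction's operands, then requeues each operand's remaining users followed by the operand itself, so folds that were previously blocked by one-use checks can fire once the extra use disappears. The following standalone C++ sketch models that requeueing order on a toy def-use graph; Node, Worklist and eraseNode are invented names for this example and are not LLVM APIs.

// Standalone sketch (not LLVM code): models the worklist requeueing order used
// by the patched VectorCombine::eraseInstruction.
#include <algorithm>
#include <deque>
#include <iostream>
#include <string>
#include <vector>

struct Node {
  std::string Name;
  std::vector<Node *> Operands; // values this node reads
  std::vector<Node *> Users;    // nodes that read this node
};

struct Worklist {
  std::deque<Node *> Queue;
  void push(Node *N) { Queue.push_back(N); }
  void remove(Node *N) {
    Queue.erase(std::remove(Queue.begin(), Queue.end(), N), Queue.end());
  }
};

// Unlink N from its operands, then requeue each operand's remaining users
// followed by the operand itself, mirroring the users-then-operand order in
// the patch so one-use-limited folds can retrigger.
void eraseNode(Node *N, Worklist &WL) {
  std::vector<Node *> Ops = N->Operands; // capture before unlinking
  WL.remove(N);
  for (Node *Op : Ops) {
    auto &U = Op->Users;
    U.erase(std::remove(U.begin(), U.end(), N), U.end());
  }
  for (Node *Op : Ops) {
    for (Node *User : Op->Users)
      WL.push(User);
    WL.push(Op);
  }
}

int main() {
  Node A{"a"}, B{"b"}, C{"c"};
  A.Users = {&B, &C};
  B.Operands = {&A};
  C.Operands = {&A};
  Worklist WL;
  eraseNode(&B, WL); // erasing b requeues c (a's remaining user), then a
  for (Node *N : WL.Queue)
    std::cout << N->Name << '\n'; // prints: c a
}

The same ordering concern explains the "push the load back BEFORE its users" comments in foldSingleElementStore and scalarizeLoadExtract: the defining value is revisited only after the users that will be erased first.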
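
The MergeInner lambda added to foldShuffleOfBinops folds a one-use inner shuffle into the outer shuffle's masks by remapping every outer mask element that selects from that operand. The sketch below reproduces just the mask arithmetic with plain std::vector<int> masks, where -1 stands for a poison lane; mergeInnerMask and its Offset/NumElts handling are simplified stand-ins for illustration, not the LLVM implementation.

// Standalone sketch (not LLVM code): folds a single-source inner shuffle mask
// into an outer two-source shuffle mask, as MergeInner does conceptually.
#include <cassert>
#include <iostream>
#include <vector>

// Rewrite OuterMask in place so it reads directly from the inner shuffle's
// source instead of the inner shuffle's result. Outer mask elements in
// [Offset, Offset + NumElts) select lanes of the inner shuffle. Returns false
// if the inner mask reads from a second vector (not handled here).
bool mergeInnerMask(std::vector<int> &OuterMask,
                    const std::vector<int> &InnerMask, int Offset) {
  int NumElts = static_cast<int>(InnerMask.size());
  for (int M : InnerMask)
    if (M >= NumElts) // inner shuffle reads a second source: bail out
      return false;
  for (int &M : OuterMask)
    if (M >= Offset && M < Offset + NumElts) {
      int Inner = InnerMask[M - Offset];
      M = Inner < 0 ? -1 : Inner + Offset; // propagate poison lanes
    }
  return true;
}

int main() {
  // Outer shuffle takes lanes {0,1} from its first operand and {4,5} from its
  // second (4 elements per source); the first operand is itself a reverse
  // shuffle of some vector x, so those lanes can be read from x directly.
  std::vector<int> Outer = {0, 1, 4, 5};
  std::vector<int> InnerReverse = {3, 2, 1, 0};
  bool OK = mergeInnerMask(Outer, InnerReverse, /*Offset=*/0);
  assert(OK);
  for (int M : Outer)
    std::cout << M << ' '; // prints: 3 2 4 5
  std::cout << '\n';
}

Called with Offset equal to the source width, the same remapping handles an inner shuffle feeding the second outer operand, which is what the Z and W calls in the patch correspond to.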
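
Finally, foldInsExtVectorToShuffle now treats an identity blend mask as free rather than charging it as a two-source shuffle. A minimal sketch of that cost special-case, assuming a flat unit cost for any non-identity shuffle purely for illustration, is:

// Standalone sketch (not LLVM code): skip the shuffle cost when the mask is an
// identity, as the patched foldInsExtVectorToShuffle does.
#include <iostream>
#include <vector>

// An identity mask of width N maps lane i to i (or leaves it poison, -1).
bool isIdentityMask(const std::vector<int> &Mask) {
  for (int I = 0, E = static_cast<int>(Mask.size()); I != E; ++I)
    if (Mask[I] != -1 && Mask[I] != I)
      return false;
  return true;
}

// Hypothetical per-shuffle cost: identity shuffles are free, anything else is
// charged one unit for the sake of the example.
int shuffleCost(const std::vector<int> &Mask) {
  return isIdentityMask(Mask) ? 0 : 1;
}

int main() {
  std::cout << shuffleCost({0, 1, 2, 3}) << '\n'; // 0: identity, free
  std::cout << shuffleCost({0, 5, 2, 3}) << '\n'; // 1: lane 1 reads the
                                                  //    second source
}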