Diffstat (limited to 'llvm/lib/Transforms')
30 files changed, 441 insertions, 370 deletions
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 240d089..7b59c39 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -69,7 +69,6 @@ static const char *const CoroIntrinsics[] = { "llvm.coro.async.context.dealloc", "llvm.coro.async.resume", "llvm.coro.async.size.replace", - "llvm.coro.async.store_resume", "llvm.coro.await.suspend.bool", "llvm.coro.await.suspend.handle", "llvm.coro.await.suspend.void", diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 96956481..449d64d 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -66,19 +66,19 @@ static cl::opt<unsigned> MaxCodeSizeGrowth( "Maximum codesize growth allowed per function")); static cl::opt<unsigned> MinCodeSizeSavings( - "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, cl::desc( - "Reject specializations whose codesize savings are less than this" - "much percent of the original function size")); + "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, + cl::desc("Reject specializations whose codesize savings are less than this " + "much percent of the original function size")); static cl::opt<unsigned> MinLatencySavings( "funcspec-min-latency-savings", cl::init(40), cl::Hidden, - cl::desc("Reject specializations whose latency savings are less than this" + cl::desc("Reject specializations whose latency savings are less than this " "much percent of the original function size")); static cl::opt<unsigned> MinInliningBonus( - "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc( - "Reject specializations whose inlining bonus is less than this" - "much percent of the original function size")); + "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, + cl::desc("Reject specializations whose inlining bonus is less than this " + "much percent of the original function size")); static cl::opt<bool> SpecializeOnAddress( "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc( diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 16a80e9..78cd249 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -105,7 +105,7 @@ static cl::opt<int> ColdCCRelFreq( "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::desc( "Maximum block frequency, expressed as a percentage of caller's " - "entry frequency, for a call site to be considered cold for enabling" + "entry frequency, for a call site to be considered cold for enabling " "coldcc")); /// Is this global variable possibly used by a leak checker as a root? If so, diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 1bf7ff4..016db55 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -122,6 +122,20 @@ static cl::opt<unsigned> cl::desc("Max depth to recursively search for missing " "frames through tail calls.")); +// By default enable cloning of callsites involved with recursive cycles +static cl::opt<bool> AllowRecursiveCallsites( + "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, + cl::desc("Allow cloning of callsites involved in recursive cycles")); + +// When disabled, try to detect and prevent cloning of recursive contexts. 
+// This is only necessary until we support cloning through recursive cycles. +// Leave on by default for now, as disabling requires a little bit of compile +// time overhead and doesn't affect correctness, it will just inflate the cold +// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled. +static cl::opt<bool> AllowRecursiveContexts( + "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, + cl::desc("Allow cloning of contexts through recursive cycles")); + namespace llvm { cl::opt<bool> EnableMemProfContextDisambiguation( "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, @@ -1236,9 +1250,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( StackEntryIdToContextNodeMap[StackId] = StackNode; StackNode->OrigStackOrAllocId = StackId; } - auto Ins = StackIdSet.insert(StackId); - if (!Ins.second) - StackNode->Recursive = true; + // Marking a node recursive will prevent its cloning completely, even for + // non-recursive contexts flowing through it. + if (!AllowRecursiveCallsites) { + auto Ins = StackIdSet.insert(StackId); + if (!Ins.second) + StackNode->Recursive = true; + } StackNode->AllocTypes |= (uint8_t)AllocType; PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); PrevNode = StackNode; @@ -1375,8 +1393,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, set_union(CallerEdgeContextIds, Edge->ContextIds); } // Node can have more context ids than callers if some contexts terminate at - // node and some are longer. - assert(NodeContextIds == CallerEdgeContextIds || + // node and some are longer. If we are allowing recursive callsites but + // haven't disabled recursive contexts, this will be violated for + // incompletely cloned recursive cycles, so skip the checking in that case. + assert((AllowRecursiveCallsites && AllowRecursiveContexts) || + NodeContextIds == CallerEdgeContextIds || set_is_subset(CallerEdgeContextIds, NodeContextIds)); } if (Node->CalleeEdges.size()) { @@ -3370,6 +3391,21 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( assert(Node->AllocTypes != (uint8_t)AllocationType::None); + DenseSet<uint32_t> RecursiveContextIds; + // If we are allowing recursive callsites, but have also disabled recursive + // contexts, look for context ids that show up in multiple caller edges. + if (AllowRecursiveCallsites && !AllowRecursiveContexts) { + DenseSet<uint32_t> AllCallerContextIds; + for (auto &CE : Node->CallerEdges) { + // Resize to the largest set of caller context ids, since we know the + // final set will be at least that large. + AllCallerContextIds.reserve(CE->getContextIds().size()); + for (auto Id : CE->getContextIds()) + if (!AllCallerContextIds.insert(Id).second) + RecursiveContextIds.insert(Id); + } + } + // Iterate until we find no more opportunities for disambiguating the alloc // types via cloning. In most cases this loop will terminate once the Node // has a single allocation type, in which case no more cloning is needed. @@ -3394,6 +3430,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( // allocation. 
auto CallerEdgeContextsForAlloc = set_intersection(CallerEdge->getContextIds(), AllocContextIds); + if (!RecursiveContextIds.empty()) + CallerEdgeContextsForAlloc = + set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds); if (CallerEdgeContextsForAlloc.empty()) { ++EI; continue; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index b40ab35..67585e9 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -129,7 +129,7 @@ static cl::opt<bool> PrintModuleBeforeOptimizations( static cl::opt<bool> AlwaysInlineDeviceFunctions( "openmp-opt-inline-device", - cl::desc("Inline all applicible functions on the device."), cl::Hidden, + cl::desc("Inline all applicable functions on the device."), cl::Hidden, cl::init(false)); static cl::opt<bool> diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 603beb3..b978c54 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -162,7 +162,7 @@ static cl::opt<bool> ProfileSampleBlockAccurate( static cl::opt<bool> ProfileAccurateForSymsInList( "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " - "be accurate. It may be overriden by profile-sample-accurate. ")); + "be accurate. It may be overridden by profile-sample-accurate. ")); static cl::opt<bool> ProfileMergeInlinee( "sample-profile-merge-inlinee", cl::Hidden, cl::init(true), @@ -193,9 +193,10 @@ static cl::opt<bool> ProfileSizeInline( // and inline the hot functions (that are skipped in this pass). static cl::opt<bool> DisableSampleLoaderInlining( "disable-sample-loader-inlining", cl::Hidden, cl::init(false), - cl::desc("If true, artifically skip inline transformation in sample-loader " - "pass, and merge (or scale) profiles (as configured by " - "--sample-profile-merge-inlinee).")); + cl::desc( + "If true, artificially skip inline transformation in sample-loader " + "pass, and merge (or scale) profiles (as configured by " + "--sample-profile-merge-inlinee).")); namespace llvm { cl::opt<bool> @@ -255,7 +256,7 @@ static cl::opt<unsigned> PrecentMismatchForStalenessError( static cl::opt<bool> CallsitePrioritizedInline( "sample-profile-prioritized-inline", cl::Hidden, - cl::desc("Use call site prioritized inlining for sample profile loader." + cl::desc("Use call site prioritized inlining for sample profile loader. " "Currently only CSSPGO is supported.")); static cl::opt<bool> UsePreInlinerDecision( diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index e9bb2b8..f82a557 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -39,8 +39,7 @@ static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS, /// This is the complement of getFCmpCode, which turns an opcode and two /// operands into either a FCmp instruction, or a true/false constant. static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS, - InstCombiner::BuilderTy &Builder, - FastMathFlags FMF) { + InstCombiner::BuilderTy &Builder, FMFSource FMF) { FCmpInst::Predicate NewPred; if (Constant *TorF = getPredForFCmpCode(Code, LHS->getType(), NewPred)) return TorF; @@ -514,7 +513,8 @@ static Value *foldLogOpOfMaskedICmpsAsymmetric( /// into a single (icmp(A & X) ==/!= Y). 
static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, bool IsLogical, - InstCombiner::BuilderTy &Builder) { + InstCombiner::BuilderTy &Builder, + const SimplifyQuery &Q) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); std::optional<std::pair<unsigned, unsigned>> MaskPair = @@ -587,93 +587,107 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, return Builder.CreateICmp(NewCC, NewAnd2, A); } - // Remaining cases assume at least that B and D are constant, and depend on - // their actual values. This isn't strictly necessary, just a "handle the - // easy cases for now" decision. const APInt *ConstB, *ConstD; - if (!match(B, m_APInt(ConstB)) || !match(D, m_APInt(ConstD))) - return nullptr; - - if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) { - // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and - // (icmp ne (A & B), B) & (icmp ne (A & D), D) - // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0) - // Only valid if one of the masks is a superset of the other (check "B&D" is - // the same as either B or D). - APInt NewMask = *ConstB & *ConstD; - if (NewMask == *ConstB) - return LHS; - else if (NewMask == *ConstD) - return RHS; - } - - if (Mask & AMask_NotAllOnes) { - // (icmp ne (A & B), B) & (icmp ne (A & D), D) - // -> (icmp ne (A & B), A) or (icmp ne (A & D), A) - // Only valid if one of the masks is a superset of the other (check "B|D" is - // the same as either B or D). - APInt NewMask = *ConstB | *ConstD; - if (NewMask == *ConstB) - return LHS; - else if (NewMask == *ConstD) - return RHS; - } - - if (Mask & (BMask_Mixed | BMask_NotMixed)) { - // Mixed: - // (icmp eq (A & B), C) & (icmp eq (A & D), E) - // We already know that B & C == C && D & E == E. - // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of - // C and E, which are shared by both the mask B and the mask D, don't - // contradict, then we can transform to - // -> (icmp eq (A & (B|D)), (C|E)) - // Currently, we only handle the case of B, C, D, and E being constant. - // We can't simply use C and E because we might actually handle - // (icmp ne (A & B), B) & (icmp eq (A & D), D) - // with B and D, having a single bit set. - - // NotMixed: - // (icmp ne (A & B), C) & (icmp ne (A & D), E) - // -> (icmp ne (A & (B & D)), (C & E)) - // Check the intersection (B & D) for inequality. - // Assume that (B & D) == B || (B & D) == D, i.e B/D is a subset of D/B - // and (B & D) & (C ^ E) == 0, bits of C and E, which are shared by both the - // B and the D, don't contradict. - // Note that we can assume (~B & C) == 0 && (~D & E) == 0, previous - // operation should delete these icmps if it hadn't been met. - - const APInt *OldConstC, *OldConstE; - if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE))) - return nullptr; - - auto FoldBMixed = [&](ICmpInst::Predicate CC, bool IsNot) -> Value * { - CC = IsNot ? CmpInst::getInversePredicate(CC) : CC; - const APInt ConstC = PredL != CC ? *ConstB ^ *OldConstC : *OldConstC; - const APInt ConstE = PredR != CC ? 
*ConstD ^ *OldConstE : *OldConstE; + if (match(B, m_APInt(ConstB)) && match(D, m_APInt(ConstD))) { + if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) { + // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0) + // Only valid if one of the masks is a superset of the other (check "B&D" + // is the same as either B or D). + APInt NewMask = *ConstB & *ConstD; + if (NewMask == *ConstB) + return LHS; + if (NewMask == *ConstD) + return RHS; + } - if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue()) - return IsNot ? nullptr : ConstantInt::get(LHS->getType(), !IsAnd); + if (Mask & AMask_NotAllOnes) { + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // -> (icmp ne (A & B), A) or (icmp ne (A & D), A) + // Only valid if one of the masks is a superset of the other (check "B|D" + // is the same as either B or D). + APInt NewMask = *ConstB | *ConstD; + if (NewMask == *ConstB) + return LHS; + if (NewMask == *ConstD) + return RHS; + } - if (IsNot && !ConstB->isSubsetOf(*ConstD) && !ConstD->isSubsetOf(*ConstB)) + if (Mask & (BMask_Mixed | BMask_NotMixed)) { + // Mixed: + // (icmp eq (A & B), C) & (icmp eq (A & D), E) + // We already know that B & C == C && D & E == E. + // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of + // C and E, which are shared by both the mask B and the mask D, don't + // contradict, then we can transform to + // -> (icmp eq (A & (B|D)), (C|E)) + // Currently, we only handle the case of B, C, D, and E being constant. + // We can't simply use C and E because we might actually handle + // (icmp ne (A & B), B) & (icmp eq (A & D), D) + // with B and D, having a single bit set. + + // NotMixed: + // (icmp ne (A & B), C) & (icmp ne (A & D), E) + // -> (icmp ne (A & (B & D)), (C & E)) + // Check the intersection (B & D) for inequality. + // Assume that (B & D) == B || (B & D) == D, i.e B/D is a subset of D/B + // and (B & D) & (C ^ E) == 0, bits of C and E, which are shared by both + // the B and the D, don't contradict. Note that we can assume (~B & C) == + // 0 && (~D & E) == 0, previous operation should delete these icmps if it + // hadn't been met. + + const APInt *OldConstC, *OldConstE; + if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE))) return nullptr; - APInt BD, CE; - if (IsNot) { - BD = *ConstB & *ConstD; - CE = ConstC & ConstE; - } else { - BD = *ConstB | *ConstD; - CE = ConstC | ConstE; - } - Value *NewAnd = Builder.CreateAnd(A, BD); - Value *CEVal = ConstantInt::get(A->getType(), CE); - return Builder.CreateICmp(CC, CEVal, NewAnd); - }; + auto FoldBMixed = [&](ICmpInst::Predicate CC, bool IsNot) -> Value * { + CC = IsNot ? CmpInst::getInversePredicate(CC) : CC; + const APInt ConstC = PredL != CC ? *ConstB ^ *OldConstC : *OldConstC; + const APInt ConstE = PredR != CC ? *ConstD ^ *OldConstE : *OldConstE; + + if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue()) + return IsNot ? 
nullptr : ConstantInt::get(LHS->getType(), !IsAnd); + + if (IsNot && !ConstB->isSubsetOf(*ConstD) && + !ConstD->isSubsetOf(*ConstB)) + return nullptr; + + APInt BD, CE; + if (IsNot) { + BD = *ConstB & *ConstD; + CE = ConstC & ConstE; + } else { + BD = *ConstB | *ConstD; + CE = ConstC | ConstE; + } + Value *NewAnd = Builder.CreateAnd(A, BD); + Value *CEVal = ConstantInt::get(A->getType(), CE); + return Builder.CreateICmp(CC, CEVal, NewAnd); + }; + + if (Mask & BMask_Mixed) + return FoldBMixed(NewCC, false); + if (Mask & BMask_NotMixed) // can be else also + return FoldBMixed(NewCC, true); + } + } - if (Mask & BMask_Mixed) - return FoldBMixed(NewCC, false); - if (Mask & BMask_NotMixed) // can be else also - return FoldBMixed(NewCC, true); + // (icmp eq (A & B), 0) | (icmp eq (A & D), 0) + // -> (icmp ne (A & (B|D)), (B|D)) + // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) + // -> (icmp eq (A & (B|D)), (B|D)) + // iff B and D is known to be a power of two + if (Mask & Mask_NotAllZeros && + isKnownToBeAPowerOfTwo(B, /*OrZero=*/false, /*Depth=*/0, Q) && + isKnownToBeAPowerOfTwo(D, /*OrZero=*/false, /*Depth=*/0, Q)) { + // If this is a logical and/or, then we must prevent propagation of a + // poison value from the RHS by inserting freeze. + if (IsLogical) + D = Builder.CreateFreeze(D); + Value *Mask = Builder.CreateOr(B, D); + Value *Masked = Builder.CreateAnd(A, Mask); + return Builder.CreateICmp(NewCC, Masked, Mask); } return nullptr; } @@ -776,46 +790,6 @@ foldAndOrOfICmpsWithPow2AndWithZero(InstCombiner::BuilderTy &Builder, return Builder.CreateICmp(Pred, And, Op); } -// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) -// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) -Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, - ICmpInst *RHS, - Instruction *CxtI, - bool IsAnd, - bool IsLogical) { - CmpInst::Predicate Pred = IsAnd ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; - if (LHS->getPredicate() != Pred || RHS->getPredicate() != Pred) - return nullptr; - - if (!match(LHS->getOperand(1), m_Zero()) || - !match(RHS->getOperand(1), m_Zero())) - return nullptr; - - Value *L1, *L2, *R1, *R2; - if (match(LHS->getOperand(0), m_And(m_Value(L1), m_Value(L2))) && - match(RHS->getOperand(0), m_And(m_Value(R1), m_Value(R2)))) { - if (L1 == R2 || L2 == R2) - std::swap(R1, R2); - if (L2 == R1) - std::swap(L1, L2); - - if (L1 == R1 && - isKnownToBeAPowerOfTwo(L2, false, 0, CxtI) && - isKnownToBeAPowerOfTwo(R2, false, 0, CxtI)) { - // If this is a logical and/or, then we must prevent propagation of a - // poison value from the RHS by inserting freeze. - if (IsLogical) - R2 = Builder.CreateFreeze(R2); - Value *Mask = Builder.CreateOr(L2, R2); - Value *Masked = Builder.CreateAnd(L1, Mask); - auto NewPred = IsAnd ? CmpInst::ICMP_EQ : CmpInst::ICMP_NE; - return Builder.CreateICmp(NewPred, Masked, Mask); - } - } - - return nullptr; -} - /// General pattern: /// X & Y /// @@ -1431,8 +1405,7 @@ static Value *matchIsFiniteTest(InstCombiner::BuilderTy &Builder, FCmpInst *LHS, return nullptr; return Builder.CreateFCmpFMF(FCmpInst::getOrderedPredicate(PredR), RHS0, RHS1, - LHS->getFastMathFlags() & - RHS->getFastMathFlags()); + FMFSource::intersect(LHS, RHS)); } Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, @@ -1469,7 +1442,7 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, // Intersect the fast math flags. // TODO: We can union the fast math flags unless this is a logical select. 
return getFCmpValue(NewPred, LHS0, LHS1, Builder, - LHS->getFastMathFlags() & RHS->getFastMathFlags()); + FMFSource::intersect(LHS, RHS)); } // This transform is not valid for a logical select. @@ -1486,8 +1459,8 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, // Ignore the constants because they are obviously not NANs: // (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y) // (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y) - return Builder.CreateFCmpFMF( - PredL, LHS0, RHS0, LHS->getFastMathFlags() & RHS->getFastMathFlags()); + return Builder.CreateFCmpFMF(PredL, LHS0, RHS0, + FMFSource::intersect(LHS, RHS)); } } @@ -3327,12 +3300,6 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsLogical) { const SimplifyQuery Q = SQ.getWithInstruction(&I); - // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) - // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) - // if K1 and K2 are a one-bit mask. - if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, &I, IsAnd, IsLogical)) - return V; - ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0); Value *LHS1 = LHS->getOperand(1), *RHS1 = RHS->getOperand(1); @@ -3359,7 +3326,7 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, // handle (roughly): // (icmp ne (A & B), C) | (icmp ne (A & D), E) // (icmp eq (A & B), C) & (icmp eq (A & D), E) - if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder)) + if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder, Q)) return V; if (Value *V = @@ -4964,8 +4931,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { // (A & B) ^ (A | C) --> A ? ~B : C -- There are 4 commuted variants. 
if (I.getType()->isIntOrIntVectorTy(1) && - match(Op0, m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B)))) && - match(Op1, m_OneUse(m_LogicalOr(m_Value(C), m_Value(D))))) { + match(&I, m_c_Xor(m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B))), + m_OneUse(m_LogicalOr(m_Value(C), m_Value(D)))))) { bool NeedFreeze = isa<SelectInst>(Op0) && isa<SelectInst>(Op1) && B == D; if (B == C || B == D) std::swap(A, B); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 5494c70..c55c40c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2674,9 +2674,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // copysign Mag, (copysign ?, X) --> copysign Mag, X Value *X; if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X)))) { - Value *CopySign = Builder.CreateCopySign( - Mag, X, - II->getFastMathFlags() & cast<Instruction>(Sign)->getFastMathFlags()); + Value *CopySign = + Builder.CreateCopySign(Mag, X, FMFSource::intersect(II, Sign)); return replaceInstUsesWith(*II, CopySign); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 8b23583..2e45725 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2485,9 +2485,8 @@ Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp, // icmp ule i64 (shl X, 32), 8589934592 -> // icmp ule i32 (trunc X, i32), 2 -> // icmp ult i32 (trunc X, i32), 3 - if (auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant( - Pred, ConstantInt::get(ShType->getContext(), C))) { + if (auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant( + Pred, ConstantInt::get(ShType->getContext(), C))) { CmpPred = FlippedStrictness->first; RHSC = cast<ConstantInt>(FlippedStrictness->second)->getValue(); } @@ -3091,12 +3090,12 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp, unsigned BW = C.getBitWidth(); std::bitset<4> Table; auto ComputeTable = [&](bool Op0Val, bool Op1Val) { - int Res = 0; + APInt Res(BW, 0); if (Op0Val) - Res += isa<ZExtInst>(Ext0) ? 1 : -1; + Res += APInt(BW, isa<ZExtInst>(Ext0) ? 1 : -1, /*isSigned=*/true); if (Op1Val) - Res += isa<ZExtInst>(Ext1) ? 1 : -1; - return ICmpInst::compare(APInt(BW, Res, true), C, Pred); + Res += APInt(BW, isa<ZExtInst>(Ext1) ? 
1 : -1, /*isSigned=*/true); + return ICmpInst::compare(Res, C, Pred); }; Table[0] = ComputeTable(false, false); @@ -3280,8 +3279,7 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) { // x sgt C-1 <--> x sge C <--> not(x slt C) auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant( - PredB, cast<Constant>(RHS2)); + getFlippedStrictnessPredicateAndConstant(PredB, cast<Constant>(RHS2)); if (!FlippedStrictness) return false; assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && @@ -6908,79 +6906,6 @@ Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) { return nullptr; } -std::optional<std::pair<CmpPredicate, Constant *>> -InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred, - Constant *C) { - assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && - "Only for relational integer predicates."); - - Type *Type = C->getType(); - bool IsSigned = ICmpInst::isSigned(Pred); - - CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred); - bool WillIncrement = - UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT; - - // Check if the constant operand can be safely incremented/decremented - // without overflowing/underflowing. - auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) { - return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned); - }; - - Constant *SafeReplacementConstant = nullptr; - if (auto *CI = dyn_cast<ConstantInt>(C)) { - // Bail out if the constant can't be safely incremented/decremented. - if (!ConstantIsOk(CI)) - return std::nullopt; - } else if (auto *FVTy = dyn_cast<FixedVectorType>(Type)) { - unsigned NumElts = FVTy->getNumElements(); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = C->getAggregateElement(i); - if (!Elt) - return std::nullopt; - - if (isa<UndefValue>(Elt)) - continue; - - // Bail out if we can't determine if this constant is min/max or if we - // know that this constant is min/max. - auto *CI = dyn_cast<ConstantInt>(Elt); - if (!CI || !ConstantIsOk(CI)) - return std::nullopt; - - if (!SafeReplacementConstant) - SafeReplacementConstant = CI; - } - } else if (isa<VectorType>(C->getType())) { - // Handle scalable splat - Value *SplatC = C->getSplatValue(); - auto *CI = dyn_cast_or_null<ConstantInt>(SplatC); - // Bail out if the constant can't be safely incremented/decremented. - if (!CI || !ConstantIsOk(CI)) - return std::nullopt; - } else { - // ConstantExpr? - return std::nullopt; - } - - // It may not be safe to change a compare predicate in the presence of - // undefined elements, so replace those elements with the first safe constant - // that we found. - // TODO: in case of poison, it is safe; let's replace undefs only. - if (C->containsUndefOrPoisonElement()) { - assert(SafeReplacementConstant && "Replacement constant not set"); - C = Constant::replaceUndefsWith(C, SafeReplacementConstant); - } - - CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred); - - // Increment or decrement the constant. - Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true); - Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne); - - return std::make_pair(NewPred, NewC); -} - /// If we have an icmp le or icmp ge instruction with a constant operand, turn /// it into the appropriate icmp lt or icmp gt instruction. This transform /// allows them to be folded in visitICmpInst. 
@@ -6996,8 +6921,7 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { if (!Op1C) return nullptr; - auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, Op1C); + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C); if (!FlippedStrictness) return nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index b31ae37..f6992119 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -435,9 +435,6 @@ private: Instruction * canonicalizeConditionalNegationViaMathToSelect(BinaryOperator &i); - Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS, - Instruction *CxtI, bool IsAnd, - bool IsLogical = false); Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D, bool InvertFalseVal = false); Value *getSelectCondition(Value *A, Value *B, bool ABIsTheSame); diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 272a194..80308bf 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -765,33 +765,14 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); LoadInst *NewLI = new LoadInst(FirstLI->getType(), NewPN, "", IsVolatile, LoadAlignment); - - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_range, - LLVMContext::MD_invariant_load, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_nonnull, - LLVMContext::MD_align, - LLVMContext::MD_dereferenceable, - LLVMContext::MD_dereferenceable_or_null, - LLVMContext::MD_access_group, - LLVMContext::MD_noundef, - }; - - for (unsigned ID : KnownIDs) - NewLI->setMetadata(ID, FirstLI->getMetadata(ID)); + NewLI->copyMetadata(*FirstLI); // Add all operands to the new PHI and combine TBAA metadata. for (auto Incoming : drop_begin(zip(PN.blocks(), PN.incoming_values()))) { BasicBlock *BB = std::get<0>(Incoming); Value *V = std::get<1>(Incoming); LoadInst *LI = cast<LoadInst>(V); - // FIXME: https://github.com/llvm/llvm-project/issues/121495 - // Call combineMetadataForCSE instead, so that an explicit set of KnownIDs - // doesn't need to be maintained here. - combineMetadata(NewLI, LI, KnownIDs, true); + combineMetadataForCSE(NewLI, LI, true); Value *NewInVal = LI->getOperand(0); if (NewInVal != InVal) InVal = nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 7fd91c7..1eca177 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1689,8 +1689,7 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, return nullptr; // Check the constant we'd have with flipped-strictness predicate. 
- auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, C0); + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0); if (!FlippedStrictness) return nullptr; @@ -1970,8 +1969,7 @@ static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal, Value *RHS; SelectPatternFlavor SPF; const DataLayout &DL = BOp->getDataLayout(); - auto Flipped = - InstCombiner::getFlippedStrictnessPredicateAndConstant(Predicate, C1); + auto Flipped = getFlippedStrictnessPredicateAndConstant(Predicate, C1); if (C3 == ConstantFoldBinaryOpOperands(Opcode, C1, C2, DL)) { SPF = getSelectPattern(Predicate).Flavor; @@ -2823,9 +2821,9 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC, // %cnd = icmp slt i32 %rem, 0 // %add = add i32 %rem, %n // %sel = select i1 %cnd, i32 %add, i32 %rem - if (match(TrueVal, m_Add(m_Specific(RemRes), m_Value(Remainder))) && + if (match(TrueVal, m_c_Add(m_Specific(RemRes), m_Value(Remainder))) && match(RemRes, m_SRem(m_Value(Op), m_Specific(Remainder))) && - IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero*/ true) && + IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero=*/true) && FalseVal == RemRes) return FoldToBitwiseAnd(Remainder); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index f63de1f..2fb60ef 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -939,12 +939,11 @@ Instruction *InstCombinerImpl::foldBinOpShiftWithShift(BinaryOperator &I) { m_OneUse(m_Shift(m_Value(Y), m_Value(Shift))))) return nullptr; if (!match(I.getOperand(1 - ShOpnum), - m_BinOp(m_Value(ShiftedX), m_Value(Mask)))) + m_c_BinOp(m_CombineAnd( + m_OneUse(m_Shift(m_Value(X), m_Specific(Shift))), + m_Value(ShiftedX)), + m_Value(Mask)))) return nullptr; - - if (!match(ShiftedX, m_OneUse(m_Shift(m_Value(X), m_Specific(Shift))))) - return nullptr; - // Make sure we are matching instruction shifts and not ConstantExpr auto *IY = dyn_cast<Instruction>(I.getOperand(ShOpnum)); auto *IX = dyn_cast<Instruction>(ShiftedX); @@ -1822,12 +1821,29 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN, continue; } - // If the only use of phi is comparing it with a constant then we can - // put this comparison in the incoming BB directly after a ucmp/scmp call - // because we know that it will simplify to a single icmp. - const APInt *Ignored; - if (isa<CmpIntrinsic>(InVal) && InVal->hasOneUser() && - match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) { + // Handle some cases that can't be fully simplified, but where we know that + // the two instructions will fold into one. + auto WillFold = [&]() { + if (!InVal->hasOneUser()) + return false; + + // icmp of ucmp/scmp with constant will fold to icmp. + const APInt *Ignored; + if (isa<CmpIntrinsic>(InVal) && + match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) + return true; + + // icmp eq zext(bool), 0 will fold to !bool. 
+ if (isa<ZExtInst>(InVal) && + cast<ZExtInst>(InVal)->getSrcTy()->isIntOrIntVectorTy(1) && + match(&I, + m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(PN), m_Zero()))) + return true; + + return false; + }; + + if (WillFold()) { OpsToMoveUseToIncomingBB.push_back(i); NewPhiValues.push_back(nullptr); continue; diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 530061e..2031728 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -192,7 +192,7 @@ static cl::opt<bool> cl::Hidden); static cl::opt<int> ClHotPercentileCutoff("hwasan-percentile-cutoff-hot", - cl::desc("Hot percentile cuttoff.")); + cl::desc("Hot percentile cutoff.")); static cl::opt<float> ClRandomSkipRate("hwasan-random-rate", diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp index 2418030..f27798c 100644 --- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp +++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp @@ -30,7 +30,7 @@ using namespace llvm; static cl::opt<int> HotPercentileCutoff("lower-allow-check-percentile-cutoff-hot", - cl::desc("Hot percentile cuttoff.")); + cl::desc("Hot percentile cutoff.")); static cl::opt<float> RandomRate("lower-allow-check-random-rate", diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 471086c..db4d62e 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -158,11 +158,11 @@ STATISTIC(NumCoveredBlocks, "Number of basic blocks that were executed"); // Command line option to specify the file to read profile from. This is // mainly used for testing. -static cl::opt<std::string> - PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden, - cl::value_desc("filename"), - cl::desc("Specify the path of profile data file. This is" - "mainly for test purpose.")); +static cl::opt<std::string> PGOTestProfileFile( + "pgo-test-profile-file", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path of profile data file. This is " + "mainly for test purpose.")); static cl::opt<std::string> PGOTestProfileRemappingFile( "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden, cl::value_desc("filename"), @@ -186,7 +186,7 @@ static cl::opt<unsigned> MaxNumAnnotations( // to write to the metadata for a single memop intrinsic. static cl::opt<unsigned> MaxNumMemOPAnnotations( "memop-max-annotations", cl::init(4), cl::Hidden, - cl::desc("Max number of preicise value annotations for a single memop" + cl::desc("Max number of precise value annotations for a single memop" "intrinsic")); // Command line option to control appending FunctionHash to the name of a COMDAT @@ -291,13 +291,13 @@ static cl::opt<bool> PGOVerifyHotBFI( cl::desc("Print out the non-match BFI count if a hot raw profile count " "becomes non-hot, or a cold raw profile count becomes hot. 
" "The print is enabled under -Rpass-analysis=pgo, or " - "internal option -pass-remakrs-analysis=pgo.")); + "internal option -pass-remarks-analysis=pgo.")); static cl::opt<bool> PGOVerifyBFI( "pgo-verify-bfi", cl::init(false), cl::Hidden, cl::desc("Print out mismatched BFI counts after setting profile metadata " "The print is enabled under -Rpass-analysis=pgo, or " - "internal option -pass-remakrs-analysis=pgo.")); + "internal option -pass-remarks-analysis=pgo.")); static cl::opt<unsigned> PGOVerifyBFIRatio( "pgo-verify-bfi-ratio", cl::init(2), cl::Hidden, diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 1961095..9cd81f3 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -70,7 +70,7 @@ namespace { /// violations. struct TypeSanitizer { TypeSanitizer(Module &M); - bool run(Function &F, const TargetLibraryInfo &TLI); + bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI); void instrumentGlobals(Module &M); private: @@ -510,7 +510,8 @@ void collectMemAccessInfo( } } -bool TypeSanitizer::run(Function &F, const TargetLibraryInfo &TLI) { +bool TypeSanitizer::sanitizeFunction(Function &F, + const TargetLibraryInfo &TLI) { // This is required to prevent instrumenting call to __tysan_init from within // the module constructor. if (&F == TysanCtorFunction.getCallee() || &F == TysanGlobalsSetTypeFunction) @@ -876,15 +877,8 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, return true; } -PreservedAnalyses TypeSanitizerPass::run(Function &F, - FunctionAnalysisManager &FAM) { - TypeSanitizer TySan(*F.getParent()); - TySan.run(F, FAM.getResult<TargetLibraryAnalysis>(F)); - return PreservedAnalyses::none(); -} - -PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M, - ModuleAnalysisManager &AM) { +PreservedAnalyses TypeSanitizerPass::run(Module &M, + ModuleAnalysisManager &MAM) { Function *TysanCtorFunction; std::tie(TysanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(M, kTysanModuleCtorName, @@ -894,5 +888,12 @@ PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M, TypeSanitizer TySan(M); TySan.instrumentGlobals(M); appendToGlobalCtors(M, TysanCtorFunction, 0); + + auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + for (Function &F : M) { + const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F); + TySan.sanitizeFunction(F, TLI); + } + return PreservedAnalyses::none(); } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ba1c224..3c82eed 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -128,7 +128,7 @@ static cl::opt<bool, true> static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", - cl::desc("Use loop idiom recognition code size heuristics when compiling" + cl::desc("Use loop idiom recognition code size heuristics when compiling " "with -Os/-Oz"), cl::init(true), cl::Hidden); diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 260cc72..0903488 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -104,7 +104,7 @@ static cl::opt<unsigned> UnrollMaxPercentThresholdBoost( static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze( 
"unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, - cl::desc("Don't allow loop unrolling to simulate more than this number of" + cl::desc("Don't allow loop unrolling to simulate more than this number of " "iterations when checking full unroll profitability")); static cl::opt<unsigned> UnrollCount( diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index f58dcb5..6e91c4f 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -95,7 +95,7 @@ static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable"; /// invariant instructions in a loop. static cl::opt<float> LVInvarThreshold("licm-versioning-invariant-threshold", - cl::desc("LoopVersioningLICM's minimum allowed percentage" + cl::desc("LoopVersioningLICM's minimum allowed percentage " "of possible invariant instructions per loop"), cl::init(25), cl::Hidden); diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index 1d4f561..b499ef8 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -28,8 +28,8 @@ using namespace llvm; namespace llvm { cl::opt<bool> ShouldPreserveAllAttributes( "assume-preserve-all", cl::init(false), cl::Hidden, - cl::desc("enable preservation of all attrbitues. even those that are " - "unlikely to be usefull")); + cl::desc("enable preservation of all attributes. even those that are " + "unlikely to be useful")); cl::opt<bool> EnableKnowledgeRetention( "enable-knowledge-retention", cl::init(false), cl::Hidden, diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index d829864..b3f9f76 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -778,7 +778,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", - false, true) + false, false) // Publicly exposed interface to pass... char &llvm::LoopSimplifyID = LoopSimplify::ID; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 03dc6c1..e367b01 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -96,8 +96,9 @@ using namespace PatternMatch; cl::opt<bool> llvm::RequireAndPreserveDomTree( "simplifycfg-require-and-preserve-domtree", cl::Hidden, - cl::desc("Temorary development switch used to gradually uplift SimplifyCFG " - "into preserving DomTree,")); + cl::desc( + "Temporary development switch used to gradually uplift SimplifyCFG " + "into preserving DomTree,")); // Chosen as 2 so as to be cheap, but still to have enough power to fold // a select, so the "clamp" idiom (of a min followed by a max) will be caught. 
@@ -126,7 +127,7 @@ static cl::opt<bool> HoistLoadsStoresWithCondFaulting( static cl::opt<unsigned> HoistLoadsStoresWithCondFaultingThreshold( "hoist-loads-stores-with-cond-faulting-threshold", cl::Hidden, cl::init(6), - cl::desc("Control the maximal conditonal load/store that we are willing " + cl::desc("Control the maximal conditional load/store that we are willing " "to speculatively execute to eliminate conditional branch " "(default = 6)")); diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 02ec1d5..9e81573 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -324,6 +324,11 @@ private: Instruction *ChainElem, Instruction *ChainBegin, const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets); + /// Merges the equivalence classes if they have underlying objects that differ + /// by one level of indirection (i.e., one is a getelementptr and the other is + /// the base pointer in that getelementptr). + void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const; + /// Collects loads and stores grouped by "equivalence class", where: /// - all elements in an eq class are a load or all are a store, /// - they all load/store the same element size (it's OK to have e.g. i8 and @@ -1305,6 +1310,119 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects( return std::nullopt; } +void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const { + if (EQClasses.size() < 2) // There is nothing to merge. + return; + + // The reduced key has all elements of the ECClassKey except the underlying + // object. Check that EqClassKey has 4 elements and define the reduced key. + static_assert(std::tuple_size_v<EqClassKey> == 4, + "EqClassKey has changed - EqClassReducedKey needs changes too"); + using EqClassReducedKey = + std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */, + std::tuple_element_t<2, EqClassKey> /* Element size */, + std::tuple_element_t<3, EqClassKey> /* IsLoad; */>; + using ECReducedKeyToUnderlyingObjectMap = + MapVector<EqClassReducedKey, + SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>; + + // Form a map from the reduced key (without the underlying object) to the + // underlying objects: 1 reduced key to many underlying objects, to form + // groups of potentially merge-able equivalence classes. 
+ ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap; + bool FoundPotentiallyOptimizableEC = false; + for (const auto &EC : EQClasses) { + const auto &Key = EC.first; + EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key), + std::get<3>(Key)}; + RedKeyToUOMap[RedKey].insert(std::get<0>(Key)); + if (RedKeyToUOMap[RedKey].size() > 1) + FoundPotentiallyOptimizableEC = true; + } + if (!FoundPotentiallyOptimizableEC) + return; + + LLVM_DEBUG({ + dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n"; + for (const auto &EC : EQClasses) { + dbgs() << " Key: {" << EC.first << "}\n"; + for (const auto &Inst : EC.second) + dbgs() << " Inst: " << *Inst << '\n'; + } + }); + LLVM_DEBUG({ + dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n"; + for (const auto &RedKeyToUO : RedKeyToUOMap) { + dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", " + << std::get<1>(RedKeyToUO.first) << ", " + << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> " + << RedKeyToUO.second.size() << " underlying objects:\n"; + for (auto UObject : RedKeyToUO.second) + dbgs() << " " << *UObject << '\n'; + } + }); + + using UObjectToUObjectMap = DenseMap<const Value *, const Value *>; + + // Compute the ultimate targets for a set of underlying objects. + auto GetUltimateTargets = + [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap { + UObjectToUObjectMap IndirectionMap; + for (const auto *UObject : UObjects) { + const unsigned MaxLookupDepth = 1; // look for 1-level indirections only + const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth); + if (UltimateTarget != UObject) + IndirectionMap[UObject] = UltimateTarget; + } + UObjectToUObjectMap UltimateTargetsMap; + for (const auto *UObject : UObjects) { + auto Target = UObject; + auto It = IndirectionMap.find(Target); + for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target)) + Target = It->second; + UltimateTargetsMap[UObject] = Target; + } + return UltimateTargetsMap; + }; + + // For each item in RedKeyToUOMap, if it has more than one underlying object, + // try to merge the equivalence classes. + for (auto &[RedKey, UObjects] : RedKeyToUOMap) { + if (UObjects.size() < 2) + continue; + auto UTMap = GetUltimateTargets(UObjects); + for (const auto &[UObject, UltimateTarget] : UTMap) { + if (UObject == UltimateTarget) + continue; + + EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey), + std::get<2>(RedKey)}; + EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey), + std::get<2>(RedKey)}; + // The entry for KeyFrom is guarantted to exist, unlike KeyTo. Thus, + // request the reference to the instructions vector for KeyTo first. 
+ const auto &VecTo = EQClasses[KeyTo]; + const auto &VecFrom = EQClasses[KeyFrom]; + SmallVector<Instruction *, 8> MergedVec; + std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(), + std::back_inserter(MergedVec), + [](Instruction *A, Instruction *B) { + return A && B && A->comesBefore(B); + }); + EQClasses[KeyTo] = std::move(MergedVec); + EQClasses.erase(KeyFrom); + } + } + LLVM_DEBUG({ + dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n"; + for (const auto &EC : EQClasses) { + dbgs() << " Key: {" << EC.first << "}\n"; + for (const auto &Inst : EC.second) + dbgs() << " Inst: " << *Inst << '\n'; + } + }); +} + EquivalenceClassMap Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, BasicBlock::iterator End) { @@ -1377,6 +1495,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, .emplace_back(&I); } + mergeEquivalenceClasses(Ret); return Ret; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e0f629e..47866da 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8354,17 +8354,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, auto *GEP = dyn_cast<GetElementPtrInst>( Ptr->getUnderlyingValue()->stripPointerCasts()); VPSingleDefRecipe *VectorPtr; - if (Reverse) + if (Reverse) { + // When folding the tail, we may compute an address that we don't in the + // original scalar loop and it may not be inbounds. Drop Inbounds in that + // case. + GEPNoWrapFlags Flags = + (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) + ? GEPNoWrapFlags::none() + : GEPNoWrapFlags::inBounds(); VectorPtr = new VPReverseVectorPointerRecipe( - Ptr, &Plan.getVF(), getLoadStoreType(I), - GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds() - : GEPNoWrapFlags::none(), - I->getDebugLoc()); - else + Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc()); + } else { VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), I->getDebugLoc()); + } Builder.getInsertBlock()->appendRecipe(VectorPtr); Ptr = VectorPtr; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c4582df..36fed89 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -104,6 +104,7 @@ using namespace llvm; using namespace llvm::PatternMatch; using namespace slpvectorizer; +using namespace std::placeholders; #define SV_NAME "slp-vectorizer" #define DEBUG_TYPE "SLP" @@ -4955,6 +4956,37 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); } +/// Correctly creates insert_subvector, checking that the index is multiple of +/// the subvectors length. Otherwise, generates shuffle using \p Generator or +/// using default shuffle. +static Value *createInsertVector( + IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, + function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) { + const unsigned SubVecVF = getNumElements(V->getType()); + if (Index % SubVecVF == 0) { + Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, + Builder.getInt64(Index)); + } else { + // Create shuffle, insertvector requires that index is multiple of + // the subvector length. 
+ const unsigned VecVF = getNumElements(Vec->getType()); + SmallVector<int> Mask(VecVF, PoisonMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0); + for (unsigned I : seq<unsigned>(SubVecVF)) + Mask[I + Index] = I + VecVF; + if (Generator) { + Vec = Generator(Vec, V, Mask); + } else { + // 1. Resize V to the size of Vec. + SmallVector<int> ResizeMask(VecVF, PoisonMaskElem); + std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0); + V = Builder.CreateShuffleVector(V, ResizeMask); + Vec = Builder.CreateShuffleVector(Vec, V, Mask); + } + } + return Vec; +} + BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order, @@ -5355,11 +5387,10 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, SmallPtrSet<Value *, 13> SecondPointers; Value *P1 = Ptr1; Value *P2 = Ptr2; - if (P1 == P2) - return false; unsigned Depth = 0; - while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1) && - Depth <= RecursionMaxDepth) { + while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) { + if (P1 == P2 || Depth > RecursionMaxDepth) + return false; FirstPointers.insert(P1); SecondPointers.insert(P2); P1 = getUnderlyingObject(P1, /*MaxLookup=*/1); @@ -13884,9 +13915,8 @@ Value *BoUpSLP::gather( Instruction *InsElt; if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) { assert(SLPReVec && "FixedVectorType is not expected."); - Vec = InsElt = Builder.CreateInsertVector( - Vec->getType(), Vec, Scalar, - Builder.getInt64(Pos * VecTy->getNumElements())); + Vec = InsElt = cast<Instruction>(createInsertVector( + Builder, Vec, Scalar, Pos * getNumElements(VecTy))); auto *II = dyn_cast<IntrinsicInst>(InsElt); if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) return Vec; @@ -14486,23 +14516,10 @@ public: V, SimplifyQuery(*R.DL)); })); unsigned InsertionIndex = Idx * ScalarTyNumElements; - const unsigned SubVecVF = - cast<FixedVectorType>(V->getType())->getNumElements(); - if (InsertionIndex % SubVecVF == 0) { - Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, - Builder.getInt64(InsertionIndex)); - } else { - // Create shuffle, insertvector requires that index is multiple of - // the subvectors length. - const unsigned VecVF = - cast<FixedVectorType>(Vec->getType())->getNumElements(); - SmallVector<int> Mask(VecVF, PoisonMaskElem); - std::iota(Mask.begin(), Mask.end(), 0); - for (unsigned I : seq<unsigned>( - InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements)) - Mask[I] = I - Idx + VecVF; - Vec = createShuffle(Vec, V, Mask); - } + Vec = createInsertVector( + Builder, Vec, V, InsertionIndex, + std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2, + _3)); if (!CommonMask.empty()) { std::iota( std::next(CommonMask.begin(), InsertionIndex), @@ -17748,7 +17765,6 @@ bool BoUpSLP::collectValuesToDemote( BitWidth = std::max(BitWidth, BitWidth1); return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2); }; - using namespace std::placeholders; auto FinalAnalysis = [&]() { if (!IsProfitableToDemote) return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9d7bf97..cfbb4ad 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1351,6 +1351,9 @@ public: } } + /// Returns true if the underlying opcode may read from or write to memory. + bool opcodeMayReadOrWriteFromMemory() const; + /// Returns true if the recipe only uses the first lane of operand \p Op. 
bool onlyFirstLaneUsed(const VPValue *Op) const override; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 76336ae..e54df8bd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -51,24 +51,7 @@ extern cl::opt<unsigned> ForceTargetInstructionCost; bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPInstructionSC: - if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode())) - return false; - switch (cast<VPInstruction>(this)->getOpcode()) { - case Instruction::Or: - case Instruction::ICmp: - case Instruction::Select: - case VPInstruction::AnyOf: - case VPInstruction::Not: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::ExtractFromEnd: - case VPInstruction::FirstOrderRecurrenceSplice: - case VPInstruction::LogicalAnd: - case VPInstruction::PtrAdd: - return false; - default: - return true; - } + return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); case VPInterleaveSC: return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: @@ -115,6 +98,8 @@ bool VPRecipeBase::mayWriteToMemory() const { bool VPRecipeBase::mayReadFromMemory() const { switch (getVPDefID()) { + case VPInstructionSC: + return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); case VPWidenLoadEVLSC: case VPWidenLoadSC: return true; @@ -707,6 +692,26 @@ void VPInstruction::execute(VPTransformState &State) { /*IsScalar*/ GeneratesPerFirstLaneOnly); } +bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { + if (Instruction::isBinaryOp(getOpcode())) + return false; + switch (getOpcode()) { + case Instruction::ICmp: + case Instruction::Select: + case VPInstruction::AnyOf: + case VPInstruction::CalculateTripCountMinusVF: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::ExtractFromEnd: + case VPInstruction::FirstOrderRecurrenceSplice: + case VPInstruction::LogicalAnd: + case VPInstruction::Not: + case VPInstruction::PtrAdd: + return false; + default: + return true; + } +} + bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode())) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 9657770..7779442 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -49,7 +49,8 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) { return all_of(GEP->operands(), isUniformAfterVectorization); if (auto *VPI = dyn_cast<VPInstruction>(Def)) return VPI->isSingleScalar() || VPI->isVectorToScalar(); - return false; + // VPExpandSCEVRecipes must be placed in the entry and are alway uniform. + return isa<VPExpandSCEVRecipe>(Def); } /// Return true if \p V is a header mask in \p Plan. 
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index ea53a1a..1a669b5 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -705,7 +705,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { InstructionCost NewCost = TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + - TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind); + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask, + CostKind); bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; // If the lengths of the two vectors are not equal, @@ -2022,9 +2023,7 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { if (Match1) InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind); - InstructionCost OuterCost = TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, OuterMask, CostKind, - 0, nullptr, {OuterV0, OuterV1}, &I); + InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind); InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;