Diffstat (limited to 'llvm/lib/Transforms'): 32 files changed, 656 insertions, 353 deletions
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 7a95df4..b575d76 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1378,8 +1378,7 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU, IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N); // We can't know the precise weights here, as they would depend on the value // distribution of Call->getArgOperand(1). So we just mark it as "unknown". - setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE); Type *IndexTy = DL.getIndexType(Call->getType()); SmallVector<DominatorTree::UpdateType, 8> Updates; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index d85e4f7..9bdd8cb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -479,7 +479,7 @@ private: const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { auto *Sel = SelectInst::Create(C, S1, S2, NameStr, InsertBefore, nullptr); - setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, F, DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, DEBUG_TYPE, &F); return Sel; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67f837c..b158e0f 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2261,11 +2261,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { } Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { - if (!isa<Constant>(I.getOperand(1))) - return nullptr; + bool IsOtherParamConst = isa<Constant>(I.getOperand(1)); if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { - if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + if (Instruction *NewSel = + FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst)) return NewSel; } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) { if (Instruction *NewPhi = foldOpIntoPhi(I, PN)) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 471c6ec..ceeece4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3903,7 +3903,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. add distinction // is lost when dealing with LLVM intrinsics.) + // + // ZeroPurifies means that multiplying a known-zero with an uninitialized + // value results in an initialized value. This is applicable for integer + // multiplication, but not floating-point (counter-example: NaN). 
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + bool ZeroPurifies, unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); @@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(AccumulatorType == ReturnType); } - FixedVectorType *ImplicitReturnType = ReturnType; + FixedVectorType *ImplicitReturnType = + cast<FixedVectorType>(getShadowTy(ReturnType)); // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { ImplicitReturnType = cast<FixedVectorType>( @@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getNumElements() * ReductionFactor); } - // Multiplying an *initialized* zero by an uninitialized element results in - // an initialized zero element. - // - // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value - // results in an unpoisoned value. We can therefore adapt the visitAnd() - // instrumentation: - // OutShadow = (SaNonZero & SbNonZero) - // | (VaNonZero & SbNonZero) - // | (SaNonZero & VbNonZero) - // where non-zero is checked on a per-element basis (not per bit). - Value *SZero = Constant::getNullValue(Va->getType()); - Value *VZero = Constant::getNullValue(Sa->getType()); - Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); - Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); - Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); - Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); - - Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); - Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); - Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); - // Each element of the vector is represented by a single bit (poisoned or // not) e.g., <8 x i1>. - Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + Value *SaNonZero = IRB.CreateIsNotNull(Sa); + Value *SbNonZero = IRB.CreateIsNotNull(Sb); + Value *And; + if (ZeroPurifies) { + // Multiplying an *initialized* zero by an uninitialized element results + // in an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *VaInt = Va; + Value *VbInt = Vb; + if (!Va->getType()->isIntegerTy()) { + VaInt = CreateAppToShadowCast(IRB, Va); + VbInt = CreateAppToShadowCast(IRB, Vb); + } + + Value *VaNonZero = IRB.CreateIsNotNull(VaInt); + Value *VbNonZero = IRB.CreateIsNotNull(VbInt); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + } else { + And = IRB.CreateOr({SaNonZero, SbNonZero}); + } // Extend <8 x i1> to <8 x i16>. 
// (The real pmadd intrinsic would have computed intermediate values of @@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: case Intrinsic::x86_avx512_pmaddubs_w_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true); break; // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) case Intrinsic::x86_ssse3_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) case Intrinsic::x86_mmx_pmadd_wd: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // AVX Vector Neural Network Instructions: bytes @@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx2_vpdpbuuds_128: case Intrinsic::x86_avx2_vpdpbuuds_256: case Intrinsic::x86_avx10_vpdpbuuds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // AVX Vector Neural Network Instructions: words @@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_vpdpwssds_128: case Intrinsic::x86_avx512_vpdpwssds_256: case Intrinsic::x86_avx512_vpdpwssds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index af53fa0..02f06be 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -734,7 +734,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC(); // Reserve bit 60-63 for other information purpose. - FunctionHash &= 0x0FFFFFFFFFFFFFFF; + FunctionHash &= NamedInstrProfRecord::FUNC_HASH_MASK; if (IsCS) NamedInstrProfRecord::setCSFlagInHash(FunctionHash); LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n" diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 19eccb9..9ffa602 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1796,14 +1796,16 @@ private: // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. 
+ SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { @@ -2092,14 +2094,16 @@ private: // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 019536ca..9070d25 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -72,6 +72,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -105,6 +106,7 @@ STATISTIC( STATISTIC(NumShiftUntilZero, "Number of uncountable loops recognized as 'shift until zero' idiom"); +namespace llvm { bool DisableLIRP::All; static cl::opt<bool, true> DisableLIRPAll("disable-" DEBUG_TYPE "-all", @@ -163,6 +165,10 @@ static cl::opt<bool> ForceMemsetPatternIntrinsic( cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false), cl::Hidden); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + +} // namespace llvm + namespace { class LoopIdiomRecognize { @@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { // The loop trip count check. auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount, CurLoop->getName() + ".ivcheck"); - Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = + !ProfcheckDisableMetadataFixes && + extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights); + + auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + if (HasBranchWeights) { + if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1)) + std::swap(BranchWeights[0], BranchWeights[1]); + // We're not changing the loop profile, so we can reuse the original loop's + // profile. + setBranchWeights(*BI, BranchWeights, + /*IsExpected=*/false); + } + LoopHeaderBB->getTerminator()->eraseFromParent(); // Populate the IV PHI. @@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE, /// %start = <...> /// %extraoffset = <...> /// <...> -/// br label %for.cond +/// br label %loop /// /// loop: -/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ] +/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ] /// %nbits = add nsw i8 %iv, %extraoffset /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0 @@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() { // The loop terminator. 
Builder.SetInsertPoint(LoopHeaderBB->getTerminator()); - Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = + !ProfcheckDisableMetadataFixes && + extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights); + + auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); + if (HasBranchWeights) { + if (InvertedCond) + std::swap(BranchWeights[0], BranchWeights[1]); + // We're not changing the loop profile, so we can reuse the original loop's + // profile. + setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false); + } LoopHeaderBB->getTerminator()->eraseFromParent(); // Populate the IV PHI. diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index a883998..1b770be 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate { /// Return true if the dependence from the store to the load has an /// absolute distance of one. /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L, + const DominatorTree &DT) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); @@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); - int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + int64_t StrideLoad = + getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0); + int64_t StrideStore = + getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0); if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) return false; @@ -287,8 +289,8 @@ public: // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE, L, *DT) && + OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -538,7 +540,7 @@ public: // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. 
- if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT)) continue; assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) && diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index b9546c5..e902b71 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" @@ -393,6 +394,17 @@ private: DTUpdates.push_back({DominatorTree::Insert, Preheader, BB}); ++NumLoopExitsDeleted; } + // We don't really need to add branch weights to DummySwitch, because all + // but one branches are just a temporary artifact - see the comment on top + // of this function. But, it's easy to estimate the weights, and it helps + // maintain a property of the overall compiler - that the branch weights + // don't "just get dropped" accidentally (i.e. profcheck) + if (DummySwitch->getParent()->getParent()->hasProfileData()) { + SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases()); + // default. 100% probability, the rest are dead. + DummyBranchWeights[0] = 1; + setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false); + } assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?"); if (Loop *OuterLoop = LI.getLoopFor(Preheader)) { diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2bda9d8..802ae4e 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { + if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) || + UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 239526e..0f3e664 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" @@ -329,8 +330,7 @@ static void buildPartialUnswitchConditionalBranch( HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof) : nullptr); if (!HasBranchWeights) - setExplicitlyUnknownBranchWeightsIfProfiled( - *BR, *BR->getParent()->getParent(), DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE); } /// Copy a set of loop invariant values, and conditionally branch on them. @@ -388,8 +388,7 @@ static void buildPartialInvariantUnswitchConditionalBranch( IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, Direction ? 
&NormalSucc : &UnswitchedSucc, ProfData); if (!ProfData) - setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE); } /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -2831,9 +2830,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, MSSAU->getMemorySSA()->verifyMemorySSA(); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, - GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); + // llvm.experimental.guard doesn't have branch weights. We can assume, + // however, that the deopt path is unlikely. + Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen( + GI->getArgOperand(0), GI, true, + !ProfcheckDisableMetadataFixes && EstimateProfile + ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights() + : nullptr, + &DTU, &LI); BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. @@ -3197,10 +3201,14 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, Builder.SetInsertPoint(TI); auto *InvariantBr = Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + // We don't know anything about the relation between the limits. + setExplicitlyUnknownBranchWeightsIfProfiled(*InvariantBr, DEBUG_TYPE); Builder.SetInsertPoint(CheckBlock); - Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0), - TI->getSuccessor(1)); + Builder.CreateCondBr( + TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1), + !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof) + : nullptr); TI->eraseFromParent(); // Fixup phis. diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 42b1fdf..8aa8aa2 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -39,36 +39,36 @@ using namespace llvm; STATISTIC(NumBroken, "Number of blocks inserted"); namespace { - struct BreakCriticalEdges : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - BreakCriticalEdges() : FunctionPass(ID) { - initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry()); - } +struct BreakCriticalEdges : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + BreakCriticalEdges() : FunctionPass(ID) { + initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override { - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + bool runOnFunction(Function &F) override { + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>(); - auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr; + auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>(); + auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr; - auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); - auto *LI = LIWP ? 
&LIWP->getLoopInfo() : nullptr; - unsigned N = - SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT)); - NumBroken += N; - return N > 0; - } + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + unsigned N = SplitAllCriticalEdges( + F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT)); + NumBroken += N; + return N > 0; + } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); - // No loop canonicalization guarantees are broken by this pass. - AU.addPreservedID(LoopSimplifyID); - } - }; -} + // No loop canonicalization guarantees are broken by this pass. + AU.addPreservedID(LoopSimplifyID); + } +}; +} // namespace char BreakCriticalEdges::ID = 0; INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges", @@ -76,6 +76,7 @@ INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges", // Publicly exposed interface to pass... char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID; + FunctionPass *llvm::createBreakCriticalEdgesPass() { return new BreakCriticalEdges(); } diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 573a781..02b73e8 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1283,6 +1283,12 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, case LibFunc_ilogbl: case LibFunc_logf: case LibFunc_logl: + case LibFunc_nextafter: + case LibFunc_nextafterf: + case LibFunc_nextafterl: + case LibFunc_nexttoward: + case LibFunc_nexttowardf: + case LibFunc_nexttowardl: case LibFunc_pow: case LibFunc_powf: case LibFunc_powl: diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index 7343c79..9f6d89e 100644 --- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -40,22 +40,22 @@ using namespace llvm; namespace { - struct QuotRemPair { - Value *Quotient; - Value *Remainder; - - QuotRemPair(Value *InQuotient, Value *InRemainder) - : Quotient(InQuotient), Remainder(InRemainder) {} - }; - - /// A quotient and remainder, plus a BB from which they logically "originate". - /// If you use Quotient or Remainder in a Phi node, you should use BB as its - /// corresponding predecessor. - struct QuotRemWithBB { - BasicBlock *BB = nullptr; - Value *Quotient = nullptr; - Value *Remainder = nullptr; - }; +struct QuotRemPair { + Value *Quotient; + Value *Remainder; + + QuotRemPair(Value *InQuotient, Value *InRemainder) + : Quotient(InQuotient), Remainder(InRemainder) {} +}; + +/// A quotient and remainder, plus a BB from which they logically "originate". +/// If you use Quotient or Remainder in a Phi node, you should use BB as its +/// corresponding predecessor. 
+struct QuotRemWithBB { + BasicBlock *BB = nullptr; + Value *Quotient = nullptr; + Value *Remainder = nullptr; +}; using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; using BypassWidthsTy = DenseMap<unsigned, unsigned>; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f..6086615 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. case Attribute::AlwaysInline: diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 0642d51..dd8706c 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -16,22 +16,62 @@ using namespace llvm; +static void mergeAttributes(LLVMContext &Ctx, const Module &M, + const DataLayout &DL, const Triple &TT, + Function *Func, FunctionType *FuncTy, + AttributeList FuncAttrs) { + AttributeList OldAttrs = Func->getAttributes(); + AttributeList NewAttrs = OldAttrs; + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder); + } + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder); + } + + for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) { + AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I)); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I)); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder); + } + + Func->setAttributes(NewAttrs); +} + PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, ModuleAnalysisManager &MAM) { RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + const Triple &TT = M.getTargetTriple(); - for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { - if (Impl == RTLIB::Unsupported) + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (!RTLCI.isAvailable(Impl)) continue; - // TODO: Declare with correct type, calling convention, and attributes. + auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); - FunctionType *FuncTy = - FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); + // TODO: Declare with correct type, calling convention, and attributes. 
+ if (!FuncTy) + FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); StringRef FuncName = RTLCI.getLibcallImplName(Impl); - M.getOrInsertFunction(FuncName, FuncTy); + + Function *Func = + cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee()); + if (Func->getFunctionType() == FuncTy) { + mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs); + Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl)); + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 61ffb49..8da6a980 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -378,7 +378,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, if (P != Preheader) BackedgeBlocks.push_back(P); } - // Create and insert the new backedge block... + // Create and insert the new backedge block. BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(), Header->getName() + ".backedge", F); BranchInst *BETerminator = BranchInst::Create(Header, BEBlock); @@ -737,39 +737,39 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, } namespace { - struct LoopSimplify : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - LoopSimplify() : FunctionPass(ID) { - initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); - } +struct LoopSimplify : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + LoopSimplify() : FunctionPass(ID) { + initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override; + bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); - // We need loop information to identify the loops... - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); + // We need loop information to identify the loops. + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.addPreservedID(LCSSAID); - AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. - AU.addPreserved<BranchProbabilityInfoWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - } + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addPreservedID(LCSSAID); + AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. + AU.addPreserved<BranchProbabilityInfoWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); + } - /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. - void verifyAnalysis() const override; - }; -} + /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. 
+ void verifyAnalysis() const override; +}; +} // namespace char LoopSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", @@ -780,12 +780,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) -// Publicly exposed interface to pass... +// Publicly exposed interface to pass. char &llvm::LoopSimplifyID = LoopSimplify::ID; Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } /// runOnFunction - Run down all loops in the CFG (recursively, but we could do -/// it in any convenient order) inserting preheaders... +/// it in any convenient order) inserting preheaders. /// bool LoopSimplify::runOnFunction(Function &F) { bool Changed = false; diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 1e8f6cc..6c9467b 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// probability of executing at least one more iteration? static BranchProbability probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // OriginalLoopProb == 1 would produce a division by zero in the calculation + // below. The problem is that case indicates an always infinite loop, but a + // remainder loop cannot be calculated at run time if the original loop is + // infinite as infinity % UnrollCount is undefined. We then choose + // probabilities indicating that all remainder loop iterations will always + // execute. + // + // Currently, the remainder loop here is an epilogue, which cannot be reached + // if the original loop is infinite, so the aforementioned choice is + // arbitrary. + // + // FIXME: Branch weights still need to be fixed in the case of prologues + // (issue #135812). In that case, the aforementioned choice seems reasonable + // for the goal of maintaining the original loop's block frequencies. That + // is, an infinite loop's initial iterations are not skipped, and the prologue + // loop body might have unique blocks that execute a finite number of times + // if, for example, the original loop body contains conditionals like i < + // UnrollCount. + if (OriginalLoopProb == BranchProbability::getOne()) + return BranchProbability::getOne(); + // Each of these variables holds the original loop's probability that the // number of iterations it will execute is some m in the specified range. BranchProbability ProbOne = OriginalLoopProb; // 1 <= m diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index ec2e6c1..9c8b6ef 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -109,8 +110,12 @@ void LoopVersioning::versionLoop( // Insert the conditional branch based on the result of the memchecks. 
Instruction *OrigTerm = RuntimeCheckBB->getTerminator(); Builder.SetInsertPoint(OrigTerm); - Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader()); + auto *BI = + Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), + VersionedLoop->getLoopPreheader()); + // We don't know what the probability of executing the versioned vs the + // unversioned variants is. + setExplicitlyUnknownBranchWeightsIfProfiled(*BI, DEBUG_TYPE); OrigTerm->eraseFromParent(); // The loops merge in the original exit block. This is now dominated by the diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index cbc604e..37c048f 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -778,8 +778,10 @@ private: return false; // Add all values from the range to the set - for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) + APInt Tmp = Span.getLower(); + do Vals.push_back(ConstantInt::get(I->getContext(), Tmp)); + while (++Tmp != Span.getUpper()); UsedICmps++; return true; @@ -5212,8 +5214,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, // We don't have any info about this condition. auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB) : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); - setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE); OldTI->eraseFromParent(); @@ -6020,6 +6021,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const DataLayout &DL) { Value *Cond = SI->getCondition(); KnownBits Known = computeKnownBits(Cond, DL, AC, SI); + SmallPtrSet<const Constant *, 4> KnownValues; + bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -6039,15 +6042,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, UniqueSuccessors.push_back(Successor); ++It->second; } - const APInt &CaseVal = Case.getCaseValue()->getValue(); + ConstantInt *CaseC = Case.getCaseValue(); + const APInt &CaseVal = CaseC->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || - (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) { - DeadCases.push_back(Case.getCaseValue()); + (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) || + (IsKnownValuesValid && !KnownValues.contains(CaseC))) { + DeadCases.push_back(CaseC); if (DTU) --NumPerSuccessorCases[Successor]; LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); - } + } else if (IsKnownValuesValid) + KnownValues.erase(CaseC); } // If we can prove that the cases must cover all possible values, the @@ -6058,33 +6064,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); - if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */) { - uint64_t AllNumCases = 1ULL << NumUnknownBits; - if (SI->getNumCases() == AllNumCases) { + if (HasDefault && DeadCases.empty()) { + if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) { createUnreachableSwitchDefault(SI, DTU); return true; } - // When only one case 
value is missing, replace default with that case. - // Eliminating the default branch will provide more opportunities for - // optimization, such as lookup tables. - if (SI->getNumCases() == AllNumCases - 1) { - assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); - IntegerType *CondTy = cast<IntegerType>(Cond->getType()); - if (CondTy->getIntegerBitWidth() > 64 || - !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) - return false; - uint64_t MissingCaseVal = 0; - for (const auto &Case : SI->cases()) - MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); - auto *MissingCase = - cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); - SwitchInstProfUpdateWrapper SIW(*SI); - SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); - createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); - SIW.setSuccessorWeight(0, 0); - return true; + if (NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. + if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + IntegerType *CondTy = cast<IntegerType>(Cond->getType()); + if (CondTy->getIntegerBitWidth() > 64 || + !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) + return false; + + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = cast<ConstantInt>( + ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), + SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, + /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } } @@ -7570,6 +7584,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. 
+ for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. + if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. /// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -7642,19 +7731,24 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, // label. The other is those powers of 2 that don't appear in the case // statement. We don't know the distribution of the values coming in, so // the safest is to split 50-50 the original probability to `default`. - uint64_t OrigDenominator = sum_of(map_range( - Weights, [](const auto &V) { return static_cast<uint64_t>(V); })); + uint64_t OrigDenominator = + sum_of(map_range(Weights, StaticCastTo<uint64_t>)); SmallVector<uint64_t> NewWeights(2); NewWeights[1] = Weights[0] / 2; NewWeights[0] = OrigDenominator - NewWeights[1]; setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false); - - // For the original switch, we reduce the weight of the default by the - // amount by which the previous branch contributes to getting to default, - // and then make sure the remaining weights have the same relative ratio - // wrt eachother. + // The probability of executing the default block stays constant. It was + // p_d = Weights[0] / OrigDenominator + // we rewrite as W/D + // We want to find the probability of the default branch of the switch + // statement. Let's call it X. We have W/D = W/2D + X * (1-W/2D) + // i.e. the original probability is the probability we go to the default + // branch from the BI branch, or we take the default branch on the SI. + // Meaning X = W / (2D - W), or (W/2) / (D - W/2) + // This matches using W/2 for the default branch probability numerator and + // D-W/2 as the denominator. 
+ Weights[0] = NewWeights[1]; uint64_t CasesDenominator = OrigDenominator - Weights[0]; - Weights[0] /= 2; for (auto &W : drop_begin(Weights)) W = NewWeights[0] * static_cast<double>(W) / CasesDenominator; @@ -8037,6 +8131,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index fdfff16..03112c6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, bool CanAddPredicate = !llvm::shouldOptimizeForSize( TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides, - CanAddPredicate, false).value_or(0); + int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides, + CanAddPredicate, false) + .value_or(0); if (Stride == 1 || Stride == -1) return Stride; return 0; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e5c3f17..b7224a3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7550,13 +7550,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, } if (LoadInst *Load = dyn_cast<LoadInst>(I)) return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, - Load->getAlign(), VPIRMetadata(*Load, LVer), - I->getDebugLoc()); + VPIRMetadata(*Load, LVer), I->getDebugLoc()); StoreInst *Store = cast<StoreInst>(I); return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, - Reverse, Store->getAlign(), - VPIRMetadata(*Store, LVer), I->getDebugLoc()); + Reverse, VPIRMetadata(*Store, LVer), + I->getDebugLoc()); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also @@ -7934,6 +7933,26 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))) ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second); } + + // Check that all partial reductions in a chain are only used by other + // partial reductions with the same scale factor. Otherwise we end up creating + // users of scaled reductions where the types of the other operands don't + // match. 
+ for (const auto &[Chain, Scale] : PartialReductionChains) { + auto AllUsersPartialRdx = [ScaleVal = Scale, this](const User *U) { + auto *UI = cast<Instruction>(U); + if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) { + return all_of(UI->users(), [ScaleVal, this](const User *U) { + auto *UI = cast<Instruction>(U); + return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal; + }); + } + return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal || + !OrigLoop->contains(UI->getParent()); + }; + if (!all_of(Chain.Reduction->users(), AllUsersPartialRdx)) + ScaledReductionMap.erase(Chain.Reduction); + } } bool VPRecipeBuilder::getScaledReductions( @@ -8117,11 +8136,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) return tryToWidenMemory(Instr, Operands, Range); - if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) { - if (auto PartialRed = - tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value())) - return PartialRed; - } + if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) + return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()); if (!shouldWiden(Instr, Range)) return nullptr; @@ -8155,9 +8171,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, isa<VPPartialReductionRecipe>(BinOpRecipe)) std::swap(BinOp, Accumulator); - if (ScaleFactor != - vputils::getVFScaleFactor(Accumulator->getDefiningRecipe())) - return nullptr; + assert(ScaleFactor == + vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) && + "all accumulators in chain must have same scale factor"); unsigned ReductionOpcode = Reduction->getOpcode(); if (ReductionOpcode == Instruction::Sub) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index bf3f52c..df835a0 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20996,6 +20996,15 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, return false; })) return std::nullopt; + if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && + EI.UserTE->hasCopyableElements() && + EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() && + all_of(VL, [&](Value *V) { + if (S.isCopyableElement(V)) + return true; + return isUsedOutsideBlock(V); + })) + return std::nullopt; bool HasCopyables = S.areInstructionsWithCopyableElements(); if (((!HasCopyables && doesNotNeedToSchedule(VL)) || all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 428a8f4..dd26a05 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -304,18 +304,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { } bool IsSingleScalar = vputils::isSingleScalar(Def); - VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1); - // Check if there is a scalar value for the selected lane. - if (!hasScalarValue(Def, LastLane)) { - // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and - // VPExpandSCEVRecipes can also be a single scalar. 
- assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe, - VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && - "unexpected recipe found to be invariant"); - IsSingleScalar = true; - LastLane = 0; - } // We need to construct the vector value for a single-scalar value by // broadcasting the scalar to all lanes. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index cfe1f1e..3062e1c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1163,10 +1163,10 @@ public: bool opcodeMayReadOrWriteFromMemory() const; /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool usesFirstLaneOnly(const VPValue *Op) const override; /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override; + bool usesFirstPartOnly(const VPValue *Op) const override; /// Returns true if this VPInstruction produces a scalar value from a vector, /// e.g. by performing a reduction or extracting a lane. @@ -1393,13 +1393,13 @@ public: return true; } - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -1628,7 +1628,7 @@ public: VPSlotTracker &SlotTracker) const override; #endif - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool usesFirstLaneOnly(const VPValue *Op) const override; }; /// A recipe for widening Call instructions using library calls. @@ -1725,7 +1725,9 @@ public: #endif }; -/// A recipe for widening select instructions. +/// A recipe for widening select instructions. Supports both wide vector and +/// single-scalar conditions, matching the behavior of LLVM IR's select +/// instruction. struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenSelectRecipe(SelectInst &I, ArrayRef<VPValue *> Operands) @@ -1765,7 +1767,7 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getCond() && isInvariantCond(); @@ -1831,7 +1833,7 @@ public: #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Op == getOperand(0)) @@ -1868,7 +1870,7 @@ public: void execute(VPTransformState &State) override; - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -1882,7 +1884,7 @@ public: } /// Returns true if the recipe only uses the first part of operand \p Op. 
- bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); assert(getNumOperands() <= 2 && "must have at most two operands"); @@ -1920,14 +1922,14 @@ public: Type *getSourceElementType() const { return SourceElementTy; } - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); assert(getNumOperands() <= 2 && "must have at most two operands"); @@ -2108,7 +2110,7 @@ public: } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // The recipe creates its own wide start value, so it only requests the @@ -2323,7 +2325,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getStartValue(); @@ -2397,7 +2399,7 @@ public: bool isInLoop() const { return IsInLoop; } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return isOrdered() || isInLoop(); @@ -2466,13 +2468,13 @@ public: #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Recursing through Blend recipes only, must terminate at header phi's the // latest. return all_of(users(), - [this](VPUser *U) { return U->onlyFirstLaneUsed(this); }); + [this](VPUser *U) { return U->usesFirstLaneOnly(this); }); } }; @@ -2560,7 +2562,7 @@ public: VPCostContext &Ctx) const override; /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override = 0; + bool usesFirstLaneOnly(const VPValue *Op) const override = 0; /// Returns the number of stored operands of this interleave group. Returns 0 /// for load interleave groups. @@ -2606,7 +2608,7 @@ public: VPSlotTracker &SlotTracker) const override; #endif - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); @@ -2654,7 +2656,7 @@ public: #endif /// The recipe only uses the first lane of the address, and EVL operand. 
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) || @@ -2860,7 +2862,7 @@ public: VPValue *getEVL() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getEVL(); @@ -2922,7 +2924,7 @@ public: bool isPredicated() const { return IsPredicated; } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return isSingleScalar(); @@ -3204,14 +3206,14 @@ protected: VPWidenMemoryRecipe(const char unsigned SC, Instruction &I, std::initializer_list<VPValue *> Operands, - bool Consecutive, bool Reverse, Align Alignment, + bool Consecutive, bool Reverse, const VPIRMetadata &Metadata, DebugLoc DL) : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I), - Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) { + Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive), + Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - assert(isa<VPVectorEndPointerRecipe>(getAddr()) || - !Reverse && - "Reversed acccess without VPVectorEndPointerRecipe address?"); + assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) && + "Reversed acccess without VPVectorEndPointerRecipe address?"); } public: @@ -3271,18 +3273,18 @@ public: struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue { VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse, Align Alignment, + bool Consecutive, bool Reverse, const VPIRMetadata &Metadata, DebugLoc DL) : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive, - Reverse, Alignment, Metadata, DL), + Reverse, Metadata, DL), VPValue(this, &Load) { setMask(Mask); } VPWidenLoadRecipe *clone() override { return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(), - getMask(), Consecutive, Reverse, getAlign(), - *this, getDebugLoc()); + getMask(), Consecutive, Reverse, *this, + getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC); @@ -3297,7 +3299,7 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, #endif /// Returns true if the recipe only uses the first lane of operand \p Op. 
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Widened, consecutive loads operations only demand the first lane of @@ -3313,8 +3315,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL, VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(), - {Addr, &EVL}, L.isConsecutive(), L.isReverse(), - L.getAlign(), L, L.getDebugLoc()), + {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L, + L.getDebugLoc()), VPValue(this, &getIngredient()) { setMask(Mask); } @@ -3338,7 +3340,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Widened loads only demand the first lane of EVL and consecutive loads @@ -3352,16 +3354,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal, VPValue *Mask, bool Consecutive, bool Reverse, - Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL) + const VPIRMetadata &Metadata, DebugLoc DL) : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal}, - Consecutive, Reverse, Alignment, Metadata, DL) { + Consecutive, Reverse, Metadata, DL) { setMask(Mask); } VPWidenStoreRecipe *clone() override { return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(), getStoredValue(), getMask(), Consecutive, - Reverse, getAlign(), *this, getDebugLoc()); + Reverse, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC); @@ -3379,7 +3381,7 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Widened, consecutive stores only demand the first lane of their address, @@ -3396,7 +3398,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(), {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(), - S.isReverse(), S.getAlign(), S, S.getDebugLoc()) { + S.isReverse(), S, S.getDebugLoc()) { setMask(Mask); } @@ -3422,7 +3424,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Op == getEVL()) { @@ -3506,14 +3508,14 @@ public: } /// Returns true if the recipe only uses the first lane of operand \p Op. 
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3588,7 +3590,7 @@ public: } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3698,7 +3700,7 @@ public: VPValue *getStepValue() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3763,7 +3765,7 @@ public: VPValue *getStepValue() const { return getOperand(1); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1ee405a..80cd112 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -659,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) { } case Instruction::Select: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); + Value *Cond = + State.get(getOperand(0), + OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0))); Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); return Builder.CreateSelect(Cond, Op1, Op2, Name); @@ -1274,7 +1276,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { } } -bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { +bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) return vputils::onlyFirstLaneUsed(this); @@ -1323,7 +1325,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { llvm_unreachable("switch should return"); } -bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { +bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode())) return vputils::onlyFirstPartUsed(this); @@ -1690,7 +1692,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { if (!VFTy->getParamType(I.index())->isVectorTy()) Arg = State.get(I.value(), VPLane(0)); else - Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); + Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); Args.push_back(Arg); } @@ -1759,7 +1761,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState 
&State) { State.TTI)) Arg = State.get(I.value(), VPLane(0)); else - Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); + Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), State.TTI)) TysForDecl.push_back(Arg->getType()); @@ -1841,7 +1843,7 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { return Intrinsic::getBaseName(VectorIntrinsicID); } -bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { +bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return all_of(enumerate(operands()), [this, &Op](const auto &X) { auto [Idx, V] = X; @@ -1968,16 +1970,13 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, getOperand(1)->printAsOperand(O, SlotTracker); O << ", "; getOperand(2)->printAsOperand(O, SlotTracker); - O << (isInvariantCond() ? " (condition is loop invariant)" : ""); + O << (vputils::isSingleScalar(getCond()) ? " (condition is single-scalar)" + : ""); } #endif void VPWidenSelectRecipe::execute(VPTransformState &State) { - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - Value *Cond = State.get(getCond(), isInvariantCond()); + Value *Cond = State.get(getCond(), vputils::isSingleScalar(getCond())); Value *Op0 = State.get(getOperand(1)); Value *Op1 = State.get(getOperand(2)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9d9bb14..48bd697 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -91,14 +91,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) { NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/, Load->getAlign(), - VPIRMetadata(*Load), Ingredient.getDebugLoc()); + false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load), + Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) { NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, - Store->getAlign(), VPIRMetadata(*Store), - Ingredient.getDebugLoc()); + VPIRMetadata(*Store), Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) { @@ -154,27 +153,31 @@ static bool sinkScalarOperands(VPlan &Plan) { bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; - auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo, - VPSingleDefRecipe *Candidate) { - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. + SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; + auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList]( + VPBasicBlock *SinkTo, VPValue *Op) { + auto *Candidate = + dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()); + if (!Candidate) + return; + + // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes + // for now. 
if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) - return false; + return; - if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || - Candidate->mayReadOrWriteMemory()) - return false; + if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate)) + return; if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) if (!ScalarVFOnly && RepR->isSingleScalar()) - return false; + return; - return true; + WorkList.insert({SinkTo, Candidate}); }; // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. - SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) @@ -182,14 +185,9 @@ static bool sinkScalarOperands(VPlan &Plan) { VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { - for (VPValue *Op : Recipe.operands()) { - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(VPBB, Def)) - WorkList.insert({VPBB, Def}); - } - } + for (auto &Recipe : *VPBB) + for (VPValue *Op : Recipe.operands()) + InsertIfValidSinkCandidate(VPBB, Op); } // Try to sink each replicate or scalar IV steps recipe in the worklist. @@ -198,15 +196,15 @@ static bool sinkScalarOperands(VPlan &Plan) { VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must have only their first lane used. In + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. In // the latter case, we need to duplicate SinkCandidate. auto UsersOutsideSinkTo = make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { return cast<VPRecipeBase>(U)->getParent() != SinkTo; }); if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) { - return !U->onlyFirstLaneUsed(SinkCandidate); + return !U->usesFirstLaneOnly(SinkCandidate); })) continue; bool NeedsDuplicating = !UsersOutsideSinkTo.empty(); @@ -234,10 +232,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(SinkTo, Def)) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; @@ -1290,6 +1285,15 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { return; } + // Look through broadcast of single-scalar when used as select conditions; in + // that case the scalar condition can be used directly. + if (match(Def, + m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue())) && + vputils::isSingleScalar(C)) { + Def->setOperand(0, C); + return; + } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { if (Phi->getNumOperands() == 1) Phi->replaceAllUsesWith(Phi->getOperand(0)); @@ -4178,6 +4182,59 @@ static bool isAlreadyNarrow(VPValue *VPV) { return RepR && RepR->isSingleScalar(); } +// Convert a wide recipe defining a VPValue \p V feeding an interleave group to +// a narrow variant. 
+static VPValue * +narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) { + auto *R = V->getDefiningRecipe(); + if (!R || NarrowedOps.contains(V)) + return V; + + if (isAlreadyNarrow(V)) + return V; + + if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) { + for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) + WideMember0->setOperand( + Idx, + narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps)); + return V; + } + + if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { + // Narrow interleave group to wide load, as transformed VPlan will only + // process one original iteration. + auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); + auto *L = new VPWidenLoadRecipe( + *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, + /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); + L->insertBefore(LoadGroup); + NarrowedOps.insert(L); + return L; + } + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) { + assert(RepR->isSingleScalar() && + isa<LoadInst>(RepR->getUnderlyingInstr()) && + "must be a single scalar load"); + NarrowedOps.insert(RepR); + return RepR; + } + + auto *WideLoad = cast<VPWidenLoadRecipe>(R); + VPValue *PtrOp = WideLoad->getAddr(); + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) + PtrOp = VecPtr->getOperand(0); + // Narrow wide load to uniform scalar load, as transformed VPlan will only + // process one original iteration. + auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, + /*IsUniform*/ true, + /*Mask*/ nullptr, *WideLoad); + N->insertBefore(WideLoad); + NarrowedOps.insert(N); + return N; +} + void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); @@ -4279,65 +4336,15 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. SmallPtrSet<VPValue *, 4> NarrowedOps; - auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * { - auto *R = V->getDefiningRecipe(); - if (!R || NarrowedOps.contains(V)) - return V; - if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { - // Narrow interleave group to wide load, as transformed VPlan will only - // process one original iteration. - auto *LI = - cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); - auto *L = new VPWidenLoadRecipe( - *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, - /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc()); - L->insertBefore(LoadGroup); - NarrowedOps.insert(L); - return L; - } - - if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) { - assert(RepR->isSingleScalar() && - isa<LoadInst>(RepR->getUnderlyingInstr()) && - "must be a single scalar load"); - NarrowedOps.insert(RepR); - return RepR; - } - auto *WideLoad = cast<VPWidenLoadRecipe>(R); - - VPValue *PtrOp = WideLoad->getAddr(); - if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) - PtrOp = VecPtr->getOperand(0); - // Narrow wide load to uniform scalar load, as transformed VPlan will only - // process one original iteration. - auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, - /*IsUniform*/ true, - /*Mask*/ nullptr, *WideLoad); - N->insertBefore(WideLoad); - NarrowedOps.insert(N); - return N; - }; - // Narrow operation tree rooted at store groups. 
for (auto *StoreGroup : StoreGroups) { - VPValue *Res = nullptr; - VPValue *Member0 = StoreGroup->getStoredValues()[0]; - if (isAlreadyNarrow(Member0)) { - Res = Member0; - } else if (auto *WideMember0 = - dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) { - for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) - WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx))); - Res = WideMember0; - } else { - Res = NarrowOp(Member0); - } - + VPValue *Res = + narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps); auto *SI = cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()); auto *S = new VPWidenStoreRecipe( *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true, - /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc()); + /*Reverse=*/false, {}, StoreGroup->getDebugLoc()); S->insertBefore(StoreGroup); StoreGroup->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index d6a0028..d4b8b72b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -582,7 +582,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { /// Users that only demand the first lane can use the definition for lane /// 0. DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) { - return U.onlyFirstLaneUsed(DefR); + return U.usesFirstLaneOnly(DefR); }); // Update each build vector user that currently has DefR as its only diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index c6380d3..e22c5df 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -18,12 +18,12 @@ using namespace llvm::VPlanPatternMatch; bool vputils::onlyFirstLaneUsed(const VPValue *Def) { return all_of(Def->users(), - [Def](const VPUser *U) { return U->onlyFirstLaneUsed(Def); }); + [Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); }); } bool vputils::onlyFirstPartUsed(const VPValue *Def) { return all_of(Def->users(), - [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); }); + [Def](const VPUser *U) { return U->usesFirstPartOnly(Def); }); } bool vputils::onlyScalarValuesUsed(const VPValue *Def) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 83e3fca..5da7463 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -274,12 +274,12 @@ public: virtual bool usesScalars(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return onlyFirstLaneUsed(Op); + return usesFirstLaneOnly(Op); } /// Returns true if the VPUser only uses the first lane of operand \p Op. /// Conservatively returns false. - virtual bool onlyFirstLaneUsed(const VPValue *Op) const { + virtual bool usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return false; @@ -287,7 +287,7 @@ public: /// Returns true if the VPUser only uses the first part of operand \p Op. /// Conservatively returns false. 
- virtual bool onlyFirstPartUsed(const VPValue *Op) const { + virtual bool usesFirstPartOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 91734a1..34754a1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); + if (isa<VPIRPhi>(UI) && + UI->getNumOperands() != UI->getParent()->getNumPredecessors()) { + errs() << "Phi-like recipe with different number of operands and " + "predecessors.\n"; + return false; + } + if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { for (const auto &[IncomingVPV, IncomingVPBB] : Phi->incoming_values_and_blocks()) { diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d6eb00d..27a8bbd 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { Value *ScalarV = Ext->getOperand(0); if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV), - &DT)) - ScalarV = Builder.CreateFreeze(ScalarV); + &DT)) { + // Check wether all lanes are extracted, all extracts trigger UB + // on poison, and the last extract (and hence all previous ones) + // are guaranteed to execute if Ext executes. If so, we do not + // need to insert a freeze. + SmallDenseSet<ConstantInt *, 8> ExtractedLanes; + bool AllExtractsTriggerUB = true; + ExtractElementInst *LastExtract = nullptr; + BasicBlock *ExtBB = Ext->getParent(); + for (User *U : Ext->users()) { + auto *Extract = cast<ExtractElementInst>(U); + if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) { + AllExtractsTriggerUB = false; + break; + } + ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand())); + if (!LastExtract || LastExtract->comesBefore(Extract)) + LastExtract = Extract; + } + if (ExtractedLanes.size() != DstTy->getNumElements() || + !AllExtractsTriggerUB || + !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(), + LastExtract->getIterator())) + ScalarV = Builder.CreateFreeze(ScalarV); + } ScalarV = Builder.CreateBitCast( ScalarV, IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy))); |
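For readability, here is a standalone restatement of the freeze-elision condition added to VectorCombine::scalarizeExtExtract in the hunk above. The wrapper name canSkipFreeze is illustrative only (the patch inlines this logic directly), and it assumes the caller has already verified that every user of Ext is an extractelement with a constant index, as scalarizeExtExtract does:

#include "llvm/ADT/DenseSet.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns true if the freeze of the extend's scalar source can be skipped:
// every lane of the result is extracted, every extract turns poison into
// immediate UB, and the last extract is guaranteed to execute whenever Ext
// does.
static bool canSkipFreeze(Instruction *Ext, FixedVectorType *DstTy) {
  SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
  ExtractElementInst *LastExtract = nullptr;
  BasicBlock *ExtBB = Ext->getParent();
  for (User *U : Ext->users()) {
    auto *Extract = cast<ExtractElementInst>(U);
    // Each extract must stay in Ext's block and make poison trigger UB.
    if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract))
      return false;
    ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
    if (!LastExtract || LastExtract->comesBefore(Extract))
      LastExtract = Extract;
  }
  // All lanes must be covered, and control flow must be guaranteed to reach
  // the last extract from Ext (no intervening instruction that may not
  // transfer execution to its successor).
  return ExtractedLanes.size() == DstTy->getNumElements() &&
         isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
                                                    LastExtract->getIterator());
}

If any lane is never read, or some extract might not execute, poison from the unfrozen scalar source could flow into a value that is actually observed, so in those cases the freeze is still emitted.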
