diff options
Diffstat (limited to 'llvm/lib/Transforms')
40 files changed, 883 insertions, 383 deletions
diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h index e05fe28..1e549f1 100644 --- a/llvm/lib/Transforms/Coroutines/CoroCloner.h +++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h @@ -77,7 +77,7 @@ public: : OrigF(OrigF), Suffix(Suffix), Shape(Shape), FKind(FKind), Builder(OrigF.getContext()), TTI(TTI) {} - virtual ~BaseCloner() {} + virtual ~BaseCloner() = default; /// Create a clone for a continuation lowering. static Function *createClone(Function &OrigF, const Twine &Suffix, diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5048561..a6ac761 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3619,7 +3619,7 @@ struct AAIntraFnReachabilityFunction final return true; RQITy StackRQI(A, From, To, ExclusionSet, false); - typename RQITy::Reachable Result; + RQITy::Reachable Result; if (!NonConstThis->checkQueryCache(A, StackRQI, Result)) return NonConstThis->isReachableImpl(A, StackRQI, /*IsTemporaryRQI=*/true); @@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final // ------------------------ Align Argument Attribute ------------------------ namespace { + static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { @@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, TrackUse = true; return 0; } + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + // Is it appropriate to pull attribute in initialization? + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE); + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) { + unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Align ConstAlign(UINT64_C(1) << ShiftValue); + if (ConstAlign >= AlignAA->getKnownAlign()) + return Align(1).value(); + } + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + default: + break; + } MaybeAlign MA; if (const auto *CB = dyn_cast<CallBase>(I)) { @@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} + ChangeStatus updateImpl(Attributor &A) override { + Instruction *I = getIRPosition().getCtxI(); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + Align Alignment; + bool Valid = false; + + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED); + if (ConstVals && ConstVals->isValidState()) { + unsigned ShiftValue = + std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Alignment = Align(UINT64_C(1) << ShiftValue); + Valid = true; + } + + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA && AlignAA->isValidState()) { + Alignment = std::max(AlignAA->getAssumedAlign(), Alignment); + Valid = true; + } + + if (Valid) + return clampStateAndIndicateChange<StateType>( + this->getState(), + std::min(this->getAssumedAlign(), Alignment).value()); + break; + } + default: + break; + } + } + return Base::updateImpl(A); + }; /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; @@ -10701,7 +10762,7 @@ struct AAInterFnReachabilityFunction auto *NonConstThis = const_cast<AAInterFnReachabilityFunction *>(this); RQITy StackRQI(A, From, To, ExclusionSet, false); - typename RQITy::Reachable Result; + RQITy::Reachable Result; if (!NonConstThis->checkQueryCache(A, StackRQI, Result)) return NonConstThis->isReachableImpl(A, StackRQI, /*IsTemporaryRQI=*/true); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 894d83f..d35ae47 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1034,11 +1034,11 @@ private: } // namespace template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph< ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph< IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; template <> diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index d7eb745..2a87a0f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -208,7 +208,7 @@ namespace KernelInfo { // }; #define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_IDX(Configuration, 0) KERNEL_ENVIRONMENT_IDX(Ident, 1) @@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1) #undef KERNEL_ENVIRONMENT_IDX #define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0) KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1) @@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams) GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) { - constexpr const int InitKernelEnvironmentArgNo = 0; + constexpr int InitKernelEnvironmentArgNo = 0; return cast<GlobalVariable>( KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo) ->stripPointerCasts()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 3ddf182..cbaff29 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3997,6 +3997,27 @@ static Value *foldOrUnsignedUMulOverflowICmp(BinaryOperator &I, return nullptr; } +/// Fold select(X >s 0, 0, -X) | smax(X, 0) --> abs(X) +/// select(X <s 0, -X, 0) | smax(X, 0) --> abs(X) +static Value *FoldOrOfSelectSmaxToAbs(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *X; + Value *Sel; + if (match(&I, + m_c_Or(m_Value(Sel), m_OneUse(m_SMax(m_Value(X), m_ZeroInt()))))) { + auto NegX = m_Neg(m_Specific(X)); + if (match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(X), + m_ZeroInt()), + m_ZeroInt(), NegX)) || + match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(X), + m_ZeroInt()), + NegX, m_ZeroInt()))) + return Builder.CreateBinaryIntrinsic(Intrinsic::abs, X, + Builder.getFalse()); + } + return nullptr; +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. @@ -4545,6 +4566,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V); + if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder)) + return replaceInstUsesWith(I, Res); + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da..9572f9d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelCond->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && + SelCond) { + return SelectInst::Create(C, A, B, "", nullptr, SelCond); + } else if (match(FalseVal, + m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && + SelFVal) { + SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); + NewSI->swapProfMetadata(); + return NewSI; + } else { + return createSelectInstWithUnknownProfile(C, A, B); + } + } return SelectInst::Create(C, A, B); } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelFVal->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && + SelCond) { + SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); + NewSI->swapProfMetadata(); + return NewSI; + } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && + SelFVal) { + return SelectInst::Create(C, B, A, "", nullptr, SelFVal); + } else { + return createSelectInstWithUnknownProfile(C, B, A); + } + } return SelectInst::Create(C, B, A); } } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 471c6ec..ceeece4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3903,7 +3903,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. add distinction // is lost when dealing with LLVM intrinsics.) + // + // ZeroPurifies means that multiplying a known-zero with an uninitialized + // value results in an initialized value. This is applicable for integer + // multiplication, but not floating-point (counter-example: NaN). void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + bool ZeroPurifies, unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); @@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(AccumulatorType == ReturnType); } - FixedVectorType *ImplicitReturnType = ReturnType; + FixedVectorType *ImplicitReturnType = + cast<FixedVectorType>(getShadowTy(ReturnType)); // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { ImplicitReturnType = cast<FixedVectorType>( @@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getNumElements() * ReductionFactor); } - // Multiplying an *initialized* zero by an uninitialized element results in - // an initialized zero element. - // - // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value - // results in an unpoisoned value. We can therefore adapt the visitAnd() - // instrumentation: - // OutShadow = (SaNonZero & SbNonZero) - // | (VaNonZero & SbNonZero) - // | (SaNonZero & VbNonZero) - // where non-zero is checked on a per-element basis (not per bit). - Value *SZero = Constant::getNullValue(Va->getType()); - Value *VZero = Constant::getNullValue(Sa->getType()); - Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); - Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); - Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); - Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); - - Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); - Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); - Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); - // Each element of the vector is represented by a single bit (poisoned or // not) e.g., <8 x i1>. - Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + Value *SaNonZero = IRB.CreateIsNotNull(Sa); + Value *SbNonZero = IRB.CreateIsNotNull(Sb); + Value *And; + if (ZeroPurifies) { + // Multiplying an *initialized* zero by an uninitialized element results + // in an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *VaInt = Va; + Value *VbInt = Vb; + if (!Va->getType()->isIntegerTy()) { + VaInt = CreateAppToShadowCast(IRB, Va); + VbInt = CreateAppToShadowCast(IRB, Vb); + } + + Value *VaNonZero = IRB.CreateIsNotNull(VaInt); + Value *VbNonZero = IRB.CreateIsNotNull(VbInt); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + } else { + And = IRB.CreateOr({SaNonZero, SbNonZero}); + } // Extend <8 x i1> to <8 x i16>. // (The real pmadd intrinsic would have computed intermediate values of @@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: case Intrinsic::x86_avx512_pmaddubs_w_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true); break; // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) case Intrinsic::x86_ssse3_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) case Intrinsic::x86_mmx_pmadd_wd: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // AVX Vector Neural Network Instructions: bytes @@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx2_vpdpbuuds_128: case Intrinsic::x86_avx2_vpdpbuuds_256: case Intrinsic::x86_avx10_vpdpbuuds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // AVX Vector Neural Network Instructions: words @@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_vpdpwssds_128: case Intrinsic::x86_avx512_vpdpwssds_256: case Intrinsic::x86_avx512_vpdpwssds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index 80e77e09..a2fad02 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -161,7 +161,7 @@ template <char NsanTypeId> class ShadowTypeConfigImpl : public ShadowTypeConfig { public: char getNsanTypeId() const override { return NsanTypeId; } - static constexpr const char kNsanTypeId = NsanTypeId; + static constexpr char kNsanTypeId = NsanTypeId; }; // `double` (`d`) shadow type. diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index 89980d5..a577f51 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -122,7 +122,8 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) { Value *Cond = Assume->getArgOperand(0); // Don't drop type tests, which have special semantics. - if (match(Cond, m_Intrinsic<Intrinsic::type_test>())) + if (match(Cond, m_Intrinsic<Intrinsic::type_test>()) || + match(Cond, m_Intrinsic<Intrinsic::public_type_test>())) continue; SmallVector<Value *> Affected; diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index a06f832..d564e32 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -514,7 +514,7 @@ public: class GVNSink { public: - GVNSink() {} + GVNSink() = default; bool run(Function &F) { LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 19eccb9..9ffa602 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1796,14 +1796,16 @@ private: // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { @@ -2092,14 +2094,16 @@ private: // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index a883998..1b770be 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate { /// Return true if the dependence from the store to the load has an /// absolute distance of one. /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L, + const DominatorTree &DT) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); @@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); - int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + int64_t StrideLoad = + getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0); + int64_t StrideStore = + getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0); if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) return false; @@ -287,8 +289,8 @@ public: // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE, L, *DT) && + OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -538,7 +540,7 @@ public: // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. - if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT)) continue; assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) && diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2bda9d8..802ae4e 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { + if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) || + UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 3487e81..7e70ba2 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -245,11 +245,14 @@ raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) { } // namespace -static bool isUniformShape(Value *V) { +static bool isShapePreserving(Value *V) { Instruction *I = dyn_cast<Instruction>(V); if (!I) return true; + if (isa<SelectInst>(I)) + return true; + if (I->isBinaryOp()) return true; @@ -300,6 +303,16 @@ static bool isUniformShape(Value *V) { } } +/// Return an iterator over the operands of \p I that should share shape +/// information with \p I. +static iterator_range<Use *> getShapedOperandsForInst(Instruction *I) { + assert(isShapePreserving(I) && + "Can't retrieve shaped operands for an instruction that does not " + "preserve shape information"); + auto Ops = I->operands(); + return isa<SelectInst>(I) ? drop_begin(Ops) : Ops; +} + /// Return the ShapeInfo for the result of \p I, it it can be determined. static std::optional<ShapeInfo> computeShapeInfoForInst(Instruction *I, @@ -329,9 +342,8 @@ computeShapeInfoForInst(Instruction *I, return OpShape->second; } - if (isUniformShape(I) || isa<SelectInst>(I)) { - auto Ops = I->operands(); - auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops; + if (isShapePreserving(I)) { + auto ShapedOps = getShapedOperandsForInst(I); // Find the first operand that has a known shape and use that. for (auto &Op : ShapedOps) { auto OpShape = ShapeMap.find(Op.get()); @@ -710,10 +722,9 @@ public: case Intrinsic::matrix_column_major_store: return true; default: - return isUniformShape(II); + break; } - return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) || - isa<SelectInst>(V); + return isShapePreserving(V) || isa<StoreInst>(V) || isa<LoadInst>(V); } /// Propagate the shape information of instructions to their users. @@ -800,9 +811,8 @@ public: } else if (isa<StoreInst>(V)) { // Nothing to do. We forward-propagated to this so we would just // backward propagate to an instruction with an already known shape. - } else if (isUniformShape(V) || isa<SelectInst>(V)) { - auto Ops = cast<Instruction>(V)->operands(); - auto ShapedOps = isa<SelectInst>(V) ? drop_begin(Ops) : Ops; + } else if (isShapePreserving(V)) { + auto ShapedOps = getShapedOperandsForInst(cast<Instruction>(V)); // Propagate to all operands. ShapeInfo Shape = ShapeMap[V]; for (Use &U : ShapedOps) { diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index bb6c879..0577ddb 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" @@ -337,7 +338,7 @@ static void buildPartialUnswitchConditionalBranch( static void buildPartialInvariantUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) { ValueToValueMapTy VMap; for (auto *Val : reverse(ToDuplicate)) { Instruction *Inst = cast<Instruction>(Val); @@ -377,8 +378,19 @@ static void buildPartialInvariantUnswitchConditionalBranch( IRBuilder<> IRB(&BB); IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); Value *Cond = VMap[ToDuplicate[0]]; - IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, - Direction ? &NormalSucc : &UnswitchedSucc); + // The expectation is that ToDuplicate[0] is the condition used by the + // OriginalBranch, case in which we can clone the profile metadata from there. + auto *ProfData = + !ProfcheckDisableMetadataFixes && + ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition()) + ? OriginalBranch.getMetadata(LLVMContext::MD_prof) + : nullptr; + auto *BR = + IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc, ProfData); + if (!ProfData) + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(), + DEBUG_TYPE); } /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -2515,7 +2527,7 @@ static void unswitchNontrivialInvariants( // the branch in the split block. if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI); else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, @@ -2820,9 +2832,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, MSSAU->getMemorySSA()->verifyMemorySSA(); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, - GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); + // llvm.experimental.guard doesn't have branch weights. We can assume, + // however, that the deopt path is unlikely. + Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen( + GI->getArgOperand(0), GI, true, + !ProfcheckDisableMetadataFixes && EstimateProfile + ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights() + : nullptr, + &DTU, &LI); BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. @@ -3186,10 +3203,15 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, Builder.SetInsertPoint(TI); auto *InvariantBr = Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + // We don't know anything about the relation between the limits. + setExplicitlyUnknownBranchWeightsIfProfiled( + *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE); Builder.SetInsertPoint(CheckBlock); - Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0), - TI->getSuccessor(1)); + Builder.CreateCondBr( + TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1), + !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof) + : nullptr); TI->eraseFromParent(); // Fixup phis. diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 0f3978f..0a8f5ea 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -143,8 +143,8 @@ struct SubGraphTraits { class WrappedSuccIterator : public iterator_adaptor_base< WrappedSuccIterator, BaseSuccIterator, - typename std::iterator_traits<BaseSuccIterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + std::iterator_traits<BaseSuccIterator>::iterator_category, NodeRef, + std::ptrdiff_t, NodeRef *, NodeRef> { SmallDenseSet<RegionNode *> *Nodes; public: @@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - - for (BasicBlock *Succ : Term->successors()) - if (Visited.count(Succ)) - Loops[Succ] = BB; + if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) + Loops[Succ] = BB; } } @@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry - if (!ParentRegion->contains(P)) + if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator())) continue; Region *R = RI->getRegionFor(P); @@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { /// Run the transformation for each region found bool StructurizeCFG::run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI) { - if (R->isTopLevelRegion()) + // CallBr and its corresponding direct target blocks are for now ignored by + // this pass. This is not a limitation for the currently intended uses cases + // of callbr in the AMDGPU backend. + // Parent and child regions are not affected by this (current) restriction. + // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details. + if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f..6086615 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. case Attribute::AlwaysInline: diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 0642d51..6d4436b 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -16,22 +16,62 @@ using namespace llvm; +static void mergeAttributes(LLVMContext &Ctx, const Module &M, + const DataLayout &DL, const Triple &TT, + Function *Func, FunctionType *FuncTy, + AttributeList FuncAttrs) { + AttributeList OldAttrs = Func->getAttributes(); + AttributeList NewAttrs = OldAttrs; + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder); + } + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder); + } + + for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) { + AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I)); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I)); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder); + } + + Func->setAttributes(NewAttrs); +} + PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, ModuleAnalysisManager &MAM) { RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + const Triple &TT = M.getTargetTriple(); for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { if (Impl == RTLIB::Unsupported) continue; - // TODO: Declare with correct type, calling convention, and attributes. + auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); - FunctionType *FuncTy = - FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); + // TODO: Declare with correct type, calling convention, and attributes. + if (!FuncTy) + FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); StringRef FuncName = RTLCI.getLibcallImplName(Impl); - M.getOrInsertFunction(FuncName, FuncTy); + + Function *Func = + cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee()); + if (Func->getFunctionType() == FuncTy) { + mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs); + Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl)); + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 46f2903..a03cf6e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3416,7 +3416,11 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, // Create integer constant expression. auto createIntegerExpression = [&DIB](const Constant &CV) -> DIExpression * { const APInt &API = cast<ConstantInt>(&CV)->getValue(); - std::optional<int64_t> InitIntOpt = API.trySExtValue(); + std::optional<int64_t> InitIntOpt; + if (API.getBitWidth() == 1) + InitIntOpt = API.tryZExtValue(); + else + InitIntOpt = API.trySExtValue(); return InitIntOpt ? DIB.createConstantValueExpression( static_cast<uint64_t>(*InitIntOpt)) : nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 1e8f6cc..6c9467b 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// probability of executing at least one more iteration? static BranchProbability probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // OriginalLoopProb == 1 would produce a division by zero in the calculation + // below. The problem is that case indicates an always infinite loop, but a + // remainder loop cannot be calculated at run time if the original loop is + // infinite as infinity % UnrollCount is undefined. We then choose + // probabilities indicating that all remainder loop iterations will always + // execute. + // + // Currently, the remainder loop here is an epilogue, which cannot be reached + // if the original loop is infinite, so the aforementioned choice is + // arbitrary. + // + // FIXME: Branch weights still need to be fixed in the case of prologues + // (issue #135812). In that case, the aforementioned choice seems reasonable + // for the goal of maintaining the original loop's block frequencies. That + // is, an infinite loop's initial iterations are not skipped, and the prologue + // loop body might have unique blocks that execute a finite number of times + // if, for example, the original loop body contains conditionals like i < + // UnrollCount. + if (OriginalLoopProb == BranchProbability::getOne()) + return BranchProbability::getOne(); + // Each of these variables holds the original loop's probability that the // number of iterations it will execute is some m in the specified range. BranchProbability ProbOne = OriginalLoopProb; // 1 <= m diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 8be471b..6e60b94 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -992,9 +992,12 @@ BranchProbability llvm::getBranchProbability(BranchInst *B, uint64_t Weight0, Weight1; if (!extractBranchWeights(*B, Weight0, Weight1)) return BranchProbability::getUnknown(); + uint64_t Denominator = Weight0 + Weight1; + if (Denominator == 0) + return BranchProbability::getUnknown(); if (!ForFirstTarget) std::swap(Weight0, Weight1); - return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1); + return BranchProbability::getBranchProbability(Weight0, Denominator); } bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 6addcfa..3a3e3ad 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -778,8 +778,10 @@ private: return false; // Add all values from the range to the set - for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) + APInt Tmp = Span.getLower(); + do Vals.push_back(ConstantInt::get(I->getContext(), Tmp)); + while (++Tmp != Span.getUpper()); UsedICmps++; return true; @@ -5956,7 +5958,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Update weight for the newly-created conditional branch. - if (hasBranchWeightMD(*SI)) { + if (hasBranchWeightMD(*SI) && NewBI->isConditional()) { SmallVector<uint64_t, 8> Weights; getBranchWeights(SI, Weights); if (Weights.size() == 1 + SI->getNumCases()) { @@ -6020,6 +6022,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const DataLayout &DL) { Value *Cond = SI->getCondition(); KnownBits Known = computeKnownBits(Cond, DL, AC, SI); + SmallPtrSet<const Constant *, 4> KnownValues; + bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -6039,15 +6043,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, UniqueSuccessors.push_back(Successor); ++It->second; } - const APInt &CaseVal = Case.getCaseValue()->getValue(); + ConstantInt *CaseC = Case.getCaseValue(); + const APInt &CaseVal = CaseC->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || - (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) { - DeadCases.push_back(Case.getCaseValue()); + (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) || + (IsKnownValuesValid && !KnownValues.contains(CaseC))) { + DeadCases.push_back(CaseC); if (DTU) --NumPerSuccessorCases[Successor]; LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); - } + } else if (IsKnownValuesValid) + KnownValues.erase(CaseC); } // If we can prove that the cases must cover all possible values, the @@ -6058,33 +6065,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); - if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */) { - uint64_t AllNumCases = 1ULL << NumUnknownBits; - if (SI->getNumCases() == AllNumCases) { + if (HasDefault && DeadCases.empty()) { + if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) { createUnreachableSwitchDefault(SI, DTU); return true; } - // When only one case value is missing, replace default with that case. - // Eliminating the default branch will provide more opportunities for - // optimization, such as lookup tables. - if (SI->getNumCases() == AllNumCases - 1) { - assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); - IntegerType *CondTy = cast<IntegerType>(Cond->getType()); - if (CondTy->getIntegerBitWidth() > 64 || - !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) - return false; - uint64_t MissingCaseVal = 0; - for (const auto &Case : SI->cases()) - MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); - auto *MissingCase = - cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); - SwitchInstProfUpdateWrapper SIW(*SI); - SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); - createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); - SIW.setSuccessorWeight(0, 0); - return true; + if (NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. + if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + IntegerType *CondTy = cast<IntegerType>(Cond->getType()); + if (CondTy->getIntegerBitWidth() > 64 || + !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) + return false; + + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = cast<ConstantInt>( + ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), + SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, + /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } } @@ -7570,6 +7585,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. + for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. + if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. /// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -8037,6 +8127,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 94c5c170..e86ab13 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. ControlFlowHub CHub; + bool Changed = false; for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { BasicBlock *BB = ExitingBlocks[I]; @@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool UpdatedLI = false; BasicBlock *NewSucc = SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // SplitCallBrEdge modifies the CFG because it creates an intermediate + // block. So we need to set the changed flag no matter what the + // ControlFlowHub is going to do later. + Changed = true; // Even if CallBr and Succ do not have a common parent loop, we need to // add the new target block to the parent loop of the current loop. if (!UpdatedLI) @@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue()); + ChangedCFG |= Changed; if (!ChangedCFG) return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index fdfff16..03112c6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, bool CanAddPredicate = !llvm::shouldOptimizeForSize( TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides, - CanAddPredicate, false).value_or(0); + int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides, + CanAddPredicate, false) + .value_or(0); if (Stride == 1 || Stride == -1) return Stride; return 0; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 3fed003..04b0562 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -167,7 +167,7 @@ public: DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( - new VPInstruction(Opcode, Operands, Flags, DL, Name)); + new VPInstruction(Opcode, Operands, Flags, {}, DL, Name)); } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, @@ -184,7 +184,7 @@ public: DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( - new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); + new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name)); } VPInstruction *createNot(VPValue *Operand, @@ -205,7 +205,7 @@ public: return tryInsertInstruction(new VPInstruction( Instruction::BinaryOps::Or, {LHS, RHS}, - VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name)); + VPRecipeWithIRFlags::DisjointFlagsTy(false), {}, DL, Name)); } VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS, @@ -221,7 +221,7 @@ public: std::optional<FastMathFlags> FMFs = std::nullopt) { auto *Select = FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - *FMFs, DL, Name) + *FMFs, {}, DL, Name) : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, Name); return tryInsertInstruction(Select); @@ -235,7 +235,7 @@ public: assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( - new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name)); + new VPInstruction(Instruction::ICmp, {A, B}, Pred, {}, DL, Name)); } /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A @@ -246,7 +246,7 @@ public: assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE && Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( - new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name)); + new VPInstruction(Instruction::FCmp, {A, B}, Pred, {}, DL, Name)); } VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, @@ -254,7 +254,7 @@ public: const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::none(), DL, Name)); + GEPNoWrapFlags::none(), {}, DL, Name)); } VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset, @@ -262,7 +262,7 @@ public: DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction(new VPInstruction( - VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name)); + VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, {}, DL, Name)); } VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset, @@ -270,7 +270,7 @@ public: const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::WidePtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::none(), DL, Name)); + GEPNoWrapFlags::none(), {}, DL, Name)); } VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues, DebugLoc DL, @@ -280,8 +280,7 @@ public: VPValue *createElementCount(Type *Ty, ElementCount EC) { VPlan &Plan = *getInsertBlock()->getPlan(); - VPValue *RuntimeEC = - Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue())); + VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue()); if (EC.isScalable()) { VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty); RuntimeEC = EC.getKnownMinValue() == 1 @@ -304,9 +303,11 @@ public: } VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op, - Type *ResultTy, DebugLoc DL) { + Type *ResultTy, DebugLoc DL, + const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}) { return tryInsertInstruction( - new VPInstructionWithType(Opcode, Op, ResultTy, {}, DL)); + new VPInstructionWithType(Opcode, Op, ResultTy, DL, Flags, Metadata)); } VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 25bf49d..e5c3f17 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7752,8 +7752,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, if (CM.isPredicatedInst(I)) { SmallVector<VPValue *> Ops(Operands); VPValue *Mask = getBlockInMask(Builder.getInsertBlock()); - VPValue *One = - Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); + VPValue *One = Plan.getConstantInt(I->getType(), 1u); auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); Ops[1] = SafeRHS; return new VPWidenRecipe(*I, Ops); @@ -7806,11 +7805,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, } case Instruction::ExtractValue: { SmallVector<VPValue *> NewOps(Operands); - Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); auto *EVI = cast<ExtractValueInst>(I); assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; - NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); + NewOps.push_back(Plan.getConstantInt(32, Idx)); return new VPWidenRecipe(*I, NewOps); } }; @@ -8179,8 +8177,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, "Expected an ADD or SUB operation for predicated partial " "reductions (because the neutral element in the mask is zero)!"); Cond = getBlockInMask(Builder.getInsertBlock()); - VPValue *Zero = - Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0)); + VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0); BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc()); } return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, @@ -8643,7 +8640,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs && CurrentLinkI->getOpcode() == Instruction::Sub) { Type *PhiTy = PhiR->getUnderlyingValue()->getType(); - auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0)); + auto *Zero = Plan->getConstantInt(PhiTy, 0); VPWidenRecipe *Sub = new VPWidenRecipe( Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {}, VPIRMetadata(), CurrentLinkI->getDebugLoc()); @@ -8857,8 +8854,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ToDelete.push_back(Select); // Convert the reduction phi to operate on bools. - PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( - OrigLoop->getHeader()->getContext()))); + PhiR->setOperand(0, Plan->getFalse()); continue; } @@ -8880,9 +8876,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( unsigned ScaleFactor = RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr()) .value_or(1); - Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext()); - auto *ScaleFactorVPV = - Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor)); + auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor); VPValue *StartV = PHBuilder.createNaryOp( VPInstruction::ReductionStartVector, {PhiR->getStartValue(), Iden, ScaleFactorVPV}, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1b55a3b..bf3f52c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20975,6 +20975,27 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, if (isa<PHINode>(S.getMainOp()) || isVectorLikeInstWithConstOps(S.getMainOp())) return nullptr; + // If the parent node is non-schedulable and the current node is copyable, and + // any of parent instructions are used outside several basic blocks or in + // bin-op node - cancel scheduling, it may cause wrong def-use deps in + // analysis, leading to a crash. + // Non-scheduled nodes may not have related ScheduleData model, which may lead + // to a skipped dep analysis. + if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && + EI.UserTE->doesNotNeedToSchedule() && + EI.UserTE->getOpcode() != Instruction::PHI && + any_of(EI.UserTE->Scalars, [](Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I || I->hasOneUser()) + return false; + for (User *U : I->users()) { + auto *UI = cast<Instruction>(U); + if (isa<BinaryOperator>(UI)) + return true; + } + return false; + })) + return std::nullopt; bool HasCopyables = S.areInstructionsWithCopyableElements(); if (((!HasCopyables && doesNotNeedToSchedule(VL)) || all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { @@ -22134,6 +22155,27 @@ bool BoUpSLP::collectValuesToDemote( {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(), VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()}); + if (E.isAltShuffle()) { + // Combining these opcodes may lead to incorrect analysis, skip for now. + auto IsDangerousOpcode = [](unsigned Opcode) { + switch (Opcode) { + case Instruction::Shl: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return true; + default: + break; + } + return false; + }; + if (IsDangerousOpcode(E.getAltOpcode())) + return FinalAnalysis(); + } + switch (E.getOpcode()) { // We can always demote truncations and extensions. Since truncations can diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 9c869dd..d354933 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -92,7 +92,7 @@ void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const { DGNode::print(OS, false); if (PrintDeps) { // Print memory preds. - static constexpr const unsigned Indent = 4; + static constexpr unsigned Indent = 4; for (auto *Pred : MemPreds) OS.indent(Indent) << "<-" << *Pred->getInstruction() << "\n"; } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 86dbd21..5534da9 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -25,14 +25,14 @@ static cl::opt<bool> "emit new instructions (*very* expensive).")); #endif // NDEBUG -static constexpr const unsigned long StopAtDisabled = +static constexpr unsigned long StopAtDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopAt("sbvec-stop-at", cl::init(StopAtDisabled), cl::Hidden, cl::desc("Vectorize if the invocation count is < than this. 0 " "disables vectorization.")); -static constexpr const unsigned long StopBundleDisabled = +static constexpr unsigned long StopBundleDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopBundle("sbvec-stop-bndl", cl::init(StopBundleDisabled), cl::Hidden, diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ed2f80b..2de6921 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -43,7 +43,7 @@ cl::opt<std::string> AllowFiles( "sbvec-allow-files", cl::init(".*"), cl::Hidden, cl::desc("Run the vectorizer only on file paths that match any in the " "list of comma-separated regex's.")); -static constexpr const char AllowFilesDelim = ','; +static constexpr char AllowFilesDelim = ','; SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") { if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1f10058..cfe1f1e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -939,7 +939,7 @@ class VPIRMetadata { SmallVector<std::pair<unsigned, MDNode *>> Metadata; public: - VPIRMetadata() {} + VPIRMetadata() = default; /// Adds metatadata that can be preserved from the original instruction /// \p I. @@ -950,12 +950,9 @@ public: VPIRMetadata(Instruction &I, LoopVersioning *LVer); /// Copy constructor for cloning. - VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {} + VPIRMetadata(const VPIRMetadata &Other) = default; - VPIRMetadata &operator=(const VPIRMetadata &Other) { - Metadata = Other.Metadata; - return *this; - } + VPIRMetadata &operator=(const VPIRMetadata &Other) = default; /// Add all metadata to \p I. void applyMetadata(Instruction &I) const; @@ -1107,14 +1104,14 @@ public: VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(), - const Twine &Name = ""); + const VPIRFlags &Flags, const VPIRMetadata &MD = {}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = ""); VP_CLASSOF_IMPL(VPDef::VPInstructionSC) VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); - auto *New = new VPInstruction(Opcode, Operands, *this, getDebugLoc(), Name); + auto *New = new VPInstruction(Opcode, operands(), *this, *this, + getDebugLoc(), Name); if (getUnderlyingValue()) New->setUnderlyingValue(getUnderlyingInstr()); return New; @@ -1196,7 +1193,14 @@ public: VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands, Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL, const Twine &Name = "") - : VPInstruction(Opcode, Operands, Flags, DL, Name), ResultTy(ResultTy) {} + : VPInstruction(Opcode, Operands, Flags, {}, DL, Name), + ResultTy(ResultTy) {} + + VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands, + Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags, + const VPIRMetadata &Metadata, const Twine &Name = "") + : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name), + ResultTy(ResultTy) {} static inline bool classof(const VPRecipeBase *R) { // VPInstructionWithType are VPInstructions with specific opcodes requiring @@ -1221,10 +1225,9 @@ public: } VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); auto *New = - new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this, - getDebugLoc(), getName()); + new VPInstructionWithType(getOpcode(), operands(), getResultType(), + *this, getDebugLoc(), getName()); New->setUnderlyingValue(getUnderlyingValue()); return New; } @@ -3206,6 +3209,9 @@ protected: : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I), Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); + assert(isa<VPVectorEndPointerRecipe>(getAddr()) || + !Reverse && + "Reversed acccess without VPVectorEndPointerRecipe address?"); } public: @@ -3977,7 +3983,7 @@ class VPIRBasicBlock : public VPBasicBlock { IRBB(IRBB) {} public: - ~VPIRBasicBlock() override {} + ~VPIRBasicBlock() override = default; static inline bool classof(const VPBlockBase *V) { return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC; @@ -4029,7 +4035,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase { IsReplicator(IsReplicator) {} public: - ~VPRegionBlock() override {} + ~VPRegionBlock() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPBlockBase *V) { @@ -4109,6 +4115,12 @@ public: const VPCanonicalIVPHIRecipe *getCanonicalIV() const { return const_cast<VPRegionBlock *>(this)->getCanonicalIV(); } + + /// Return the type of the canonical IV for loop regions. + Type *getCanonicalIVType() { return getCanonicalIV()->getScalarType(); } + const Type *getCanonicalIVType() const { + return getCanonicalIV()->getScalarType(); + } }; inline VPRegionBlock *VPRecipeBase::getRegion() { @@ -4387,15 +4399,25 @@ public: } /// Return a VPValue wrapping i1 true. - VPValue *getTrue() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getTrue(Ctx)); - } + VPValue *getTrue() { return getConstantInt(1, 1); } /// Return a VPValue wrapping i1 false. - VPValue *getFalse() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getFalse(Ctx)); + VPValue *getFalse() { return getConstantInt(1, 0); } + + /// Return a VPValue wrapping a ConstantInt with the given type and value. + VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) { + return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value. + VPValue *getConstantInt(unsigned BitWidth, uint64_t Val, + bool IsSigned = false) { + return getConstantInt(APInt(BitWidth, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given APInt value. + VPValue *getConstantInt(const APInt &Val) { + return getOrAddLiveIn(ConstantInt::get(getContext(), Val)); } /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 65688a3..1a66d20 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -612,8 +612,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan, if (!RequiresScalarEpilogueCheck) Cmp = Plan.getFalse(); else if (TailFolded) - Cmp = Plan.getOrAddLiveIn( - ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); + Cmp = Plan.getTrue(); else Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(), &Plan.getVectorTripCount(), LatchDL, "cmp.n"); @@ -712,8 +711,8 @@ void VPlanTransforms::addMinimumIterationCheck( // additional overflow check is required before entering the vector loop. // Get the maximum unsigned value for the type. - VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get( - TripCountTy, cast<IntegerType>(TripCountTy)->getMask())); + VPValue *MaxUIntTripCount = + Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask()); VPValue *DistanceToMax = Builder.createNaryOp( Instruction::Sub, {MaxUIntTripCount, TripCountVPV}, DebugLoc::getUnknown()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b5b98c6..b57c448 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -313,7 +313,8 @@ private: // Check for recipes that do not have opcodes. if constexpr (std::is_same_v<RecipeTy, VPScalarIVStepsRecipe> || std::is_same_v<RecipeTy, VPCanonicalIVPHIRecipe> || - std::is_same_v<RecipeTy, VPDerivedIVRecipe>) + std::is_same_v<RecipeTy, VPDerivedIVRecipe> || + std::is_same_v<RecipeTy, VPVectorEndPointerRecipe>) return DefR; else return DefR && DefR->getOpcode() == Opcode; @@ -686,6 +687,64 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2}); } +template <typename Addr_t, typename Mask_t> struct Load_match { + Addr_t Addr; + Mask_t Mask; + + Load_match(Addr_t Addr, Mask_t Mask) : Addr(Addr), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Load = dyn_cast<VPWidenLoadRecipe>(V); + if (!Load || !Addr.match(Load->getAddr()) || !Load->isMasked() || + !Mask.match(Load->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked load. +template <typename Addr_t, typename Mask_t> +inline Load_match<Addr_t, Mask_t> m_MaskedLoad(const Addr_t &Addr, + const Mask_t &Mask) { + return Load_match<Addr_t, Mask_t>(Addr, Mask); +} + +template <typename Addr_t, typename Val_t, typename Mask_t> struct Store_match { + Addr_t Addr; + Val_t Val; + Mask_t Mask; + + Store_match(Addr_t Addr, Val_t Val, Mask_t Mask) + : Addr(Addr), Val(Val), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Store = dyn_cast<VPWidenStoreRecipe>(V); + if (!Store || !Addr.match(Store->getAddr()) || + !Val.match(Store->getStoredValue()) || !Store->isMasked() || + !Mask.match(Store->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked store. +template <typename Addr_t, typename Val_t, typename Mask_t> +inline Store_match<Addr_t, Val_t, Mask_t> +m_MaskedStore(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) { + return Store_match<Addr_t, Val_t, Mask_t>(Addr, Val, Mask); +} + +template <typename Op0_t, typename Op1_t> +using VectorEndPointerRecipe_match = + Recipe_match<std::tuple<Op0_t, Op1_t>, 0, + /*Commutative*/ false, VPVectorEndPointerRecipe>; + +template <typename Op0_t, typename Op1_t> +VectorEndPointerRecipe_match<Op0_t, Op1_t> m_VecEndPtr(const Op0_t &Op0, + const Op1_t &Op1) { + return VectorEndPointerRecipe_match<Op0_t, Op1_t>(Op0, Op1); +} + /// Match a call argument at a given argument index. template <typename Opnd_t> struct Argument_match { /// Call argument index to match. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index bde62dd..1ee405a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -162,8 +162,12 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPPredInstPHISC: case VPVectorEndPointerSC: return false; - case VPInstructionSC: - return mayWriteToMemory(); + case VPInstructionSC: { + auto *VPI = cast<VPInstruction>(this); + return mayWriteToMemory() || + VPI->getOpcode() == VPInstruction::BranchOnCount || + VPI->getOpcode() == VPInstruction::BranchOnCond; + } case VPWidenCallSC: { Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); @@ -490,10 +494,10 @@ template class VPUnrollPartAccessor<3>; } VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL, - const Twine &Name) + const VPIRFlags &Flags, const VPIRMetadata &MD, + DebugLoc DL, const Twine &Name) : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), - VPIRMetadata(), Opcode(Opcode), Name(Name.str()) { + VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) { assert(flagsValidForOpcode(getOpcode()) && "Set flags not supported for the provided opcode"); assert((getNumOperandsForOpcode(Opcode) == -1u || @@ -1241,6 +1245,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case Instruction::Select: case Instruction::PHI: case VPInstruction::AnyOf: + case VPInstruction::BranchOnCond: + case VPInstruction::BranchOnCount: case VPInstruction::Broadcast: case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: @@ -2372,9 +2378,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const { return false; auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); - auto *CanIV = getRegion()->getCanonicalIV(); return StartC && StartC->isZero() && StepC && StepC->isOne() && - getScalarType() == CanIV->getScalarType(); + getScalarType() == getRegion()->getCanonicalIVType(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.h b/llvm/lib/Transforms/Vectorize/VPlanSLP.h index 77ff36c..44972c68 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.h +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.h @@ -89,8 +89,7 @@ class VPlanSlp { /// Width of the widest combined bundle in bits. unsigned WidestBundleBits = 0; - using MultiNodeOpTy = - typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; + using MultiNodeOpTy = std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; // Input operand bundles for the current multi node. Each multi node operand // bundle contains values not matching the multi node's opcode. They will diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 986c801..2588c87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -151,59 +151,65 @@ static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) { static bool sinkScalarOperands(VPlan &Plan) { auto Iter = vp_depth_first_deep(Plan.getEntry()); + bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; + + SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; + auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList]( + VPBasicBlock *SinkTo, VPValue *Op) { + auto *Candidate = + dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()); + if (!Candidate) + return; + + // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes + // for now. + if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) + return; + + if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || + Candidate->mayReadOrWriteMemory()) + return; + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) + if (!ScalarVFOnly && RepR->isSingleScalar()) + return; + + WorkList.insert({SinkTo, Candidate}); + }; + // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. - SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) continue; - VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]); - if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) + VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); + if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { + for (auto &Recipe : *VPBB) for (VPValue *Op : Recipe.operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({VPBB, Def}); - } + InsertIfValidSinkCandidate(VPBB, Op); } - bool ScalarVFOnly = Plan.hasScalarVFOnly(); // Try to sink each replicate or scalar IV steps recipe in the worklist. for (unsigned I = 0; I != WorkList.size(); ++I) { VPBasicBlock *SinkTo; VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - if (SinkCandidate->getParent() == SinkTo || - SinkCandidate->mayHaveSideEffects() || - SinkCandidate->mayReadOrWriteMemory()) - continue; - if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) { - if (!ScalarVFOnly && RepR->isSingleScalar()) - continue; - } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate)) - continue; - bool NeedsDuplicating = false; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must be uniform-after-vectorization ( - // i.e., only first lane is used) . In the latter case, we need to duplicate - // SinkCandidate. - auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, - SinkCandidate](VPUser *U) { - auto *UI = cast<VPRecipeBase>(U); - if (UI->getParent() == SinkTo) - return true; - NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate); - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. - return NeedsDuplicating && - isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate); - }; - if (!all_of(SinkCandidate->users(), CanSinkWithUser)) + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. In + // the latter case, we need to duplicate SinkCandidate. + auto UsersOutsideSinkTo = + make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { + return cast<VPRecipeBase>(U)->getParent() != SinkTo; + }); + if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) { + return !U->onlyFirstLaneUsed(SinkCandidate); + })) continue; + bool NeedsDuplicating = !UsersOutsideSinkTo.empty(); if (NeedsDuplicating) { if (ScalarVFOnly) @@ -228,9 +234,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; @@ -699,8 +703,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { continue; const InductionDescriptor &ID = PtrIV->getInductionDescriptor(); - VPValue *StartV = - Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0)); + VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0); VPValue *StepV = PtrIV->getOperand(1); VPScalarIVStepsRecipe *Steps = createScalarIVSteps( Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr, @@ -820,7 +823,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // Calculate the final index. VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); auto *CanonicalIV = LoopRegion->getCanonicalIV(); - Type *CanonicalIVType = CanonicalIV->getScalarType(); + Type *CanonicalIVType = LoopRegion->getCanonicalIVType(); VPBuilder B(cast<VPBasicBlock>(PredVPBB)); DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc(); @@ -836,7 +839,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // changed it means the exit is using the incremented value, so we need to // add the step. if (Incoming != WideIV) { - VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1)); + VPValue *One = Plan.getConstantInt(CanonicalIVType, 1); EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL); } @@ -882,7 +885,7 @@ static VPValue *optimizeLatchExitInductionUser( return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape"); if (ScalarTy->isPointerTy()) { Type *StepTy = TypeInfo.inferScalarType(Step); - auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0)); + auto *Zero = Plan.getConstantInt(StepTy, 0); return B.createPtrAdd(EndValue, B.createNaryOp(Instruction::Sub, {Zero, Step}), DebugLoc::getUnknown(), "ind.escape"); @@ -1057,13 +1060,9 @@ static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R, return nullptr; } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { - VPlan *Plan = R.getParent()->getPlan(); - - auto *Def = dyn_cast<VPSingleDefRecipe>(&R); - if (!Def) - return; +/// Try to simplify VPSingleDefRecipe \p Def. +static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { + VPlan *Plan = Def->getParent()->getPlan(); // Simplification of live-in IR values for SingleDef recipes using // InstSimplifyFolder. @@ -1073,7 +1072,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(V); // Fold PredPHI LiveIn -> LiveIn. - if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) { + if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) { VPValue *Op = PredPHI->getOperand(0); if (Op->isLiveIn()) PredPHI->replaceAllUsesWith(Op); @@ -1092,12 +1091,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) + unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue())) ? Instruction::SExt : Instruction::ZExt; auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A, TruncTy); - if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { + if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) { // UnderlyingExt has distinct return type, used to retain legacy cost. Ext->setUnderlyingValue(UnderlyingExt); } @@ -1160,7 +1159,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { Builder.createLogicalAnd(X, Builder.createOr(Y, Z))); // x && !x -> 0 - if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) + if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) return Def->replaceAllUsesWith(Plan->getFalse()); if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) @@ -1188,8 +1187,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(A); if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt()))) - return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) - : R.getOperand(0)); + return Def->replaceAllUsesWith( + Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0)); if (match(Def, m_Not(m_VPValue(A)))) { if (match(A, m_Not(m_VPValue(A)))) @@ -1218,8 +1217,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // If Cmp doesn't have a debug location, use the one from the negation, // to preserve the location. - if (!Cmp->getDebugLoc() && R.getDebugLoc()) - Cmp->setDebugLoc(R.getDebugLoc()); + if (!Cmp->getDebugLoc() && Def->getDebugLoc()) + Cmp->setDebugLoc(Def->getDebugLoc()); } } } @@ -1245,7 +1244,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A), m_VPValue(X), m_VPValue())) && match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) && - TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) { + TypeInfo.inferScalarType(Def)->isIntegerTy(1)) { Def->setOperand(1, Def->getOperand(0)); Def->setOperand(0, Y); return; @@ -1253,35 +1252,41 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) { if (Phi->getOperand(0) == Phi->getOperand(1)) - Def->replaceAllUsesWith(Phi->getOperand(0)); + Phi->replaceAllUsesWith(Phi->getOperand(0)); return; } // Look through ExtractLastElement (BuildVector ....). - if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()), - m_ExtractLastLanePerPart(m_BuildVector())))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_CombineOr(m_ExtractLastElement(m_BuildVector()), + m_ExtractLastLanePerPart(m_BuildVector())))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith( BuildVector->getOperand(BuildVector->getNumOperands() - 1)); return; } // Look through ExtractPenultimateElement (BuildVector ....). - if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( - m_BuildVector()))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( + m_BuildVector()))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith( BuildVector->getOperand(BuildVector->getNumOperands() - 2)); return; } uint64_t Idx; - if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith(BuildVector->getOperand(Idx)); return; } + if (match(Def, m_BuildVector()) && all_equal(Def->operands())) { + Def->replaceAllUsesWith( + Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0))); + return; + } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { if (Phi->getNumOperands() == 1) Phi->replaceAllUsesWith(Phi->getOperand(0)); @@ -1298,7 +1303,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { isa<VPPhi>(X)) { auto *Phi = cast<VPPhi>(X); if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) && - Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) { + Phi->getNumUsers() == 1 && (*Phi->user_begin() == Def)) { Phi->setOperand(0, Y); Def->replaceAllUsesWith(Phi); return; @@ -1306,7 +1311,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // VPVectorPointer for part 0 can be replaced by their start pointer. - if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) { + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) { if (VecPtr->isFirstPart()) { VecPtr->replaceAllUsesWith(VecPtr->getOperand(0)); return; @@ -1361,9 +1366,9 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) { Plan.getEntry()); VPTypeAnalysis TypeInfo(Plan); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - simplifyRecipe(R, TypeInfo); - } + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R)) + simplifyRecipe(Def, TypeInfo); } } @@ -1574,9 +1579,9 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, continue; // Update IV operands and comparison bound to use new narrower type. - auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0)); + auto *NewStart = Plan.getConstantInt(NewIVTy, 0); WideIV->setStartValue(NewStart); - auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1)); + auto *NewStep = Plan.getConstantInt(NewIVTy, 1); WideIV->setStepValue(NewStep); auto *NewBTC = new VPWidenCastRecipe( @@ -1695,8 +1700,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, // When using wide lane masks, the return type of the get.active.lane.mask // intrinsic is VF x UF (last operand). - VPValue *ALMMultiplier = - Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); + VPValue *ALMMultiplier = Plan.getConstantInt(64, UF); EntryALM->setOperand(2, ALMMultiplier); LoopALM->setOperand(2, ALMMultiplier); @@ -2402,8 +2406,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. - VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1)); + VPValue *ALMMultiplier = + Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1); auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC, ALMMultiplier}, DL, "active.lane.mask.entry"); @@ -2503,7 +2507,7 @@ void VPlanTransforms::addActiveLaneMask( } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1)); + ConstantInt::get(LoopRegion->getCanonicalIVType(), 1)); LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, @@ -2517,90 +2521,102 @@ void VPlanTransforms::addActiveLaneMask( HeaderMask->eraseFromParent(); } +template <typename Op0_t, typename Op1_t> struct RemoveMask_match { + Op0_t In; + Op1_t &Out; + + RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {} + + template <typename OpTy> bool match(OpTy *V) const { + if (m_Specific(In).match(V)) { + Out = nullptr; + return true; + } + if (m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V)) + return true; + return false; + } +}; + +/// Match a specific mask \p In, or a combination of it (logical-and In, Out). +/// Returns the remaining part \p Out if so, or nullptr otherwise. +template <typename Op0_t, typename Op1_t> +static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In, + Op1_t &Out) { + return RemoveMask_match<Op0_t, Op1_t>(In, Out); +} + /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based /// recipe could be created. /// \p HeaderMask Header Mask. /// \p CurRecipe Recipe to be transform. /// \p TypeInfo VPlan-based type analysis. -/// \p AllOneMask The vector mask parameter of vector-predication intrinsics. /// \p EVL The explicit vector length parameter of vector-predication /// intrinsics. static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, - VPTypeAnalysis &TypeInfo, - VPValue &AllOneMask, VPValue &EVL) { - // FIXME: Don't transform recipes to EVL recipes if they're not masked by the - // header mask. - auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { - assert(OrigMask && "Unmasked recipe when folding tail"); - // HeaderMask will be handled using EVL. - VPValue *Mask; - if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) - return Mask; - return HeaderMask == OrigMask ? nullptr : OrigMask; - }; + VPTypeAnalysis &TypeInfo, VPValue &EVL) { + VPlan *Plan = CurRecipe.getParent()->getPlan(); + VPValue *Addr, *Mask, *EndPtr; /// Adjust any end pointers so that they point to the end of EVL lanes not VF. - auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * { - auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr); - if (!EndPtr) - return Addr; - assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() && - "VPVectorEndPointerRecipe with non-VF VF operand?"); - assert( - all_of(EndPtr->users(), - [](VPUser *U) { - return cast<VPWidenMemoryRecipe>(U)->isReverse(); - }) && - "VPVectorEndPointRecipe not used by reversed widened memory recipe?"); - VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone(); - EVLAddr->insertBefore(&CurRecipe); - EVLAddr->setOperand(1, &EVL); - return EVLAddr; + auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) { + auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone(); + EVLEndPtr->insertBefore(&CurRecipe); + EVLEndPtr->setOperand(1, &EVL); + return EVLEndPtr; }; - return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) - .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { - VPValue *NewMask = GetNewMask(L->getMask()); - VPValue *NewAddr = GetNewAddr(L->getAddr()); - return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask); - }) - .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { - VPValue *NewMask = GetNewMask(S->getMask()); - VPValue *NewAddr = GetNewAddr(S->getAddr()); - return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); - }) - .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) { - VPValue *NewMask = GetNewMask(IR->getMask()); - return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); - }) - .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { - VPValue *NewMask = GetNewMask(Red->getCondOp()); - return new VPReductionEVLRecipe(*Red, EVL, NewMask); - }) - .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * { - VPValue *LHS, *RHS; - // Transform select with a header mask condition - // select(header_mask, LHS, RHS) - // into vector predication merge. - // vp.merge(all-true, LHS, RHS, EVL) - if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), - m_VPValue(RHS)))) - return nullptr; - // Use all true as the condition because this transformation is - // limited to selects whose condition is a header mask. - return new VPWidenIntrinsicRecipe( - Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), VPI->getDebugLoc()); - }) - .Default([&](VPRecipeBase *R) { return nullptr; }); + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe)) + if (Rdx->isConditional() && + match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask))) + return new VPReductionEVLRecipe(*Rdx, EVL, Mask); + + if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe)) + if (Interleave->getMask() && + match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask))) + return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask); + + VPValue *LHS, *RHS; + if (match(&CurRecipe, + m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) + return new VPWidenIntrinsicRecipe( + Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL}, + TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + + return nullptr; } /// Replace recipes with their EVL variants. static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPTypeAnalysis TypeInfo(Plan); - VPValue *AllOneMask = Plan.getTrue(); VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); @@ -2660,7 +2676,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1)); VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, - {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, + {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL}, TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc()); VPSplice->insertBefore(&R); R.getVPSingleValue()->replaceAllUsesWith(VPSplice); @@ -2694,7 +2710,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { for (VPUser *U : collectUsersRecursively(EVLMask)) { auto *CurRecipe = cast<VPRecipeBase>(U); VPRecipeBase *EVLRecipe = - optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); + optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL); if (!EVLRecipe) continue; @@ -2775,7 +2791,7 @@ void VPlanTransforms::addExplicitVectorLength( VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); auto *CanonicalIVPHI = LoopRegion->getCanonicalIV(); - auto *CanIVTy = CanonicalIVPHI->getScalarType(); + auto *CanIVTy = LoopRegion->getCanonicalIVType(); VPValue *StartV = CanonicalIVPHI->getStartValue(); // Create the ExplicitVectorLengthPhi recipe in the main loop. @@ -2790,8 +2806,7 @@ void VPlanTransforms::addExplicitVectorLength( if (MaxSafeElements) { // Support for MaxSafeDist for correct loop emission. - VPValue *AVLSafe = - Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements)); + VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements); VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(), "safe_avl"); @@ -2904,9 +2919,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext); VPBuilder Builder(LatchExitingBr); - VPValue *Cmp = - Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, - Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy))); + VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, + Plan.getConstantInt(AVLTy, 0)); Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp); LatchExitingBr->eraseFromParent(); } @@ -2930,8 +2944,7 @@ void VPlanTransforms::replaceSymbolicStrides( // Only handle constant strides for now. continue; - auto *CI = - Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst)); + auto *CI = Plan.getConstantInt(*StrideConst); if (VPValue *StrideVPV = Plan.getLiveIn(StrideV)) StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); @@ -2946,7 +2959,7 @@ void VPlanTransforms::replaceSymbolicStrides( unsigned BW = U->getType()->getScalarSizeInBits(); APInt C = isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW); - VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C)); + VPValue *CI = Plan.getConstantInt(C); StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); } RewriteMap[StrideV] = PSE.getSCEV(StrideV); @@ -3125,8 +3138,7 @@ void VPlanTransforms::createInterleaveGroups( DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) * IG->getIndex(IRInsertPos), /*IsSigned=*/true); - VPValue *OffsetVPV = - Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset)); + VPValue *OffsetVPV = Plan.getConstantInt(-Offset); VPBuilder B(InsertPos); Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW); } @@ -3867,8 +3879,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, VPBuilder Builder(VectorPH, VectorPH->begin()); auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); auto *TCMO = Builder.createNaryOp( - Instruction::Sub, - {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))}, + Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)}, DebugLoc::getCompilerGenerated(), "trip.count.minus.1"); BTC->replaceAllUsesWith(TCMO); } @@ -3993,9 +4004,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (TailByMasking) { TC = Builder.createNaryOp( Instruction::Add, - {TC, Builder.createNaryOp( - Instruction::Sub, - {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})}, + {TC, Builder.createNaryOp(Instruction::Sub, + {Step, Plan.getConstantInt(TCTy, 1)})}, DebugLoc::getCompilerGenerated(), "n.rnd.up"); } @@ -4017,8 +4027,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (RequiresScalarEpilogue) { assert(!TailByMasking && "requiring scalar epilogue is not supported with fail folding"); - VPValue *IsZero = Builder.createICmp( - CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0))); + VPValue *IsZero = + Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0)); R = Builder.createSelect(IsZero, Step, R); } @@ -4056,7 +4066,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, } VF.replaceAllUsesWith(RuntimeVF); - VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); + VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF()); VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF}); VFxUF.replaceAllUsesWith(MulByUF); } @@ -4176,7 +4186,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VFMinVal = VF.getKnownMinValue(); SmallVector<VPInterleaveRecipe *> StoreGroups; for (auto &R : *VectorLoop->getEntryBasicBlock()) { - if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount())) + if (isa<VPCanonicalIVPHIRecipe>(&R)) continue; if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) && @@ -4336,17 +4346,17 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, VPBuilder PHBuilder(Plan.getVectorPreheader()); VPValue *UF = Plan.getOrAddLiveIn( - ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); + ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF())); if (VF.isScalable()) { VPValue *VScale = PHBuilder.createElementCount( - CanIV->getScalarType(), ElementCount::getScalable(1)); + VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1)); VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); Inc->setOperand(1, VScaleUF); Plan.getVF().replaceAllUsesWith(VScale); } else { Inc->setOperand(1, UF); Plan.getVF().replaceAllUsesWith( - Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + Plan.getConstantInt(CanIV->getScalarType(), 1)); } removeDeadRecipes(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index cfd1a74..d6a0028 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -68,10 +68,9 @@ class UnrollState { void unrollWidenInductionByUF(VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi); - VPValue *getConstantVPV(unsigned Part) { - Type *CanIVIntTy = - Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType(); - return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part)); + VPValue *getConstantInt(unsigned Part) { + Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType(); + return Plan.getConstantInt(CanIVIntTy, Part); } public: @@ -138,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) { remapOperands(&PartIR, Part); if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) { - ScalarIVSteps->addOperand(getConstantVPV(Part)); + ScalarIVSteps->addOperand(getConstantInt(Part)); } addRecipeForPart(&Part0R, &PartIR, Part); @@ -250,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, for (unsigned Part = 1; Part != UF; ++Part) VPV2Parts[VPI][Part - 1] = StartV; } - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); } else { assert(isa<VPActiveLaneMaskPHIRecipe>(R) && "unexpected header phi recipe not needing unrolled part"); @@ -319,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) || match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R)) Copy->setOperand(0, R.getOperand(0)); @@ -475,8 +474,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, if (LaneDefs != Def2LaneDefs.end()) return LaneDefs->second[Lane.getKnownLane()]; - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); } @@ -510,8 +508,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane())); continue; } - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); NewOps.push_back(Ext); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 8c23e78..c6380d3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -32,22 +32,17 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) { } VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { - VPValue *Expanded = nullptr; if (auto *E = dyn_cast<SCEVConstant>(Expr)) - Expanded = Plan.getOrAddLiveIn(E->getValue()); - else { - auto *U = dyn_cast<SCEVUnknown>(Expr); - // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction - // value. Otherwise the value may be defined in a loop and using it directly - // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA - // form. - if (U && !isa<Instruction>(U->getValue())) { - Expanded = Plan.getOrAddLiveIn(U->getValue()); - } else { - Expanded = new VPExpandSCEVRecipe(Expr); - Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe()); - } - } + return Plan.getOrAddLiveIn(E->getValue()); + // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction + // value. Otherwise the value may be defined in a loop and using it directly + // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA + // form. + auto *U = dyn_cast<SCEVUnknown>(Expr); + if (U && !isa<Instruction>(U->getValue())) + return Plan.getOrAddLiveIn(U->getValue()); + auto *Expanded = new VPExpandSCEVRecipe(Expr); + Plan.getEntry()->appendRecipe(Expanded); return Expanded; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 91734a1..34754a1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); + if (isa<VPIRPhi>(UI) && + UI->getNumOperands() != UI->getParent()->getNumPredecessors()) { + errs() << "Phi-like recipe with different number of operands and " + "predecessors.\n"; + return false; + } + if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { for (const auto &[IncomingVPV, IncomingVPBB] : Phi->incoming_values_and_blocks()) { diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d6eb00d..27a8bbd 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { Value *ScalarV = Ext->getOperand(0); if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV), - &DT)) - ScalarV = Builder.CreateFreeze(ScalarV); + &DT)) { + // Check wether all lanes are extracted, all extracts trigger UB + // on poison, and the last extract (and hence all previous ones) + // are guaranteed to execute if Ext executes. If so, we do not + // need to insert a freeze. + SmallDenseSet<ConstantInt *, 8> ExtractedLanes; + bool AllExtractsTriggerUB = true; + ExtractElementInst *LastExtract = nullptr; + BasicBlock *ExtBB = Ext->getParent(); + for (User *U : Ext->users()) { + auto *Extract = cast<ExtractElementInst>(U); + if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) { + AllExtractsTriggerUB = false; + break; + } + ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand())); + if (!LastExtract || LastExtract->comesBefore(Extract)) + LastExtract = Extract; + } + if (ExtractedLanes.size() != DstTy->getNumElements() || + !AllExtractsTriggerUB || + !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(), + LastExtract->getIterator())) + ScalarV = Builder.CreateFreeze(ScalarV); + } ScalarV = Builder.CreateBitCast( ScalarV, IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy))); |
