diff options
Diffstat (limited to 'llvm/lib/Transforms')
44 files changed, 1329 insertions, 575 deletions
diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h index e05fe28..1e549f1 100644 --- a/llvm/lib/Transforms/Coroutines/CoroCloner.h +++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h @@ -77,7 +77,7 @@ public: : OrigF(OrigF), Suffix(Suffix), Shape(Shape), FKind(FKind), Builder(OrigF.getContext()), TTI(TTI) {} - virtual ~BaseCloner() {} + virtual ~BaseCloner() = default; /// Create a clone for a continuation lowering. static Function *createClone(Function &OrigF, const Twine &Suffix, diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5048561..5ed47ae 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3619,7 +3619,7 @@ struct AAIntraFnReachabilityFunction final return true; RQITy StackRQI(A, From, To, ExclusionSet, false); - typename RQITy::Reachable Result; + RQITy::Reachable Result; if (!NonConstThis->checkQueryCache(A, StackRQI, Result)) return NonConstThis->isReachableImpl(A, StackRQI, /*IsTemporaryRQI=*/true); @@ -10701,7 +10701,7 @@ struct AAInterFnReachabilityFunction auto *NonConstThis = const_cast<AAInterFnReachabilityFunction *>(this); RQITy StackRQI(A, From, To, ExclusionSet, false); - typename RQITy::Reachable Result; + RQITy::Reachable Result; if (!NonConstThis->checkQueryCache(A, StackRQI, Result)) return NonConstThis->isReachableImpl(A, StackRQI, /*IsTemporaryRQI=*/true); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 894d83f..d35ae47 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1034,11 +1034,11 @@ private: } // namespace template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph< ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph< IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; template <> diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index d7eb745..2a87a0f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -208,7 +208,7 @@ namespace KernelInfo { // }; #define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_IDX(Configuration, 0) KERNEL_ENVIRONMENT_IDX(Ident, 1) @@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1) #undef KERNEL_ENVIRONMENT_IDX #define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0) KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1) @@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams) GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) { - constexpr const int InitKernelEnvironmentArgNo = 0; + constexpr int InitKernelEnvironmentArgNo = 0; return cast<GlobalVariable>( KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo) ->stripPointerCasts()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 3ddf182..cbaff29 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3997,6 +3997,27 @@ static Value *foldOrUnsignedUMulOverflowICmp(BinaryOperator &I, return nullptr; } +/// Fold select(X >s 0, 0, -X) | smax(X, 0) --> abs(X) +/// select(X <s 0, -X, 0) | smax(X, 0) --> abs(X) +static Value *FoldOrOfSelectSmaxToAbs(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *X; + Value *Sel; + if (match(&I, + m_c_Or(m_Value(Sel), m_OneUse(m_SMax(m_Value(X), m_ZeroInt()))))) { + auto NegX = m_Neg(m_Specific(X)); + if (match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(X), + m_ZeroInt()), + m_ZeroInt(), NegX)) || + match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(X), + m_ZeroInt()), + NegX, m_ZeroInt()))) + return Builder.CreateBinaryIntrinsic(Intrinsic::abs, X, + Builder.getFalse()); + } + return nullptr; +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. @@ -4545,6 +4566,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V); + if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder)) + return replaceInstUsesWith(I, Res); + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da..9572f9d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelCond->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && + SelCond) { + return SelectInst::Create(C, A, B, "", nullptr, SelCond); + } else if (match(FalseVal, + m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && + SelFVal) { + SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); + NewSI->swapProfMetadata(); + return NewSI; + } else { + return createSelectInstWithUnknownProfile(C, A, B); + } + } return SelectInst::Create(C, A, B); } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelFVal->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && + SelCond) { + SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); + NewSI->swapProfMetadata(); + return NewSI; + } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && + SelFVal) { + return SelectInst::Create(C, B, A, "", nullptr, SelFVal); + } else { + return createSelectInstWithUnknownProfile(C, B, A); + } + } return SelectInst::Create(C, B, A); } } diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 7795cce..b5548d4 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -69,14 +69,6 @@ namespace llvm { // Command line option to enable vtable value profiling. Defined in // ProfileData/InstrProf.cpp: -enable-vtable-value-profiling= extern cl::opt<bool> EnableVTableValueProfiling; -// TODO: Remove -debug-info-correlate in next LLVM release, in favor of -// -profile-correlate=debug-info. -cl::opt<bool> DebugInfoCorrelate( - "debug-info-correlate", - cl::desc("Use debug info to correlate profiles. (Deprecated, use " - "-profile-correlate=debug-info)"), - cl::init(false)); - LLVM_ABI cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate( "profile-correlate", cl::desc("Use debug info or binary file to correlate profiles."), @@ -1047,7 +1039,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { // in lightweight mode. We need to move the value profile pointer to the // Counter struct to get this working. assert( - !DebugInfoCorrelate && ProfileCorrelate == InstrProfCorrelator::NONE && + ProfileCorrelate == InstrProfCorrelator::NONE && "Value profiling is not yet supported with lightweight instrumentation"); GlobalVariable *Name = Ind->getName(); auto It = ProfileDataMap.find(Name); @@ -1504,7 +1496,7 @@ static inline Constant *getVTableAddrForProfData(GlobalVariable *GV) { } void InstrLowerer::getOrCreateVTableProfData(GlobalVariable *GV) { - assert(!DebugInfoCorrelate && + assert(ProfileCorrelate != InstrProfCorrelator::DEBUG_INFO && "Value profiling is not supported with lightweight instrumentation"); if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage()) return; @@ -1584,8 +1576,7 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc, // Use internal rather than private linkage so the counter variable shows up // in the symbol table when using debug info for correlation. - if ((DebugInfoCorrelate || - ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) && + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO && TT.isOSBinFormatMachO() && Linkage == GlobalValue::PrivateLinkage) Linkage = GlobalValue::InternalLinkage; @@ -1691,8 +1682,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) { auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts); PD.RegionCounters = CounterPtr; - if (DebugInfoCorrelate || - ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) { + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) { LLVMContext &Ctx = M.getContext(); Function *Fn = Inc->getParent()->getParent(); if (auto *SP = Fn->getSubprogram()) { @@ -1737,7 +1727,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) { void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // When debug information is correlated to profile data, a data variable // is not needed. - if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) return; GlobalVariable *NamePtr = Inc->getName(); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index 2f256df..b72d41a 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) { return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column); } +static AllocationType getAllocType(const AllocationInfo *AllocInfo) { + return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), + AllocInfo->Info.getAllocCount(), + AllocInfo->Info.getTotalLifetime()); +} + static AllocationType addCallStack(CallStackTrie &AllocTrie, const AllocationInfo *AllocInfo, uint64_t FullStackId) { SmallVector<uint64_t> StackIds; for (const auto &StackFrame : AllocInfo->CallStack) StackIds.push_back(computeStackId(StackFrame)); - auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), - AllocInfo->Info.getAllocCount(), - AllocInfo->Info.getTotalLifetime()); + auto AllocType = getAllocType(AllocInfo); std::vector<ContextTotalSize> ContextSizeInfo; if (recordContextSizeInfoForAnalysis()) { auto TotalSize = AllocInfo->Info.getTotalSize(); @@ -405,22 +409,39 @@ handleAllocSite(Instruction &I, CallBase *CI, const std::set<const AllocationInfo *> &AllocInfoSet, std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo> &FullStackIdToAllocMatchInfo) { + // TODO: Remove this once the profile creation logic deduplicates contexts + // that are the same other than the IsInlineFrame bool. Until then, keep the + // largest. + DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo; + for (auto *AllocInfo : AllocInfoSet) { + auto FullStackId = computeFullStackId(AllocInfo->CallStack); + auto [It, Inserted] = + UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo}); + // If inserted entry, done. + if (Inserted) + continue; + // Keep the larger one, or the noncold one if they are the same size. + auto CurSize = It->second->Info.getTotalSize(); + auto NewSize = AllocInfo->Info.getTotalSize(); + if ((CurSize > NewSize) || + (CurSize == NewSize && + getAllocType(AllocInfo) != AllocationType::NotCold)) + continue; + It->second = AllocInfo; + } // We may match this instruction's location list to multiple MIB // contexts. Add them to a Trie specialized for trimming the contexts to // the minimal needed to disambiguate contexts with unique behavior. CallStackTrie AllocTrie(&ORE, MaxColdSize); uint64_t TotalSize = 0; uint64_t TotalColdSize = 0; - for (auto *AllocInfo : AllocInfoSet) { + for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) { // Check the full inlined call stack against this one. // If we found and thus matched all frames on the call, include // this MIB. if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; - uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis()) - FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); TotalSize += AllocInfo->Info.getTotalSize(); if (AllocType == AllocationType::Cold) diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index 80e77e09..a2fad02 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -161,7 +161,7 @@ template <char NsanTypeId> class ShadowTypeConfigImpl : public ShadowTypeConfig { public: char getNsanTypeId() const override { return NsanTypeId; } - static constexpr const char kNsanTypeId = NsanTypeId; + static constexpr char kNsanTypeId = NsanTypeId; }; // `double` (`d`) shadow type. diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 71736cf..af53fa0 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -456,7 +456,7 @@ createIRLevelProfileFlagVar(Module &M, ProfileVersion |= VARIANT_MASK_INSTR_ENTRY; if (PGOInstrumentLoopEntries) ProfileVersion |= VARIANT_MASK_INSTR_LOOP_ENTRIES; - if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) + if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) ProfileVersion |= VARIANT_MASK_DBG_CORRELATE; if (PGOFunctionEntryCoverage) ProfileVersion |= diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 78d4a57e..87eba5f 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -58,6 +58,18 @@ static cl::opt<bool> cl::desc("Writes always set the type"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClOutlineInstrumentation( + "tysan-outline-instrumentation", + cl::desc("Uses function calls for all TySan instrumentation, reducing " + "ELF size"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClVerifyOutlinedInstrumentation( + "tysan-verify-outlined-instrumentation", + cl::desc("Check types twice with both inlined instrumentation and " + "function calls. This verifies that they behave the same."), + cl::Hidden, cl::init(false)); + STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses"); namespace { @@ -105,12 +117,16 @@ private: Regex AnonNameRegex; Type *IntptrTy; uint64_t PtrShift; - IntegerType *OrdTy; + IntegerType *OrdTy, *U64Ty; /// Callbacks to run-time library are computed in initializeCallbacks. FunctionCallee TysanCheck; FunctionCallee TysanCtorFunction; + FunctionCallee TysanIntrumentMemInst; + FunctionCallee TysanInstrumentWithShadowUpdate; + FunctionCallee TysanSetShadowType; + /// Callback to set types for gloabls. Function *TysanGlobalsSetTypeFunction; }; @@ -130,6 +146,8 @@ TypeSanitizer::TypeSanitizer(Module &M) void TypeSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(M.getContext()); OrdTy = IRB.getInt32Ty(); + U64Ty = IRB.getInt64Ty(); + Type *BoolType = IRB.getInt1Ty(); AttributeList Attr; Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind); @@ -144,6 +162,30 @@ void TypeSanitizer::initializeCallbacks(Module &M) { TysanCtorFunction = M.getOrInsertFunction(kTysanModuleCtorName, Attr, IRB.getVoidTy()); + + TysanIntrumentMemInst = M.getOrInsertFunction( + "__tysan_instrument_mem_inst", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer of data to be written to + IRB.getPtrTy(), // Pointer of data to write + U64Ty, // Size of the data in bytes + BoolType // Do we need to call memmove + ); + + TysanInstrumentWithShadowUpdate = M.getOrInsertFunction( + "__tysan_instrument_with_shadow_update", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer to data to be read + IRB.getPtrTy(), // Pointer to type descriptor + BoolType, // Do we need to type check this + U64Ty, // Size of data we access in bytes + OrdTy // Flags + ); + + TysanSetShadowType = M.getOrInsertFunction( + "__tysan_set_shadow_type", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer of data to be written to + IRB.getPtrTy(), // Pointer to the new type descriptor + U64Ty // Size of data we access in bytes + ); } void TypeSanitizer::instrumentGlobals(Module &M) { @@ -587,6 +629,29 @@ bool TypeSanitizer::instrumentWithShadowUpdate( Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy()); + if (ClOutlineInstrumentation) { + if (!ForceSetType && (!ClWritesAlwaysSetType || IsRead)) { + // We need to check the type here. If the type is unknown, then the read + // sets the type. If the type is known, then it is checked. If the type + // doesn't match, then we call the runtime type check (which may yet + // determine that the mismatch is okay). + + Constant *Flags = + ConstantInt::get(OrdTy, (int)IsRead | (((int)IsWrite) << 1)); + + IRB.CreateCall(TysanInstrumentWithShadowUpdate, + {Ptr, TD, + SanitizeFunction ? IRB.getTrue() : IRB.getFalse(), + IRB.getInt64(AccessSize), Flags}); + } else if (ForceSetType || IsWrite) { + // In the mode where writes always set the type, for a write (which does + // not also read), we just set the type. + IRB.CreateCall(TysanSetShadowType, {Ptr, TD, IRB.getInt64(AccessSize)}); + } + + return true; + } + Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift, ShadowBase, AppMemMask); Type *Int8PtrPtrTy = PointerType::get(IRB.getContext(), 0); @@ -838,37 +903,47 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, } } - if (!ShadowBase) - ShadowBase = getShadowBase(*F); - if (!AppMemMask) - AppMemMask = getAppMemMask(*F); + if (ClOutlineInstrumentation) { + if (!Src) + Src = ConstantPointerNull::get(IRB.getPtrTy()); - Value *ShadowDataInt = IRB.CreateAdd( - IRB.CreateShl( - IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), - PtrShift), - ShadowBase); - Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); - - if (!Src) { - IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift), - Align(1ull << PtrShift)); + IRB.CreateCall( + TysanIntrumentMemInst, + {Dest, Src, Size, NeedsMemMove ? IRB.getTrue() : IRB.getFalse()}); return true; - } - - Value *SrcShadowDataInt = IRB.CreateAdd( - IRB.CreateShl( - IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), - PtrShift), - ShadowBase); - Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); - - if (NeedsMemMove) { - IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, - Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); } else { - IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, - Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + if (!ShadowBase) + ShadowBase = getShadowBase(*F); + if (!AppMemMask) + AppMemMask = getAppMemMask(*F); + + Value *ShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); + + if (!Src) { + IRB.CreateMemSet(ShadowData, IRB.getInt8(0), + IRB.CreateShl(Size, PtrShift), Align(1ull << PtrShift)); + return true; + } + + Value *SrcShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); + + if (NeedsMemMove) { + IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } else { + IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } } return true; @@ -890,6 +965,16 @@ PreservedAnalyses TypeSanitizerPass::run(Module &M, for (Function &F : M) { const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F); TySan.sanitizeFunction(F, TLI); + if (ClVerifyOutlinedInstrumentation && ClOutlineInstrumentation) { + // Outlined instrumentation is a new option, and so this exists to + // verify there is no difference in behaviour between the options. + // If the outlined instrumentation triggers a verification failure + // when the original inlined instrumentation does not, or vice versa, + // then there is a discrepency which should be investigated. + ClOutlineInstrumentation = false; + TySan.sanitizeFunction(F, TLI); + ClOutlineInstrumentation = true; + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index 89980d5..a577f51 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -122,7 +122,8 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) { Value *Cond = Assume->getArgOperand(0); // Don't drop type tests, which have special semantics. - if (match(Cond, m_Intrinsic<Intrinsic::type_test>())) + if (match(Cond, m_Intrinsic<Intrinsic::type_test>()) || + match(Cond, m_Intrinsic<Intrinsic::public_type_test>())) continue; SmallVector<Value *> Affected; diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index a06f832..d564e32 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -514,7 +514,7 @@ public: class GVNSink { public: - GVNSink() {} + GVNSink() = default; bool run(Function &F) { LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 7ebcc21..4ba4ba3 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -162,8 +162,6 @@ class IndVarSimplify { const SCEV *ExitCount, PHINode *IndVar, SCEVExpander &Rewriter); - bool sinkUnusedInvariants(Loop *L); - public: IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, const DataLayout &DL, TargetLibraryInfo *TLI, @@ -1079,85 +1077,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB, return true; } -//===----------------------------------------------------------------------===// -// sinkUnusedInvariants. A late subpass to cleanup loop preheaders. -//===----------------------------------------------------------------------===// - -/// If there's a single exit block, sink any loop-invariant values that -/// were defined in the preheader but not used inside the loop into the -/// exit block to reduce register pressure in the loop. -bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { - BasicBlock *ExitBlock = L->getExitBlock(); - if (!ExitBlock) return false; - - BasicBlock *Preheader = L->getLoopPreheader(); - if (!Preheader) return false; - - bool MadeAnyChanges = false; - for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { - - // Skip BB Terminator. - if (Preheader->getTerminator() == &I) - continue; - - // New instructions were inserted at the end of the preheader. - if (isa<PHINode>(I)) - break; - - // Don't move instructions which might have side effects, since the side - // effects need to complete before instructions inside the loop. Also don't - // move instructions which might read memory, since the loop may modify - // memory. Note that it's okay if the instruction might have undefined - // behavior: LoopSimplify guarantees that the preheader dominates the exit - // block. - if (I.mayHaveSideEffects() || I.mayReadFromMemory()) - continue; - - // Skip debug or pseudo instructions. - if (I.isDebugOrPseudoInst()) - continue; - - // Skip eh pad instructions. - if (I.isEHPad()) - continue; - - // Don't sink alloca: we never want to sink static alloca's out of the - // entry block, and correctly sinking dynamic alloca's requires - // checks for stacksave/stackrestore intrinsics. - // FIXME: Refactor this check somehow? - if (isa<AllocaInst>(&I)) - continue; - - // Determine if there is a use in or before the loop (direct or - // otherwise). - bool UsedInLoop = false; - for (Use &U : I.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - BasicBlock *UseBB = User->getParent(); - if (PHINode *P = dyn_cast<PHINode>(User)) { - unsigned i = - PHINode::getIncomingValueNumForOperand(U.getOperandNo()); - UseBB = P->getIncomingBlock(i); - } - if (UseBB == Preheader || L->contains(UseBB)) { - UsedInLoop = true; - break; - } - } - - // If there is, the def must remain in the preheader. - if (UsedInLoop) - continue; - - // Otherwise, sink it to the exit block. - I.moveBefore(ExitBlock->getFirstInsertionPt()); - SE->forgetValue(&I); - MadeAnyChanges = true; - } - - return MadeAnyChanges; -} - static void replaceExitCond(BranchInst *BI, Value *NewCond, SmallVectorImpl<WeakTrackingVH> &DeadInsts) { auto *OldCond = BI->getCondition(); @@ -2065,10 +1984,6 @@ bool IndVarSimplify::run(Loop *L) { // The Rewriter may not be used from this point on. - // Loop-invariant instructions in the preheader that aren't used in the - // loop may be sunk below the loop to reduce register pressure. - Changed |= sinkUnusedInvariants(L); - // rewriteFirstIterationLoopExitValues does not rely on the computation of // trip count and therefore can further simplify exit values in addition to // rewriteLoopExitValues. diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index b2c526b..d13b990 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -211,9 +211,15 @@ static Instruction *cloneInstructionInExitBlock( static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU); -static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest, - ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater &MSSAU, ScalarEvolution *SE); +static void moveInstructionBefore( + Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, + MemorySSA::InsertionPlace Point = MemorySSA::BeforeTerminator); + +static bool sinkUnusedInvariantsFromPreheaderToExit( + Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT, + SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE); static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L, function_ref<void(Instruction *)> Fn); @@ -471,6 +477,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE) : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE); + + // sink pre-header defs that are unused in-loop into the unique exit to reduce + // pressure. + Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU, + SE, DT, Flags, ORE); + Flags.setIsSink(false); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L, @@ -1456,19 +1468,80 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater &MSSAU, - ScalarEvolution *SE) { + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, + MemorySSA::InsertionPlace Point) { SafetyInfo.removeInstruction(&I); SafetyInfo.insertInstructionTo(&I, Dest->getParent()); I.moveBefore(*Dest->getParent(), Dest); if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>( MSSAU.getMemorySSA()->getMemoryAccess(&I))) - MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), - MemorySSA::BeforeTerminator); + MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), Point); if (SE) SE->forgetBlockAndLoopDispositions(&I); } +// If there's a single exit block, sink any loop-invariant values that were +// defined in the preheader but not used inside the loop into the exit block +// to reduce register pressure in the loop. +static bool sinkUnusedInvariantsFromPreheaderToExit( + Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT, + SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) { + BasicBlock *ExitBlock = L->getExitBlock(); + if (!ExitBlock) + return false; + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + return false; + + bool MadeAnyChanges = false; + + for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { + + // Skip terminator. + if (Preheader->getTerminator() == &I) + continue; + + // New instructions were inserted at the end of the preheader. + if (isa<PHINode>(I)) + break; + + // Don't move instructions which might have side effects, since the side + // effects need to complete before instructions inside the loop. Note that + // it's okay if the instruction might have undefined behavior: LoopSimplify + // guarantees that the preheader dominates the exit block. + if (I.mayHaveSideEffects()) + continue; + + if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr)) + continue; + + // Determine if there is a use in or before the loop (direct or + // otherwise). + bool UsedInLoopOrPreheader = false; + for (Use &U : I.uses()) { + auto *UserI = cast<Instruction>(U.getUser()); + BasicBlock *UseBB = UserI->getParent(); + if (auto *PN = dyn_cast<PHINode>(UserI)) { + UseBB = PN->getIncomingBlock(U); + } + if (UseBB == Preheader || L->contains(UseBB)) { + UsedInLoopOrPreheader = true; + break; + } + } + if (UsedInLoopOrPreheader) + continue; + + moveInstructionBefore(I, ExitBlock->getFirstInsertionPt(), *SafetyInfo, + MSSAU, SE, MemorySSA::Beginning); + MadeAnyChanges = true; + } + + return MadeAnyChanges; +} + static Instruction *sinkThroughTriviallyReplaceablePHI( PHINode *TPN, Instruction *I, LoopInfo *LI, SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies, diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 1a279b6..001215a 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1318,6 +1318,11 @@ public: /// the loop, in which case some special-case heuristics may be used. bool AllFixupsOutsideLoop = true; + /// This records whether all of the fixups using this LSRUse are unconditional + /// within the loop, meaning they will be executed on every path to the loop + /// latch. This includes fixups before early exits. + bool AllFixupsUnconditional = true; + /// RigidFormula is set to true to guarantee that this use will be associated /// with a single formula--the one that initially matched. Some SCEV /// expressions cannot be expanded. This allows LSR to consider the registers @@ -1421,16 +1426,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) || TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) { const SCEV *Start; - const SCEVConstant *Step; - if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step)))) + const APInt *Step; + if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) { // If the step size matches the base offset, we could use pre-indexed // addressing. - if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() && - Step->getAPInt() == F.BaseOffset.getFixedValue()) || - ((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) && - SE->isLoopInvariant(Start, L))) + bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) && + F.BaseOffset.isFixed() && + *Step == F.BaseOffset.getFixedValue(); + bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) && + !isa<SCEVConstant>(Start) && + SE->isLoopInvariant(Start, L); + // We can only pre or post index when the load/store is unconditional. + if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional) LoopCost = 0; + } } + // If the loop counts down to zero and we'll be using a hardware loop then // the addrec will be combined into the hardware loop instruction. if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() && @@ -1783,6 +1794,9 @@ void LSRUse::print(raw_ostream &OS) const { if (AllFixupsOutsideLoop) OS << ", all-fixups-outside-loop"; + if (AllFixupsUnconditional) + OS << ", all-fixups-unconditional"; + if (WidestFixupType) OS << ", widest fixup type: " << *WidestFixupType; } @@ -2213,6 +2227,7 @@ class LSRInstance { void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); void CountRegisters(const Formula &F, size_t LUIdx); bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F); + bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const; void CollectLoopInvariantFixupsAndFormulae(); @@ -3607,6 +3622,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = TmpPostIncLoops; LF.Offset = Offset; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF); // Create SCEV as Formula for calculating baseline cost if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) { @@ -3680,6 +3696,14 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { return true; } +/// Test whether this fixup will be executed each time the corresponding IV +/// increment instruction is executed. +bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const { + // If the fixup block dominates the IV increment block then there is no path + // through the loop to the increment that doesn't pass through the fixup. + return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent()); +} + /// Check for other uses of loop-invariant values which we're tracking. These /// other uses will pin these values in registers, making them less profitable /// for elimination. @@ -3803,6 +3827,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LF.OperandValToReplace = U; LF.Offset = Offset; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF); if (!LU.WidestFixupType || SE.getTypeSizeInBits(LU.WidestFixupType) < SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) @@ -4940,6 +4965,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional; // Transfer the fixups of LU to LUThatHas. for (LSRFixup &Fixup : LU.Fixups) { diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 3487e81..7e70ba2 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -245,11 +245,14 @@ raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) { } // namespace -static bool isUniformShape(Value *V) { +static bool isShapePreserving(Value *V) { Instruction *I = dyn_cast<Instruction>(V); if (!I) return true; + if (isa<SelectInst>(I)) + return true; + if (I->isBinaryOp()) return true; @@ -300,6 +303,16 @@ static bool isUniformShape(Value *V) { } } +/// Return an iterator over the operands of \p I that should share shape +/// information with \p I. +static iterator_range<Use *> getShapedOperandsForInst(Instruction *I) { + assert(isShapePreserving(I) && + "Can't retrieve shaped operands for an instruction that does not " + "preserve shape information"); + auto Ops = I->operands(); + return isa<SelectInst>(I) ? drop_begin(Ops) : Ops; +} + /// Return the ShapeInfo for the result of \p I, it it can be determined. static std::optional<ShapeInfo> computeShapeInfoForInst(Instruction *I, @@ -329,9 +342,8 @@ computeShapeInfoForInst(Instruction *I, return OpShape->second; } - if (isUniformShape(I) || isa<SelectInst>(I)) { - auto Ops = I->operands(); - auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops; + if (isShapePreserving(I)) { + auto ShapedOps = getShapedOperandsForInst(I); // Find the first operand that has a known shape and use that. for (auto &Op : ShapedOps) { auto OpShape = ShapeMap.find(Op.get()); @@ -710,10 +722,9 @@ public: case Intrinsic::matrix_column_major_store: return true; default: - return isUniformShape(II); + break; } - return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) || - isa<SelectInst>(V); + return isShapePreserving(V) || isa<StoreInst>(V) || isa<LoadInst>(V); } /// Propagate the shape information of instructions to their users. @@ -800,9 +811,8 @@ public: } else if (isa<StoreInst>(V)) { // Nothing to do. We forward-propagated to this so we would just // backward propagate to an instruction with an already known shape. - } else if (isUniformShape(V) || isa<SelectInst>(V)) { - auto Ops = cast<Instruction>(V)->operands(); - auto ShapedOps = isa<SelectInst>(V) ? drop_begin(Ops) : Ops; + } else if (isShapePreserving(V)) { + auto ShapedOps = getShapedOperandsForInst(cast<Instruction>(V)); // Propagate to all operands. ShapeInfo Shape = ShapeMap[V]; for (Use &U : ShapedOps) { diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index e043d07..08be5df 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1534,8 +1534,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, bool SrcNotDom = false; auto CaptureTrackingWithModRef = - [&](Instruction *AI, - function_ref<bool(Instruction *)> ModRefCallback) -> bool { + [&](Instruction *AI, function_ref<bool(Instruction *)> ModRefCallback, + bool &AddressCaptured) -> bool { SmallVector<Instruction *, 8> Worklist; Worklist.push_back(AI); unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking(); @@ -1559,8 +1559,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, if (!Visited.insert(&U).second) continue; UseCaptureInfo CI = DetermineUseCaptureKind(U, AI); - if (capturesAnything(CI.UseCC)) + if (capturesAnyProvenance(CI.UseCC)) return false; + AddressCaptured |= capturesAddress(CI.UseCC); if (UI->mayReadOrWriteMemory()) { if (UI->isLifetimeStartOrEnd()) { @@ -1627,7 +1628,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, return true; }; - if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback)) + bool DestAddressCaptured = false; + if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback, + DestAddressCaptured)) return false; // Bailout if Dest may have any ModRef before Store. if (!ReachabilityWorklist.empty() && @@ -1653,7 +1656,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, return true; }; - if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback)) + bool SrcAddressCaptured = false; + if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback, + SrcAddressCaptured)) + return false; + + // If both the source and destination address are captured, the fact that they + // are no longer two separate allocations may be observed. + if (DestAddressCaptured && SrcAddressCaptured) return false; // We can do the transformation. First, move the SrcAlloca to the start of the diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 5af6c96..bb6c879 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -81,6 +81,7 @@ STATISTIC( STATISTIC(NumInvariantConditionsInjected, "Number of invariant conditions injected and unswitched"); +namespace llvm { static cl::opt<bool> EnableNonTrivialUnswitch( "enable-nontrivial-unswitch", cl::init(false), cl::Hidden, cl::desc("Forcibly enables non-trivial loop unswitching rather than " @@ -131,11 +132,17 @@ static cl::opt<bool> InjectInvariantConditions( static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold( "simple-loop-unswitch-inject-invariant-condition-hotness-threshold", - cl::Hidden, cl::desc("Only try to inject loop invariant conditions and " - "unswitch on them to eliminate branches that are " - "not-taken 1/<this option> times or less."), + cl::Hidden, + cl::desc("Only try to inject loop invariant conditions and " + "unswitch on them to eliminate branches that are " + "not-taken 1/<this option> times or less."), cl::init(16)); +static cl::opt<bool> EstimateProfile("simple-loop-unswitch-estimate-profile", + cl::Hidden, cl::init(true)); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // namespace llvm + AnalysisKey ShouldRunExtraSimpleLoopUnswitch::Key; namespace { struct CompareDesc { @@ -268,13 +275,42 @@ static bool areLoopExitPHIsLoopInvariant(const Loop &L, llvm_unreachable("Basic blocks should never be empty!"); } -/// Copy a set of loop invariant values \p ToDuplicate and insert them at the +/// Copy a set of loop invariant values \p Invariants and insert them at the /// end of \p BB and conditionally branch on the copied condition. We only /// branch on a single value. +/// We attempt to estimate the profile of the resulting conditional branch from +/// \p ComputeProfFrom, which is the original conditional branch we're +/// unswitching. +/// When \p Direction is true, the \p Invariants form a disjunction, and the +/// branch conditioned on it exits the loop on the "true" case. When \p +/// Direction is false, the \p Invariants form a conjunction and the branch +/// exits on the "false" case. static void buildPartialUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, - const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { + const Instruction *I, AssumptionCache *AC, const DominatorTree &DT, + const BranchInst &ComputeProfFrom) { + + SmallVector<uint32_t> BranchWeights; + bool HasBranchWeights = EstimateProfile && !ProfcheckDisableMetadataFixes && + extractBranchWeights(ComputeProfFrom, BranchWeights); + // If Direction is true, that means we had a disjunction and that the "true" + // case exits. The probability of the disjunction of the subset of terms is at + // most as high as the original one. So, if the probability is higher than the + // one we'd assign in absence of a profile (i.e. 0.5), we will use 0.5, + // but if it's lower, we will use the original probability. + // Conversely, if Direction is false, that means we had a conjunction, and the + // probability of exiting is captured in the second branch weight. That + // probability is a disjunction (of the negation of the original terms). The + // same reasoning applies as above. + // Issue #165649: should we expect BFI to conserve, and use that to calculate + // the branch weights? + if (HasBranchWeights && + static_cast<double>(BranchWeights[Direction ? 0 : 1]) / + static_cast<double>(sum_of(BranchWeights)) > + 0.5) + HasBranchWeights = false; + IRBuilder<> IRB(&BB); IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); @@ -287,8 +323,14 @@ static void buildPartialUnswitchConditionalBranch( Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants) : IRB.CreateAnd(FrozenInvariants); - IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, - Direction ? &NormalSucc : &UnswitchedSucc); + auto *BR = IRB.CreateCondBr( + Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc, + HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof) + : nullptr); + if (!HasBranchWeights) + setExplicitlyUnknownBranchWeightsIfProfiled( + *BR, *BR->getParent()->getParent(), DEBUG_TYPE); } /// Copy a set of loop invariant values, and conditionally branch on them. @@ -658,7 +700,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, " condition!"); buildPartialUnswitchConditionalBranch( *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH, - FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT); + FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT, BI); } // Update the dominator tree with the added edge. @@ -2477,7 +2519,7 @@ static void unswitchNontrivialInvariants( else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, - FreezeLoopUnswitchCond, BI, &AC, DT); + FreezeLoopUnswitchCond, BI, &AC, DT, *BI); } DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 0f3978f..5f6f66a 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -143,8 +143,8 @@ struct SubGraphTraits { class WrappedSuccIterator : public iterator_adaptor_base< WrappedSuccIterator, BaseSuccIterator, - typename std::iterator_traits<BaseSuccIterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + std::iterator_traits<BaseSuccIterator>::iterator_category, NodeRef, + std::ptrdiff_t, NodeRef *, NodeRef> { SmallDenseSet<RegionNode *> *Nodes; public: diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 9829d4d..11db0ec 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -674,6 +674,79 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName); } +/// Helper function to update the cycle or loop information after inserting a +/// new block between a callbr instruction and one of its target blocks. Adds +/// the new block to the innermost cycle or loop that the callbr instruction and +/// the original target block share. +/// \p LCI cycle or loop information to update +/// \p CallBrBlock block containing the callbr instruction +/// \p CallBrTarget new target block of the callbr instruction +/// \p Succ original target block of the callbr instruction +template <typename TI, typename T> +static bool updateCycleLoopInfo(TI *LCI, BasicBlock *CallBrBlock, + BasicBlock *CallBrTarget, BasicBlock *Succ) { + static_assert(std::is_same_v<TI, CycleInfo> || std::is_same_v<TI, LoopInfo>, + "type must be CycleInfo or LoopInfo"); + if (!LCI) + return false; + + T *LC; + if constexpr (std::is_same_v<TI, CycleInfo>) + LC = LCI->getSmallestCommonCycle(CallBrBlock, Succ); + else + LC = LCI->getSmallestCommonLoop(CallBrBlock, Succ); + if (!LC) + return false; + + if constexpr (std::is_same_v<TI, CycleInfo>) + LCI->addBlockToCycle(CallBrTarget, LC); + else + LC->addBasicBlockToLoop(CallBrTarget, *LCI); + + return true; +} + +BasicBlock *llvm::SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ, + unsigned SuccIdx, DomTreeUpdater *DTU, + CycleInfo *CI, LoopInfo *LI, + bool *UpdatedLI) { + CallBrInst *CallBr = dyn_cast<CallBrInst>(CallBrBlock->getTerminator()); + assert(CallBr && "expected callbr terminator"); + assert(SuccIdx < CallBr->getNumSuccessors() && + Succ == CallBr->getSuccessor(SuccIdx) && "invalid successor index"); + + // Create a new block between callbr and the specified successor. + // splitBlockBefore cannot be re-used here since it cannot split if the split + // point is a PHI node (because BasicBlock::splitBasicBlockBefore cannot + // handle that). But we don't need to rewire every part of a potential PHI + // node. We only care about the edge between CallBrBlock and the original + // successor. + BasicBlock *CallBrTarget = + BasicBlock::Create(CallBrBlock->getContext(), + CallBrBlock->getName() + ".target." + Succ->getName(), + CallBrBlock->getParent()); + // Rewire control flow from the new target block to the original successor. + Succ->replacePhiUsesWith(CallBrBlock, CallBrTarget); + // Rewire control flow from callbr to the new target block. + CallBr->setSuccessor(SuccIdx, CallBrTarget); + // Jump from the new target block to the original successor. + BranchInst::Create(Succ, CallBrTarget); + + bool Updated = + updateCycleLoopInfo<LoopInfo, Loop>(LI, CallBrBlock, CallBrTarget, Succ); + if (UpdatedLI) + *UpdatedLI = Updated; + updateCycleLoopInfo<CycleInfo, Cycle>(CI, CallBrBlock, CallBrTarget, Succ); + if (DTU) { + DTU->applyUpdates({{DominatorTree::Insert, CallBrBlock, CallBrTarget}}); + if (DTU->getDomTree().dominates(CallBrBlock, Succ)) + DTU->applyUpdates({{DominatorTree::Delete, CallBrBlock, Succ}, + {DominatorTree::Insert, CallBrTarget, Succ}}); + } + + return CallBrTarget; +} + void llvm::setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) { if (auto *II = dyn_cast<InvokeInst>(TI)) II->setUnwindDest(Succ); diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp index 0046a00..287a177 100644 --- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp +++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Utils/ControlFlowUtils.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/ValueHandle.h" @@ -281,7 +282,9 @@ std::pair<BasicBlock *, bool> ControlFlowHub::finalize( for (auto [BB, Succ0, Succ1] : Branches) { #ifndef NDEBUG - assert(Incoming.insert(BB).second && "Duplicate entry for incoming block."); + assert( + (Incoming.insert(BB).second || isa<CallBrInst>(BB->getTerminator())) && + "Duplicate entry for incoming block."); #endif if (Succ0) Outgoing.insert(Succ0); diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp index 45e1d12..804af22 100644 --- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp +++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp @@ -79,6 +79,53 @@ // Limitation: The pass cannot handle switch statements and indirect // branches. Both must be lowered to plain branches first. // +// CallBr support: CallBr is handled as a more general branch instruction which +// can have multiple successors. The pass redirects the edges to intermediate +// target blocks that unconditionally branch to the original callbr target +// blocks. This allows the control flow hub to know to which of the original +// target blocks to jump to. +// Example input CFG: +// Entry (callbr) +// / \ +// v v +// H ----> B +// ^ /| +// `----' | +// v +// Exit +// +// becomes: +// Entry (callbr) +// / \ +// v v +// target.H target.B +// | | +// v v +// H ----> B +// ^ /| +// `----' | +// v +// Exit +// +// Note +// OUTPUT CFG: Converted to a natural loop with a new header N. +// +// Entry (callbr) +// / \ +// v v +// target.H target.B +// \ / +// \ / +// v v +// N <---. +// / \ \ +// / \ | +// v v / +// H --> B --' +// | +// v +// Exit +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/FixIrreducible.h" @@ -231,6 +278,7 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, return false; LLVM_DEBUG(dbgs() << "Processing cycle:\n" << CI.print(&C) << "\n";); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); ControlFlowHub CHub; SetVector<BasicBlock *> Predecessors; @@ -242,18 +290,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, } for (BasicBlock *P : Predecessors) { - auto *Branch = cast<BranchInst>(P->getTerminator()); - // Exactly one of the two successors is the header. - BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr; - BasicBlock *Succ1 = Succ0 ? nullptr : Header; - if (!Succ0) - assert(Branch->getSuccessor(1) == Header); - assert(Succ0 || Succ1); - CHub.addBranch(P, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added internal branch: " << P->getName() << " -> " - << (Succ0 ? Succ0->getName() : "") << " " - << (Succ1 ? Succ1->getName() : "") << "\n"); + if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator())) { + // Exactly one of the two successors is the header. + BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr; + BasicBlock *Succ1 = Succ0 ? nullptr : Header; + assert(Succ0 || Branch->getSuccessor(1) == Header); + assert(Succ0 || Succ1); + CHub.addBranch(P, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added internal branch: " << printBasicBlock(P) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) { + for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) { + BasicBlock *Succ = CallBr->getSuccessor(I); + if (Succ != Header) + continue; + BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI); + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added internal branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } // Redirect external incoming edges. This includes the edges on the header. @@ -266,17 +328,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, } for (BasicBlock *P : Predecessors) { - auto *Branch = cast<BranchInst>(P->getTerminator()); - BasicBlock *Succ0 = Branch->getSuccessor(0); - Succ0 = C.contains(Succ0) ? Succ0 : nullptr; - BasicBlock *Succ1 = - Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); - Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr; - CHub.addBranch(P, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added external branch: " << P->getName() << " -> " - << (Succ0 ? Succ0->getName() : "") << " " - << (Succ1 ? Succ1->getName() : "") << "\n"); + if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator()); Branch) { + BasicBlock *Succ0 = Branch->getSuccessor(0); + Succ0 = C.contains(Succ0) ? Succ0 : nullptr; + BasicBlock *Succ1 = + Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); + Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr; + CHub.addBranch(P, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added external branch: " << printBasicBlock(P) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) { + for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) { + BasicBlock *Succ = CallBr->getSuccessor(I); + if (!C.contains(Succ)) + continue; + BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI); + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added external branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } // Redirect all the backedges through a "hub" consisting of a series @@ -292,7 +369,6 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT, SetVector<BasicBlock *> Entries; Entries.insert(C.entry_rbegin(), C.entry_rend()); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); CHub.finalize(&DTU, GuardBlocks, "irr"); #if defined(EXPENSIVE_CHECKS) assert(DT.verify(DominatorTree::VerificationLevel::Full)); @@ -325,8 +401,6 @@ static bool FixIrreducibleImpl(Function &F, CycleInfo &CI, DominatorTree &DT, LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: " << F.getName() << "\n"); - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - bool Changed = false; for (Cycle *TopCycle : CI.toplevel_cycles()) { for (Cycle *C : depth_first(TopCycle)) { diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 4fe736a..94dfd3a 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -499,9 +499,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L); const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); - unsigned EstimatedLoopInvocationWeight = 0; std::optional<unsigned> OriginalTripCount = - llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight); + llvm::getLoopEstimatedTripCount(L); + BranchProbability OriginalLoopProb = llvm::getLoopProbability(L); // Effectively "DCE" unrolled iterations that are beyond the max tripcount // and will never be executed. @@ -592,11 +592,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, : isEpilogProfitable(L); if (ULO.Runtime && - !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount, - EpilogProfitability, ULO.UnrollRemainder, - ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, - PreserveLCSSA, ULO.SCEVExpansionBudget, - ULO.RuntimeUnrollMultiExit, RemainderLoop)) { + !UnrollRuntimeLoopRemainder( + L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability, + ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, + PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit, + RemainderLoop, OriginalTripCount, OriginalLoopProb)) { if (ULO.Force) ULO.Runtime = false; else { @@ -1130,11 +1130,46 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, LI->erase(L); // We shouldn't try to use `L` anymore. L = nullptr; - } else if (OriginalTripCount) { - // Update the trip count. Note that the remainder has already logic - // computing it in `UnrollRuntimeLoopRemainder`. - setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count, - EstimatedLoopInvocationWeight); + } else { + // Update metadata for the loop's branch weights and estimated trip count: + // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch + // weights, latch branch weights, and estimated trip count of the + // remainder loop it creates. It also sets the branch weights for the + // unrolled loop guard it creates. The branch weights for the unrolled + // loop latch are adjusted below. FIXME: Handle prologue loops. + // - Otherwise, if unrolled loop iteration latches become unconditional, + // branch weights are adjusted above. FIXME: Actually handle such + // unconditional latches. + // - Otherwise, the original loop's branch weights are correct for the + // unrolled loop, so do not adjust them. + // - In all cases, the unrolled loop's estimated trip count is set below. + // + // As an example of the last case, consider what happens if the unroll count + // is 4 for a loop with an estimated trip count of 10 when we do not create + // a remainder loop and all iterations' latches remain conditional. Each + // unrolled iteration's latch still has the same probability of exiting the + // loop as it did when in the original loop, and thus it should still have + // the same branch weights. Each unrolled iteration's non-zero probability + // of exiting already appropriately reduces the probability of reaching the + // remaining iterations just as it did in the original loop. Trying to also + // adjust the branch weights of the final unrolled iteration's latch (i.e., + // the backedge for the unrolled loop as a whole) to reflect its new trip + // count of 3 will erroneously further reduce its block frequencies. + // However, in case an analysis later needs to estimate the trip count of + // the unrolled loop as a whole without considering the branch weights for + // each unrolled iteration's latch within it, we store the new trip count as + // separate metadata. + if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) { + // Where p is always the probability of executing at least 1 more + // iteration, the probability for at least n more iterations is p^n. + setLoopProbability(L, OriginalLoopProb.pow(ULO.Count)); + } + if (OriginalTripCount) { + unsigned NewTripCount = *OriginalTripCount / ULO.Count; + if (!ULO.Runtime && *OriginalTripCount % ULO.Count) + ++NewTripCount; + setLoopEstimatedTripCount(L, NewTripCount); + } } // LoopInfo should not be valid, confirm that. diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 6312831..1e8f6cc 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include <cmath> using namespace llvm; @@ -195,6 +196,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, } } +/// Assume, due to our position in the remainder loop or its guard, anywhere +/// from 0 to \p N more iterations can possibly execute. Among such cases in +/// the original loop (with loop probability \p OriginalLoopProb), what is the +/// probability of executing at least one more iteration? +static BranchProbability +probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // Each of these variables holds the original loop's probability that the + // number of iterations it will execute is some m in the specified range. + BranchProbability ProbOne = OriginalLoopProb; // 1 <= m + BranchProbability ProbTooMany = ProbOne.pow(N + 1); // N + 1 <= m + BranchProbability ProbNotTooMany = ProbTooMany.getCompl(); // 0 <= m <= N + BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N + return ProbOneNotTooMany / ProbNotTooMany; +} + /// Connect the unrolling epilog code to the original loop. /// The unrolling epilog code contains code to execute the /// 'extra' iterations if the run-time trip count modulo the @@ -221,7 +237,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE, - unsigned Count, AssumptionCache &AC) { + unsigned Count, AssumptionCache &AC, + BranchProbability OriginalLoopProb) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]); @@ -332,12 +349,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, PreserveLCSSA); // Add the branch to the exit block (around the epilog loop) MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if (OriginalLoopProb.isUnknown() && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume equal distribution in interval [0, Count). MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(1, Count - 1); } - B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + BranchInst *RemainderLoopGuard = + B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + if (!OriginalLoopProb.isUnknown()) { + setBranchProbability(RemainderLoopGuard, + probOfNextInRemainder(OriginalLoopProb, Count - 1), + /*ForFirstTarget=*/true); + } InsertPt->eraseFromParent(); if (DT) { auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit); @@ -357,14 +381,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, /// The cloned blocks should be inserted between InsertTop and InsertBot. /// InsertTop should be new preheader, InsertBot new loop exit. /// Returns the new cloned loop that is created. -static Loop * -CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, - const bool UnrollRemainder, - BasicBlock *InsertTop, - BasicBlock *InsertBot, BasicBlock *Preheader, +static Loop *CloneLoopBlocks(Loop *L, Value *NewIter, + const bool UseEpilogRemainder, + const bool UnrollRemainder, BasicBlock *InsertTop, + BasicBlock *InsertBot, BasicBlock *Preheader, std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, - DominatorTree *DT, LoopInfo *LI, unsigned Count) { + DominatorTree *DT, LoopInfo *LI, unsigned Count, + std::optional<unsigned> OriginalTripCount, + BranchProbability OriginalLoopProb) { StringRef suffix = UseEpilogRemainder ? "epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -419,7 +444,8 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next"); Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp"); MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*LatchBR)) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*LatchBR)) { uint32_t ExitWeight; uint32_t BackEdgeWeight; if (Count >= 3) { @@ -437,7 +463,29 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, MDBuilder MDB(Builder.getContext()); BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight); } - Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + BranchInst *RemainderLoopLatch = + Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // Compute the total frequency of the original loop body from the + // remainder iterations. Once we've reached them, the first of them + // always executes, so its frequency and probability are 1. + double FreqRemIters = 1; + if (Count > 2) { + BranchProbability ProbReaching = BranchProbability::getOne(); + for (unsigned N = Count - 2; N >= 1; --N) { + ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N); + FreqRemIters += double(ProbReaching.getNumerator()) / + ProbReaching.getDenominator(); + } + } + // Solve for the loop probability that would produce that frequency. + // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters. + double ProbDouble = 1 - 1 / FreqRemIters; + BranchProbability Prob = BranchProbability::getBranchProbability( + std::round(ProbDouble * BranchProbability::getDenominator()), + BranchProbability::getDenominator()); + setBranchProbability(RemainderLoopLatch, Prob, /*ForFirstTarget=*/true); + } NewIdx->addIncoming(Zero, InsertTop); NewIdx->addIncoming(IdxNext, NewBB); LatchBR->eraseFromParent(); @@ -460,25 +508,13 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Loop *NewLoop = NewLoops[L]; assert(NewLoop && "L should have been cloned"); - MDNode *LoopID = NewLoop->getLoopID(); - // Only add loop metadata if the loop is not going to be completely - // unrolled. - if (UnrollRemainder) - return NewLoop; - - std::optional<MDNode *> NewLoopID = makeFollowupLoopID( - LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); - if (NewLoopID) { - NewLoop->setLoopID(*NewLoopID); - - // Do not setLoopAlreadyUnrolled if loop attributes have been defined - // explicitly. - return NewLoop; - } + if (OriginalTripCount && UseEpilogRemainder) + setLoopEstimatedTripCount(NewLoop, *OriginalTripCount % Count); // Add unroll disable metadata to disable future unrolling for this loop. - NewLoop->setLoopAlreadyUnrolled(); + if (!UnrollRemainder) + NewLoop->setLoopAlreadyUnrolled(); return NewLoop; } @@ -603,7 +639,8 @@ bool llvm::UnrollRuntimeLoopRemainder( LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, bool PreserveLCSSA, unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit, - Loop **ResultLoop) { + Loop **ResultLoop, std::optional<unsigned> OriginalTripCount, + BranchProbability OriginalLoopProb) { LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); LLVM_DEBUG(L->dump()); LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n" @@ -823,12 +860,23 @@ bool llvm::UnrollRuntimeLoopRemainder( BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; // Branch to either remainder (extra iterations) loop or unrolling loop. MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume loop is nearly always entered. MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights); } - B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + BranchInst *UnrollingLoopGuard = + B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // The original loop's first iteration always happens. Compute the + // probability of the original loop executing Count-1 iterations after that + // to complete the first iteration of the unrolled loop. + BranchProbability ProbOne = OriginalLoopProb; + BranchProbability ProbRest = ProbOne.pow(Count - 1); + setBranchProbability(UnrollingLoopGuard, ProbRest, + /*ForFirstTarget=*/false); + } PreHeaderBR->eraseFromParent(); if (DT) { if (UseEpilogRemainder) @@ -855,9 +903,10 @@ bool llvm::UnrollRuntimeLoopRemainder( // iterations. This function adds the appropriate CFG connections. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; - Loop *remainderLoop = CloneLoopBlocks( - L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot, - NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count); + Loop *remainderLoop = + CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, + InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, + LI, Count, OriginalTripCount, OriginalLoopProb); // Insert the cloned blocks into the function. F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end()); @@ -956,7 +1005,8 @@ bool llvm::UnrollRuntimeLoopRemainder( // Connect the epilog code to the original loop and update the // PHI functions. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC, + OriginalLoopProb); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b6ba822..6e60b94 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -962,13 +962,54 @@ bool llvm::setLoopEstimatedTripCount( if (LatchBranch->getSuccessor(0) != L->getHeader()) std::swap(BackedgeTakenWeight, LatchExitWeight); - MDBuilder MDB(LatchBranch->getContext()); - // Set/Update profile metadata. - LatchBranch->setMetadata( - LLVMContext::MD_prof, - MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight)); + setBranchWeights(*LatchBranch, {BackedgeTakenWeight, LatchExitWeight}, + /*IsExpected=*/false); + + return true; +} + +BranchProbability llvm::getLoopProbability(Loop *L) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return BranchProbability::getUnknown(); + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return getBranchProbability(LatchBranch, FirstTargetIsLoop); +} +bool llvm::setLoopProbability(Loop *L, BranchProbability P) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return false; + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return setBranchProbability(LatchBranch, P, FirstTargetIsLoop); +} + +BranchProbability llvm::getBranchProbability(BranchInst *B, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return BranchProbability::getUnknown(); + uint64_t Weight0, Weight1; + if (!extractBranchWeights(*B, Weight0, Weight1)) + return BranchProbability::getUnknown(); + uint64_t Denominator = Weight0 + Weight1; + if (Denominator == 0) + return BranchProbability::getUnknown(); + if (!ForFirstTarget) + std::swap(Weight0, Weight1); + return BranchProbability::getBranchProbability(Weight0, Denominator); +} + +bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return false; + BranchProbability Prob0 = P; + BranchProbability Prob1 = P.getCompl(); + if (!ForFirstTarget) + std::swap(Prob0, Prob1); + setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()}, + /*IsExpected=*/false); return true; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index b03fb62..cbc604e 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -80,6 +80,7 @@ #include <algorithm> #include <cassert> #include <climits> +#include <cmath> #include <cstddef> #include <cstdint> #include <iterator> @@ -5955,7 +5956,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Update weight for the newly-created conditional branch. - if (hasBranchWeightMD(*SI)) { + if (hasBranchWeightMD(*SI) && NewBI->isConditional()) { SmallVector<uint64_t, 8> Weights; getBranchWeights(SI, Weights); if (Weights.size() == 1 + SI->getNumCases()) { @@ -5977,14 +5978,14 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Prune obsolete incoming values off the successors' PHI nodes. - for (auto BBI = Dest->begin(); isa<PHINode>(BBI); ++BBI) { + for (auto &PHI : make_early_inc_range(Dest->phis())) { unsigned PreviousEdges = Cases->size(); if (Dest == SI->getDefaultDest()) ++PreviousEdges; for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I) - cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); + PHI.removeIncomingValue(SI->getParent()); } - for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) { + for (auto &PHI : make_early_inc_range(OtherDest->phis())) { unsigned PreviousEdges = OtherCases->size(); if (OtherDest == SI->getDefaultDest()) ++PreviousEdges; @@ -5993,7 +5994,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, if (NewBI->isUnconditional()) ++E; for (unsigned I = 0; I != E; ++I) - cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); + PHI.removeIncomingValue(SI->getParent()); } // Clean up the default block - it may have phis or other instructions before @@ -7632,7 +7633,33 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, auto *DefaultCaseBB = SI->getDefaultDest(); BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU); auto It = OrigBB->getTerminator()->getIterator(); + SmallVector<uint32_t> Weights; + auto HasWeights = + !ProfcheckDisableMetadataFixes && extractBranchWeights(*SI, Weights); auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); + if (HasWeights && any_of(Weights, [](const auto &V) { return V != 0; })) { + // IsPow2 covers a subset of the cases in which we'd go to the default + // label. The other is those powers of 2 that don't appear in the case + // statement. We don't know the distribution of the values coming in, so + // the safest is to split 50-50 the original probability to `default`. + uint64_t OrigDenominator = sum_of(map_range( + Weights, [](const auto &V) { return static_cast<uint64_t>(V); })); + SmallVector<uint64_t> NewWeights(2); + NewWeights[1] = Weights[0] / 2; + NewWeights[0] = OrigDenominator - NewWeights[1]; + setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false); + + // For the original switch, we reduce the weight of the default by the + // amount by which the previous branch contributes to getting to default, + // and then make sure the remaining weights have the same relative ratio + // wrt eachother. + uint64_t CasesDenominator = OrigDenominator - Weights[0]; + Weights[0] /= 2; + for (auto &W : drop_begin(Weights)) + W = NewWeights[0] * static_cast<double>(W) / CasesDenominator; + + setBranchWeights(*SI, Weights, /*IsExpected=*/false); + } // BI is handling the default case for SI, and so should share its DebugLoc. BI->setDebugLoc(SI->getDebugLoc()); It->eraseFromParent(); diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 9f338db..94c5c170 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -12,7 +12,11 @@ // // Limitation: This assumes that all terminators in the CFG are direct branches // (the "br" instruction). The presence of any other control flow -// such as indirectbr, switch or callbr will cause an assert. +// such as indirectbr or switch will cause an assert. +// The callbr terminator is supported by creating intermediate +// target blocks that unconditionally branch to the original target +// blocks. These intermediate target blocks can then be redirected +// through the ControlFlowHub as usual. // //===----------------------------------------------------------------------===// @@ -150,25 +154,55 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector<BasicBlock *, 8> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. ControlFlowHub CHub; - for (auto *BB : ExitingBlocks) { - auto *Branch = cast<BranchInst>(BB->getTerminator()); - BasicBlock *Succ0 = Branch->getSuccessor(0); - Succ0 = L->contains(Succ0) ? nullptr : Succ0; - - BasicBlock *Succ1 = - Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); - Succ1 = L->contains(Succ1) ? nullptr : Succ1; - CHub.addBranch(BB, Succ0, Succ1); - - LLVM_DEBUG(dbgs() << "Added exiting branch: " << BB->getName() << " -> {" - << (Succ0 ? Succ0->getName() : "<none>") << ", " - << (Succ1 ? Succ1->getName() : "<none>") << "}\n"); + + for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { + BasicBlock *BB = ExitingBlocks[I]; + if (BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator())) { + BasicBlock *Succ0 = Branch->getSuccessor(0); + Succ0 = L->contains(Succ0) ? nullptr : Succ0; + + BasicBlock *Succ1 = + Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); + Succ1 = L->contains(Succ1) ? nullptr : Succ1; + CHub.addBranch(BB, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added extiting branch: " << printBasicBlock(BB) + << " -> " << printBasicBlock(Succ0) + << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1) + << '\n'); + } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(BB->getTerminator())) { + for (unsigned J = 0; J < CallBr->getNumSuccessors(); ++J) { + BasicBlock *Succ = CallBr->getSuccessor(J); + if (L->contains(Succ)) + continue; + bool UpdatedLI = false; + BasicBlock *NewSucc = + SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // Even if CallBr and Succ do not have a common parent loop, we need to + // add the new target block to the parent loop of the current loop. + if (!UpdatedLI) + CallBrTargetBlocksToFix.push_back(NewSucc); + // ExitingBlocks is later used to restore SSA, so we need to make sure + // that the blocks used for phi nodes in the guard blocks match the + // predecessors of the guard blocks, which, in the case of callbr, are + // the new intermediate target blocks instead of the callbr blocks + // themselves. + ExitingBlocks[I] = NewSucc; + CHub.addBranch(NewSucc, Succ); + LLVM_DEBUG(dbgs() << "Added exiting branch: " + << printBasicBlock(NewSucc) << " -> " + << printBasicBlock(Succ) << '\n'); + } + } else { + llvm_unreachable("unsupported block terminator"); + } } SmallVector<BasicBlock *, 8> GuardBlocks; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); BasicBlock *LoopExitBlock; bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( @@ -187,10 +221,19 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { // The guard blocks were created outside the loop, so they need to become // members of the parent loop. - if (auto ParentLoop = L->getParentLoop()) { + // Same goes for the callbr target blocks. Although we try to add them to the + // smallest common parent loop of the callbr block and the corresponding + // original target block, there might not have been such a loop, in which case + // the newly created callbr target blocks are not part of any loop. For nested + // loops, this might result in them leading to a loop with multiple entry + // points. + if (auto *ParentLoop = L->getParentLoop()) { for (auto *G : GuardBlocks) { ParentLoop->addBasicBlockToLoop(G, LI); } + for (auto *C : CallBrTargetBlocksToFix) { + ParentLoop->addBasicBlockToLoop(C, LI); + } ParentLoop->verifyLoop(); } @@ -218,8 +261,6 @@ bool UnifyLoopExitsLegacyPass::runOnFunction(Function &F) { auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - return runImpl(LI, DT); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 3fed003..04b0562 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -167,7 +167,7 @@ public: DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( - new VPInstruction(Opcode, Operands, Flags, DL, Name)); + new VPInstruction(Opcode, Operands, Flags, {}, DL, Name)); } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, @@ -184,7 +184,7 @@ public: DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( - new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); + new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name)); } VPInstruction *createNot(VPValue *Operand, @@ -205,7 +205,7 @@ public: return tryInsertInstruction(new VPInstruction( Instruction::BinaryOps::Or, {LHS, RHS}, - VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name)); + VPRecipeWithIRFlags::DisjointFlagsTy(false), {}, DL, Name)); } VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS, @@ -221,7 +221,7 @@ public: std::optional<FastMathFlags> FMFs = std::nullopt) { auto *Select = FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - *FMFs, DL, Name) + *FMFs, {}, DL, Name) : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, Name); return tryInsertInstruction(Select); @@ -235,7 +235,7 @@ public: assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( - new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name)); + new VPInstruction(Instruction::ICmp, {A, B}, Pred, {}, DL, Name)); } /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A @@ -246,7 +246,7 @@ public: assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE && Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( - new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name)); + new VPInstruction(Instruction::FCmp, {A, B}, Pred, {}, DL, Name)); } VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, @@ -254,7 +254,7 @@ public: const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::none(), DL, Name)); + GEPNoWrapFlags::none(), {}, DL, Name)); } VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset, @@ -262,7 +262,7 @@ public: DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction(new VPInstruction( - VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name)); + VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, {}, DL, Name)); } VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset, @@ -270,7 +270,7 @@ public: const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::WidePtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::none(), DL, Name)); + GEPNoWrapFlags::none(), {}, DL, Name)); } VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues, DebugLoc DL, @@ -280,8 +280,7 @@ public: VPValue *createElementCount(Type *Ty, ElementCount EC) { VPlan &Plan = *getInsertBlock()->getPlan(); - VPValue *RuntimeEC = - Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue())); + VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue()); if (EC.isScalable()) { VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty); RuntimeEC = EC.getKnownMinValue() == 1 @@ -304,9 +303,11 @@ public: } VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op, - Type *ResultTy, DebugLoc DL) { + Type *ResultTy, DebugLoc DL, + const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}) { return tryInsertInstruction( - new VPInstructionWithType(Opcode, Op, ResultTy, {}, DL)); + new VPInstructionWithType(Opcode, Op, ResultTy, DL, Flags, Metadata)); } VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f7968ab..e5c3f17 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3908,7 +3908,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( continue; VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4166,7 +4166,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -5750,13 +5750,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { getMemoryInstructionCost(I, ElementCount::getFixed(1)))); UpdateMemOpUserCost(cast<LoadInst>(I)); } else if (const auto *Group = getInterleavedAccessGroup(I)) { - // Scalarize an interleave group of address loads. - for (unsigned I = 0; I < Group->getFactor(); ++I) { - if (Instruction *Member = Group->getMember(I)) { - setWideningDecision( - Member, VF, CM_Scalarize, - (VF.getKnownMinValue() * - getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); + // Scalarize all members of this interleaved group when any member + // is used as an address. The address-used load skips scalarization + // overhead, other members include it. + for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) { + if (Instruction *Member = Group->getMember(Idx)) { + InstructionCost Cost = + AddrDefs.contains(Member) + ? (VF.getKnownMinValue() * + getMemoryInstructionCost(Member, + ElementCount::getFixed(1))) + : getMemInstScalarizationCost(Member, VF); + setWideningDecision(Member, VF, CM_Scalarize, Cost); UpdateMemOpUserCost(cast<LoadInst>(Member)); } } @@ -6871,7 +6876,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(), + OrigLoop); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7105,12 +7111,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The // legacy cost model doesn't properly model costs for such loops. assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || + !Legal->getLAI()->getSymbolicStrides().empty() || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) || @@ -7745,8 +7752,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, if (CM.isPredicatedInst(I)) { SmallVector<VPValue *> Ops(Operands); VPValue *Mask = getBlockInMask(Builder.getInsertBlock()); - VPValue *One = - Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); + VPValue *One = Plan.getConstantInt(I->getType(), 1u); auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); Ops[1] = SafeRHS; return new VPWidenRecipe(*I, Ops); @@ -7799,11 +7805,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, } case Instruction::ExtractValue: { SmallVector<VPValue *> NewOps(Operands); - Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); auto *EVI = cast<ExtractValueInst>(I); assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; - NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); + NewOps.push_back(Plan.getConstantInt(32, Idx)); return new VPWidenRecipe(*I, NewOps); } }; @@ -8172,8 +8177,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, "Expected an ADD or SUB operation for predicated partial " "reductions (because the neutral element in the mask is zero)!"); Cond = getBlockInMask(Builder.getInsertBlock()); - VPValue *Zero = - Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0)); + VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0); BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc()); } return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, @@ -8335,11 +8339,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( &R) || (isa<VPInstruction>(&R) && !UnderlyingValue)) continue; - - // FIXME: VPlan0, which models a copy of the original scalar loop, should - // not use VPWidenPHIRecipe to model the phis. - assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) && - UnderlyingValue && "unsupported recipe"); + assert(isa<VPInstruction>(&R) && UnderlyingValue && "unsupported recipe"); // TODO: Gradually replace uses of underlying instruction by analyses on // VPlan. @@ -8440,7 +8440,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // and mulacc-reduction are implemented. if (!CM.foldTailWithEVL()) { VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -8640,7 +8640,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs && CurrentLinkI->getOpcode() == Instruction::Sub) { Type *PhiTy = PhiR->getUnderlyingValue()->getType(); - auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0)); + auto *Zero = Plan->getConstantInt(PhiTy, 0); VPWidenRecipe *Sub = new VPWidenRecipe( Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {}, VPIRMetadata(), CurrentLinkI->getDebugLoc()); @@ -8854,8 +8854,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ToDelete.push_back(Select); // Convert the reduction phi to operate on bools. - PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( - OrigLoop->getHeader()->getContext()))); + PhiR->setOperand(0, Plan->getFalse()); continue; } @@ -8877,9 +8876,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( unsigned ScaleFactor = RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr()) .value_or(1); - Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext()); - auto *ScaleFactorVPV = - Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor)); + auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor); VPValue *StartV = PHBuilder.createNaryOp( VPInstruction::ReductionStartVector, {PhiR->getStartValue(), Iden, ScaleFactorVPV}, @@ -9910,7 +9907,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind, *CM.PSE.getSE()); + CM.CostKind, *CM.PSE.getSE(), L); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1b55a3b..bf3f52c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20975,6 +20975,27 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, if (isa<PHINode>(S.getMainOp()) || isVectorLikeInstWithConstOps(S.getMainOp())) return nullptr; + // If the parent node is non-schedulable and the current node is copyable, and + // any of parent instructions are used outside several basic blocks or in + // bin-op node - cancel scheduling, it may cause wrong def-use deps in + // analysis, leading to a crash. + // Non-scheduled nodes may not have related ScheduleData model, which may lead + // to a skipped dep analysis. + if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && + EI.UserTE->doesNotNeedToSchedule() && + EI.UserTE->getOpcode() != Instruction::PHI && + any_of(EI.UserTE->Scalars, [](Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I || I->hasOneUser()) + return false; + for (User *U : I->users()) { + auto *UI = cast<Instruction>(U); + if (isa<BinaryOperator>(UI)) + return true; + } + return false; + })) + return std::nullopt; bool HasCopyables = S.areInstructionsWithCopyableElements(); if (((!HasCopyables && doesNotNeedToSchedule(VL)) || all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { @@ -22134,6 +22155,27 @@ bool BoUpSLP::collectValuesToDemote( {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(), VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()}); + if (E.isAltShuffle()) { + // Combining these opcodes may lead to incorrect analysis, skip for now. + auto IsDangerousOpcode = [](unsigned Opcode) { + switch (Opcode) { + case Instruction::Shl: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return true; + default: + break; + } + return false; + }; + if (IsDangerousOpcode(E.getAltOpcode())) + return FinalAnalysis(); + } + switch (E.getOpcode()) { // We can always demote truncations and extensions. Since truncations can diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 9c869dd..d354933 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -92,7 +92,7 @@ void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const { DGNode::print(OS, false); if (PrintDeps) { // Print memory preds. - static constexpr const unsigned Indent = 4; + static constexpr unsigned Indent = 4; for (auto *Pred : MemPreds) OS.indent(Indent) << "<-" << *Pred->getInstruction() << "\n"; } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 86dbd21..5534da9 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -25,14 +25,14 @@ static cl::opt<bool> "emit new instructions (*very* expensive).")); #endif // NDEBUG -static constexpr const unsigned long StopAtDisabled = +static constexpr unsigned long StopAtDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopAt("sbvec-stop-at", cl::init(StopAtDisabled), cl::Hidden, cl::desc("Vectorize if the invocation count is < than this. 0 " "disables vectorization.")); -static constexpr const unsigned long StopBundleDisabled = +static constexpr unsigned long StopBundleDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopBundle("sbvec-stop-bndl", cl::init(StopBundleDisabled), cl::Hidden, diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ed2f80b..2de6921 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -43,7 +43,7 @@ cl::opt<std::string> AllowFiles( "sbvec-allow-files", cl::init(".*"), cl::Hidden, cl::desc("Run the vectorizer only on file paths that match any in the " "list of comma-separated regex's.")); -static constexpr const char AllowFilesDelim = ','; +static constexpr char AllowFilesDelim = ','; SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") { if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1f10058..cfe1f1e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -939,7 +939,7 @@ class VPIRMetadata { SmallVector<std::pair<unsigned, MDNode *>> Metadata; public: - VPIRMetadata() {} + VPIRMetadata() = default; /// Adds metatadata that can be preserved from the original instruction /// \p I. @@ -950,12 +950,9 @@ public: VPIRMetadata(Instruction &I, LoopVersioning *LVer); /// Copy constructor for cloning. - VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {} + VPIRMetadata(const VPIRMetadata &Other) = default; - VPIRMetadata &operator=(const VPIRMetadata &Other) { - Metadata = Other.Metadata; - return *this; - } + VPIRMetadata &operator=(const VPIRMetadata &Other) = default; /// Add all metadata to \p I. void applyMetadata(Instruction &I) const; @@ -1107,14 +1104,14 @@ public: VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(), - const Twine &Name = ""); + const VPIRFlags &Flags, const VPIRMetadata &MD = {}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = ""); VP_CLASSOF_IMPL(VPDef::VPInstructionSC) VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); - auto *New = new VPInstruction(Opcode, Operands, *this, getDebugLoc(), Name); + auto *New = new VPInstruction(Opcode, operands(), *this, *this, + getDebugLoc(), Name); if (getUnderlyingValue()) New->setUnderlyingValue(getUnderlyingInstr()); return New; @@ -1196,7 +1193,14 @@ public: VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands, Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL, const Twine &Name = "") - : VPInstruction(Opcode, Operands, Flags, DL, Name), ResultTy(ResultTy) {} + : VPInstruction(Opcode, Operands, Flags, {}, DL, Name), + ResultTy(ResultTy) {} + + VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands, + Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags, + const VPIRMetadata &Metadata, const Twine &Name = "") + : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name), + ResultTy(ResultTy) {} static inline bool classof(const VPRecipeBase *R) { // VPInstructionWithType are VPInstructions with specific opcodes requiring @@ -1221,10 +1225,9 @@ public: } VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); auto *New = - new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this, - getDebugLoc(), getName()); + new VPInstructionWithType(getOpcode(), operands(), getResultType(), + *this, getDebugLoc(), getName()); New->setUnderlyingValue(getUnderlyingValue()); return New; } @@ -3206,6 +3209,9 @@ protected: : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I), Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); + assert(isa<VPVectorEndPointerRecipe>(getAddr()) || + !Reverse && + "Reversed acccess without VPVectorEndPointerRecipe address?"); } public: @@ -3977,7 +3983,7 @@ class VPIRBasicBlock : public VPBasicBlock { IRBB(IRBB) {} public: - ~VPIRBasicBlock() override {} + ~VPIRBasicBlock() override = default; static inline bool classof(const VPBlockBase *V) { return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC; @@ -4029,7 +4035,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase { IsReplicator(IsReplicator) {} public: - ~VPRegionBlock() override {} + ~VPRegionBlock() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPBlockBase *V) { @@ -4109,6 +4115,12 @@ public: const VPCanonicalIVPHIRecipe *getCanonicalIV() const { return const_cast<VPRegionBlock *>(this)->getCanonicalIV(); } + + /// Return the type of the canonical IV for loop regions. + Type *getCanonicalIVType() { return getCanonicalIV()->getScalarType(); } + const Type *getCanonicalIVType() const { + return getCanonicalIV()->getScalarType(); + } }; inline VPRegionBlock *VPRecipeBase::getRegion() { @@ -4387,15 +4399,25 @@ public: } /// Return a VPValue wrapping i1 true. - VPValue *getTrue() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getTrue(Ctx)); - } + VPValue *getTrue() { return getConstantInt(1, 1); } /// Return a VPValue wrapping i1 false. - VPValue *getFalse() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getFalse(Ctx)); + VPValue *getFalse() { return getConstantInt(1, 0); } + + /// Return a VPValue wrapping a ConstantInt with the given type and value. + VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) { + return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value. + VPValue *getConstantInt(unsigned BitWidth, uint64_t Val, + bool IsSigned = false) { + return getConstantInt(APInt(BitWidth, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given APInt value. + VPValue *getConstantInt(const APInt &Val) { + return getOrAddLiveIn(ConstantInt::get(getContext(), Val)); } /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 65688a3..1a66d20 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -612,8 +612,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan, if (!RequiresScalarEpilogueCheck) Cmp = Plan.getFalse(); else if (TailFolded) - Cmp = Plan.getOrAddLiveIn( - ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); + Cmp = Plan.getTrue(); else Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(), &Plan.getVectorTripCount(), LatchDL, "cmp.n"); @@ -712,8 +711,8 @@ void VPlanTransforms::addMinimumIterationCheck( // additional overflow check is required before entering the vector loop. // Get the maximum unsigned value for the type. - VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get( - TripCountTy, cast<IntegerType>(TripCountTy)->getMask())); + VPValue *MaxUIntTripCount = + Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask()); VPValue *DistanceToMax = Builder.createNaryOp( Instruction::Sub, {MaxUIntTripCount, TripCountVPV}, DebugLoc::getUnknown()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 2aaabd9..965426f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -350,13 +350,14 @@ struct VPCostContext { SmallPtrSet<Instruction *, 8> SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; ScalarEvolution &SE; + const Loop *L; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, TargetTransformInfo::TargetCostKind CostKind, - ScalarEvolution &SE) + ScalarEvolution &SE, const Loop *L) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind), SE(SE) {} + CostKind(CostKind), SE(SE), L(L) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b5b98c6..b57c448 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -313,7 +313,8 @@ private: // Check for recipes that do not have opcodes. if constexpr (std::is_same_v<RecipeTy, VPScalarIVStepsRecipe> || std::is_same_v<RecipeTy, VPCanonicalIVPHIRecipe> || - std::is_same_v<RecipeTy, VPDerivedIVRecipe>) + std::is_same_v<RecipeTy, VPDerivedIVRecipe> || + std::is_same_v<RecipeTy, VPVectorEndPointerRecipe>) return DefR; else return DefR && DefR->getOpcode() == Opcode; @@ -686,6 +687,64 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2}); } +template <typename Addr_t, typename Mask_t> struct Load_match { + Addr_t Addr; + Mask_t Mask; + + Load_match(Addr_t Addr, Mask_t Mask) : Addr(Addr), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Load = dyn_cast<VPWidenLoadRecipe>(V); + if (!Load || !Addr.match(Load->getAddr()) || !Load->isMasked() || + !Mask.match(Load->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked load. +template <typename Addr_t, typename Mask_t> +inline Load_match<Addr_t, Mask_t> m_MaskedLoad(const Addr_t &Addr, + const Mask_t &Mask) { + return Load_match<Addr_t, Mask_t>(Addr, Mask); +} + +template <typename Addr_t, typename Val_t, typename Mask_t> struct Store_match { + Addr_t Addr; + Val_t Val; + Mask_t Mask; + + Store_match(Addr_t Addr, Val_t Val, Mask_t Mask) + : Addr(Addr), Val(Val), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Store = dyn_cast<VPWidenStoreRecipe>(V); + if (!Store || !Addr.match(Store->getAddr()) || + !Val.match(Store->getStoredValue()) || !Store->isMasked() || + !Mask.match(Store->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked store. +template <typename Addr_t, typename Val_t, typename Mask_t> +inline Store_match<Addr_t, Val_t, Mask_t> +m_MaskedStore(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) { + return Store_match<Addr_t, Val_t, Mask_t>(Addr, Val, Mask); +} + +template <typename Op0_t, typename Op1_t> +using VectorEndPointerRecipe_match = + Recipe_match<std::tuple<Op0_t, Op1_t>, 0, + /*Commutative*/ false, VPVectorEndPointerRecipe>; + +template <typename Op0_t, typename Op1_t> +VectorEndPointerRecipe_match<Op0_t, Op1_t> m_VecEndPtr(const Op0_t &Op0, + const Op1_t &Op1) { + return VectorEndPointerRecipe_match<Op0_t, Op1_t>(Op0, Op1); +} + /// Match a call argument at a given argument index. template <typename Opnd_t> struct Argument_match { /// Call argument index to match. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9a63c80..1ee405a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -162,8 +162,12 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPPredInstPHISC: case VPVectorEndPointerSC: return false; - case VPInstructionSC: - return mayWriteToMemory(); + case VPInstructionSC: { + auto *VPI = cast<VPInstruction>(this); + return mayWriteToMemory() || + VPI->getOpcode() == VPInstruction::BranchOnCount || + VPI->getOpcode() == VPInstruction::BranchOnCond; + } case VPWidenCallSC: { Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); @@ -490,10 +494,10 @@ template class VPUnrollPartAccessor<3>; } VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL, - const Twine &Name) + const VPIRFlags &Flags, const VPIRMetadata &MD, + DebugLoc DL, const Twine &Name) : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), - VPIRMetadata(), Opcode(Opcode), Name(Name.str()) { + VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) { assert(flagsValidForOpcode(getOpcode()) && "Set flags not supported for the provided opcode"); assert((getNumOperandsForOpcode(Opcode) == -1u || @@ -1241,6 +1245,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case Instruction::Select: case Instruction::PHI: case VPInstruction::AnyOf: + case VPInstruction::BranchOnCond: + case VPInstruction::BranchOnCount: case VPInstruction::Broadcast: case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: @@ -2372,9 +2378,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const { return false; auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); - auto *CanIV = getRegion()->getCanonicalIV(); return StartC && StartC->isZero() && StepC && StepC->isOne() && - getScalarType() == CanIV->getScalarType(); + getScalarType() == getRegion()->getCanonicalIVType(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -3167,26 +3172,30 @@ bool VPReplicateRecipe::shouldPack() const { }); } -/// Returns true if \p Ptr is a pointer computation for which the legacy cost -/// model computes a SCEV expression when computing the address cost. -static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { +/// Returns a SCEV expression for \p Ptr if it is a pointer computation for +/// which the legacy cost model computes a SCEV expression when computing the +/// address cost. Computing SCEVs for VPValues is incomplete and returns +/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In +/// those cases we fall back to the legacy cost model. Otherwise return nullptr. +static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE, + const Loop *L) { auto *PtrR = Ptr->getDefiningRecipe(); if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) && cast<VPReplicateRecipe>(PtrR)->getOpcode() == Instruction::GetElementPtr) || isa<VPWidenGEPRecipe>(PtrR) || match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) - return false; + return nullptr; // We are looking for a GEP where all indices are either loop invariant or // inductions. for (VPValue *Opd : drop_begin(PtrR->operands())) { if (!Opd->isDefinedOutsideLoopRegions() && !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd)) - return false; + return nullptr; } - return true; + return vputils::getSCEVExprForVPValue(Ptr, SE, L); } /// Returns true if \p V is used as part of the address of another load or @@ -3354,9 +3363,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, bool IsLoad = UI->getOpcode() == Instruction::Load; const VPValue *PtrOp = getOperand(!IsLoad); - // TODO: Handle cases where we need to pass a SCEV to - // getAddressComputationCost. - if (shouldUseAddressAccessSCEV(PtrOp)) + const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L); + if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV)) break; Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); @@ -3374,7 +3382,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, InstructionCost ScalarCost = ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, - nullptr, Ctx.CostKind); + PtrSCEV, Ctx.CostKind); if (isSingleScalar()) return ScalarCost; diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.h b/llvm/lib/Transforms/Vectorize/VPlanSLP.h index 77ff36c..44972c68 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.h +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.h @@ -89,8 +89,7 @@ class VPlanSlp { /// Width of the widest combined bundle in bits. unsigned WidestBundleBits = 0; - using MultiNodeOpTy = - typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; + using MultiNodeOpTy = std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; // Input operand bundles for the current multi node. Each multi node operand // bundle contains values not matching the multi node's opcode. They will diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 4d98014..9d9bb14 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -151,7 +151,27 @@ static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) { static bool sinkScalarOperands(VPlan &Plan) { auto Iter = vp_depth_first_deep(Plan.getEntry()); + bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; + + auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo, + VPSingleDefRecipe *Candidate) { + // We only know how to duplicate VPReplicateRecipes and + // VPScalarIVStepsRecipes for now. + if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) + return false; + + if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || + Candidate->mayReadOrWriteMemory()) + return false; + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) + if (!ScalarVFOnly && RepR->isSingleScalar()) + return false; + + return true; + }; + // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; @@ -159,51 +179,37 @@ static bool sinkScalarOperands(VPlan &Plan) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) continue; - VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]); - if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) + VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); + if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; for (auto &Recipe : *VPBB) { - for (VPValue *Op : Recipe.operands()) + for (VPValue *Op : Recipe.operands()) { if (auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({VPBB, Def}); + if (IsValidSinkCandidate(VPBB, Def)) + WorkList.insert({VPBB, Def}); + } } } - bool ScalarVFOnly = Plan.hasScalarVFOnly(); // Try to sink each replicate or scalar IV steps recipe in the worklist. for (unsigned I = 0; I != WorkList.size(); ++I) { VPBasicBlock *SinkTo; VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - if (SinkCandidate->getParent() == SinkTo || - SinkCandidate->mayHaveSideEffects() || - SinkCandidate->mayReadOrWriteMemory()) - continue; - if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) { - if (!ScalarVFOnly && RepR->isSingleScalar()) - continue; - } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate)) - continue; - bool NeedsDuplicating = false; // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must be uniform-after-vectorization ( - // i.e., only first lane is used) . In the latter case, we need to duplicate - // SinkCandidate. - auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, - SinkCandidate](VPUser *U) { - auto *UI = cast<VPRecipeBase>(U); - if (UI->getParent() == SinkTo) - return true; - NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate); - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. - return NeedsDuplicating && - isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate); - }; - if (!all_of(SinkCandidate->users(), CanSinkWithUser)) + // or all users outside of SinkTo must have only their first lane used. In + // the latter case, we need to duplicate SinkCandidate. + auto UsersOutsideSinkTo = + make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { + return cast<VPRecipeBase>(U)->getParent() != SinkTo; + }); + if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) { + return !U->onlyFirstLaneUsed(SinkCandidate); + })) continue; + bool NeedsDuplicating = !UsersOutsideSinkTo.empty(); if (NeedsDuplicating) { if (ScalarVFOnly) @@ -230,7 +236,8 @@ static bool sinkScalarOperands(VPlan &Plan) { for (VPValue *Op : SinkCandidate->operands()) if (auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({SinkTo, Def}); + if (IsValidSinkCandidate(SinkTo, Def)) + WorkList.insert({SinkTo, Def}); Changed = true; } return Changed; @@ -699,8 +706,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { continue; const InductionDescriptor &ID = PtrIV->getInductionDescriptor(); - VPValue *StartV = - Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0)); + VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0); VPValue *StepV = PtrIV->getOperand(1); VPScalarIVStepsRecipe *Steps = createScalarIVSteps( Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr, @@ -820,7 +826,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // Calculate the final index. VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); auto *CanonicalIV = LoopRegion->getCanonicalIV(); - Type *CanonicalIVType = CanonicalIV->getScalarType(); + Type *CanonicalIVType = LoopRegion->getCanonicalIVType(); VPBuilder B(cast<VPBasicBlock>(PredVPBB)); DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc(); @@ -836,7 +842,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // changed it means the exit is using the incremented value, so we need to // add the step. if (Incoming != WideIV) { - VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1)); + VPValue *One = Plan.getConstantInt(CanonicalIVType, 1); EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL); } @@ -882,7 +888,7 @@ static VPValue *optimizeLatchExitInductionUser( return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape"); if (ScalarTy->isPointerTy()) { Type *StepTy = TypeInfo.inferScalarType(Step); - auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0)); + auto *Zero = Plan.getConstantInt(StepTy, 0); return B.createPtrAdd(EndValue, B.createNaryOp(Instruction::Sub, {Zero, Step}), DebugLoc::getUnknown(), "ind.escape"); @@ -1057,13 +1063,9 @@ static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R, return nullptr; } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { - VPlan *Plan = R.getParent()->getPlan(); - - auto *Def = dyn_cast<VPSingleDefRecipe>(&R); - if (!Def) - return; +/// Try to simplify VPSingleDefRecipe \p Def. +static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { + VPlan *Plan = Def->getParent()->getPlan(); // Simplification of live-in IR values for SingleDef recipes using // InstSimplifyFolder. @@ -1073,7 +1075,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(V); // Fold PredPHI LiveIn -> LiveIn. - if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) { + if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) { VPValue *Op = PredPHI->getOperand(0); if (Op->isLiveIn()) PredPHI->replaceAllUsesWith(Op); @@ -1092,12 +1094,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) + unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue())) ? Instruction::SExt : Instruction::ZExt; auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A, TruncTy); - if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { + if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) { // UnderlyingExt has distinct return type, used to retain legacy cost. Ext->setUnderlyingValue(UnderlyingExt); } @@ -1160,7 +1162,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { Builder.createLogicalAnd(X, Builder.createOr(Y, Z))); // x && !x -> 0 - if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) + if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) return Def->replaceAllUsesWith(Plan->getFalse()); if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) @@ -1188,8 +1190,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(A); if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt()))) - return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) - : R.getOperand(0)); + return Def->replaceAllUsesWith( + Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0)); if (match(Def, m_Not(m_VPValue(A)))) { if (match(A, m_Not(m_VPValue(A)))) @@ -1218,8 +1220,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // If Cmp doesn't have a debug location, use the one from the negation, // to preserve the location. - if (!Cmp->getDebugLoc() && R.getDebugLoc()) - Cmp->setDebugLoc(R.getDebugLoc()); + if (!Cmp->getDebugLoc() && Def->getDebugLoc()) + Cmp->setDebugLoc(Def->getDebugLoc()); } } } @@ -1245,7 +1247,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A), m_VPValue(X), m_VPValue())) && match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) && - TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) { + TypeInfo.inferScalarType(Def)->isIntegerTy(1)) { Def->setOperand(1, Def->getOperand(0)); Def->setOperand(0, Y); return; @@ -1253,35 +1255,41 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) { if (Phi->getOperand(0) == Phi->getOperand(1)) - Def->replaceAllUsesWith(Phi->getOperand(0)); + Phi->replaceAllUsesWith(Phi->getOperand(0)); return; } // Look through ExtractLastElement (BuildVector ....). - if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()), - m_ExtractLastLanePerPart(m_BuildVector())))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_CombineOr(m_ExtractLastElement(m_BuildVector()), + m_ExtractLastLanePerPart(m_BuildVector())))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith( BuildVector->getOperand(BuildVector->getNumOperands() - 1)); return; } // Look through ExtractPenultimateElement (BuildVector ....). - if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( - m_BuildVector()))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( + m_BuildVector()))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith( BuildVector->getOperand(BuildVector->getNumOperands() - 2)); return; } uint64_t Idx; - if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith(BuildVector->getOperand(Idx)); return; } + if (match(Def, m_BuildVector()) && all_equal(Def->operands())) { + Def->replaceAllUsesWith( + Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0))); + return; + } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { if (Phi->getNumOperands() == 1) Phi->replaceAllUsesWith(Phi->getOperand(0)); @@ -1298,7 +1306,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { isa<VPPhi>(X)) { auto *Phi = cast<VPPhi>(X); if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) && - Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) { + Phi->getNumUsers() == 1 && (*Phi->user_begin() == Def)) { Phi->setOperand(0, Y); Def->replaceAllUsesWith(Phi); return; @@ -1306,7 +1314,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // VPVectorPointer for part 0 can be replaced by their start pointer. - if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) { + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) { if (VecPtr->isFirstPart()) { VecPtr->replaceAllUsesWith(VecPtr->getOperand(0)); return; @@ -1361,9 +1369,9 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) { Plan.getEntry()); VPTypeAnalysis TypeInfo(Plan); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - simplifyRecipe(R, TypeInfo); - } + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R)) + simplifyRecipe(Def, TypeInfo); } } @@ -1419,6 +1427,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { true /*IsSingleScalar*/); Clone->insertBefore(RepOrWidenR); RepOrWidenR->replaceAllUsesWith(Clone); + if (isDeadRecipe(*RepOrWidenR)) + RepOrWidenR->eraseFromParent(); } } } @@ -1572,9 +1582,9 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, continue; // Update IV operands and comparison bound to use new narrower type. - auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0)); + auto *NewStart = Plan.getConstantInt(NewIVTy, 0); WideIV->setStartValue(NewStart); - auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1)); + auto *NewStep = Plan.getConstantInt(NewIVTy, 1); WideIV->setStepValue(NewStep); auto *NewBTC = new VPWidenCastRecipe( @@ -1693,8 +1703,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, // When using wide lane masks, the return type of the get.active.lane.mask // intrinsic is VF x UF (last operand). - VPValue *ALMMultiplier = - Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); + VPValue *ALMMultiplier = Plan.getConstantInt(64, UF); EntryALM->setOperand(2, ALMMultiplier); LoopALM->setOperand(2, ALMMultiplier); @@ -2400,8 +2409,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. - VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1)); + VPValue *ALMMultiplier = + Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1); auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC, ALMMultiplier}, DL, "active.lane.mask.entry"); @@ -2501,7 +2510,7 @@ void VPlanTransforms::addActiveLaneMask( } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1)); + ConstantInt::get(LoopRegion->getCanonicalIVType(), 1)); LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, @@ -2515,90 +2524,102 @@ void VPlanTransforms::addActiveLaneMask( HeaderMask->eraseFromParent(); } +template <typename Op0_t, typename Op1_t> struct RemoveMask_match { + Op0_t In; + Op1_t &Out; + + RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {} + + template <typename OpTy> bool match(OpTy *V) const { + if (m_Specific(In).match(V)) { + Out = nullptr; + return true; + } + if (m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V)) + return true; + return false; + } +}; + +/// Match a specific mask \p In, or a combination of it (logical-and In, Out). +/// Returns the remaining part \p Out if so, or nullptr otherwise. +template <typename Op0_t, typename Op1_t> +static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In, + Op1_t &Out) { + return RemoveMask_match<Op0_t, Op1_t>(In, Out); +} + /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based /// recipe could be created. /// \p HeaderMask Header Mask. /// \p CurRecipe Recipe to be transform. /// \p TypeInfo VPlan-based type analysis. -/// \p AllOneMask The vector mask parameter of vector-predication intrinsics. /// \p EVL The explicit vector length parameter of vector-predication /// intrinsics. static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, - VPTypeAnalysis &TypeInfo, - VPValue &AllOneMask, VPValue &EVL) { - // FIXME: Don't transform recipes to EVL recipes if they're not masked by the - // header mask. - auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { - assert(OrigMask && "Unmasked recipe when folding tail"); - // HeaderMask will be handled using EVL. - VPValue *Mask; - if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) - return Mask; - return HeaderMask == OrigMask ? nullptr : OrigMask; - }; + VPTypeAnalysis &TypeInfo, VPValue &EVL) { + VPlan *Plan = CurRecipe.getParent()->getPlan(); + VPValue *Addr, *Mask, *EndPtr; /// Adjust any end pointers so that they point to the end of EVL lanes not VF. - auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * { - auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr); - if (!EndPtr) - return Addr; - assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() && - "VPVectorEndPointerRecipe with non-VF VF operand?"); - assert( - all_of(EndPtr->users(), - [](VPUser *U) { - return cast<VPWidenMemoryRecipe>(U)->isReverse(); - }) && - "VPVectorEndPointRecipe not used by reversed widened memory recipe?"); - VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone(); - EVLAddr->insertBefore(&CurRecipe); - EVLAddr->setOperand(1, &EVL); - return EVLAddr; + auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) { + auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone(); + EVLEndPtr->insertBefore(&CurRecipe); + EVLEndPtr->setOperand(1, &EVL); + return EVLEndPtr; }; - return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) - .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { - VPValue *NewMask = GetNewMask(L->getMask()); - VPValue *NewAddr = GetNewAddr(L->getAddr()); - return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask); - }) - .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { - VPValue *NewMask = GetNewMask(S->getMask()); - VPValue *NewAddr = GetNewAddr(S->getAddr()); - return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); - }) - .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) { - VPValue *NewMask = GetNewMask(IR->getMask()); - return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); - }) - .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { - VPValue *NewMask = GetNewMask(Red->getCondOp()); - return new VPReductionEVLRecipe(*Red, EVL, NewMask); - }) - .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * { - VPValue *LHS, *RHS; - // Transform select with a header mask condition - // select(header_mask, LHS, RHS) - // into vector predication merge. - // vp.merge(all-true, LHS, RHS, EVL) - if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), - m_VPValue(RHS)))) - return nullptr; - // Use all true as the condition because this transformation is - // limited to selects whose condition is a header mask. - return new VPWidenIntrinsicRecipe( - Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), VPI->getDebugLoc()); - }) - .Default([&](VPRecipeBase *R) { return nullptr; }); + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe)) + if (Rdx->isConditional() && + match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask))) + return new VPReductionEVLRecipe(*Rdx, EVL, Mask); + + if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe)) + if (Interleave->getMask() && + match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask))) + return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask); + + VPValue *LHS, *RHS; + if (match(&CurRecipe, + m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) + return new VPWidenIntrinsicRecipe( + Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL}, + TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + + return nullptr; } /// Replace recipes with their EVL variants. static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPTypeAnalysis TypeInfo(Plan); - VPValue *AllOneMask = Plan.getTrue(); VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); @@ -2658,7 +2679,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1)); VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, - {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, + {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL}, TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc()); VPSplice->insertBefore(&R); R.getVPSingleValue()->replaceAllUsesWith(VPSplice); @@ -2692,7 +2713,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { for (VPUser *U : collectUsersRecursively(EVLMask)) { auto *CurRecipe = cast<VPRecipeBase>(U); VPRecipeBase *EVLRecipe = - optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); + optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL); if (!EVLRecipe) continue; @@ -2773,7 +2794,7 @@ void VPlanTransforms::addExplicitVectorLength( VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); auto *CanonicalIVPHI = LoopRegion->getCanonicalIV(); - auto *CanIVTy = CanonicalIVPHI->getScalarType(); + auto *CanIVTy = LoopRegion->getCanonicalIVType(); VPValue *StartV = CanonicalIVPHI->getStartValue(); // Create the ExplicitVectorLengthPhi recipe in the main loop. @@ -2788,8 +2809,7 @@ void VPlanTransforms::addExplicitVectorLength( if (MaxSafeElements) { // Support for MaxSafeDist for correct loop emission. - VPValue *AVLSafe = - Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements)); + VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements); VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(), "safe_avl"); @@ -2902,9 +2922,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext); VPBuilder Builder(LatchExitingBr); - VPValue *Cmp = - Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, - Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy))); + VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, + Plan.getConstantInt(AVLTy, 0)); Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp); LatchExitingBr->eraseFromParent(); } @@ -2928,8 +2947,7 @@ void VPlanTransforms::replaceSymbolicStrides( // Only handle constant strides for now. continue; - auto *CI = - Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst)); + auto *CI = Plan.getConstantInt(*StrideConst); if (VPValue *StrideVPV = Plan.getLiveIn(StrideV)) StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); @@ -2944,7 +2962,7 @@ void VPlanTransforms::replaceSymbolicStrides( unsigned BW = U->getType()->getScalarSizeInBits(); APInt C = isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW); - VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C)); + VPValue *CI = Plan.getConstantInt(C); StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); } RewriteMap[StrideV] = PSE.getSCEV(StrideV); @@ -3123,8 +3141,7 @@ void VPlanTransforms::createInterleaveGroups( DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) * IG->getIndex(IRInsertPos), /*IsSigned=*/true); - VPValue *OffsetVPV = - Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset)); + VPValue *OffsetVPV = Plan.getConstantInt(-Offset); VPBuilder B(InsertPos); Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW); } @@ -3865,8 +3882,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, VPBuilder Builder(VectorPH, VectorPH->begin()); auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); auto *TCMO = Builder.createNaryOp( - Instruction::Sub, - {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))}, + Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)}, DebugLoc::getCompilerGenerated(), "trip.count.minus.1"); BTC->replaceAllUsesWith(TCMO); } @@ -3991,9 +4007,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (TailByMasking) { TC = Builder.createNaryOp( Instruction::Add, - {TC, Builder.createNaryOp( - Instruction::Sub, - {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})}, + {TC, Builder.createNaryOp(Instruction::Sub, + {Step, Plan.getConstantInt(TCTy, 1)})}, DebugLoc::getCompilerGenerated(), "n.rnd.up"); } @@ -4015,8 +4030,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (RequiresScalarEpilogue) { assert(!TailByMasking && "requiring scalar epilogue is not supported with fail folding"); - VPValue *IsZero = Builder.createICmp( - CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0))); + VPValue *IsZero = + Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0)); R = Builder.createSelect(IsZero, Step, R); } @@ -4054,7 +4069,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, } VF.replaceAllUsesWith(RuntimeVF); - VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); + VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF()); VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF}); VFxUF.replaceAllUsesWith(MulByUF); } @@ -4174,7 +4189,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VFMinVal = VF.getKnownMinValue(); SmallVector<VPInterleaveRecipe *> StoreGroups; for (auto &R : *VectorLoop->getEntryBasicBlock()) { - if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount())) + if (isa<VPCanonicalIVPHIRecipe>(&R)) continue; if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) && @@ -4334,17 +4349,17 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, VPBuilder PHBuilder(Plan.getVectorPreheader()); VPValue *UF = Plan.getOrAddLiveIn( - ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); + ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF())); if (VF.isScalable()) { VPValue *VScale = PHBuilder.createElementCount( - CanIV->getScalarType(), ElementCount::getScalable(1)); + VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1)); VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); Inc->setOperand(1, VScaleUF); Plan.getVF().replaceAllUsesWith(VScale); } else { Inc->setOperand(1, UF); Plan.getVF().replaceAllUsesWith( - Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + Plan.getConstantInt(CanIV->getScalarType(), 1)); } removeDeadRecipes(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index cfd1a74..d6a0028 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -68,10 +68,9 @@ class UnrollState { void unrollWidenInductionByUF(VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi); - VPValue *getConstantVPV(unsigned Part) { - Type *CanIVIntTy = - Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType(); - return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part)); + VPValue *getConstantInt(unsigned Part) { + Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType(); + return Plan.getConstantInt(CanIVIntTy, Part); } public: @@ -138,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) { remapOperands(&PartIR, Part); if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) { - ScalarIVSteps->addOperand(getConstantVPV(Part)); + ScalarIVSteps->addOperand(getConstantInt(Part)); } addRecipeForPart(&Part0R, &PartIR, Part); @@ -250,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, for (unsigned Part = 1; Part != UF; ++Part) VPV2Parts[VPI][Part - 1] = StartV; } - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); } else { assert(isa<VPActiveLaneMaskPHIRecipe>(R) && "unexpected header phi recipe not needing unrolled part"); @@ -319,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) || match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R)) Copy->setOperand(0, R.getOperand(0)); @@ -475,8 +474,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, if (LaneDefs != Def2LaneDefs.end()) return LaneDefs->second[Lane.getKnownLane()]; - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); } @@ -510,8 +508,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane())); continue; } - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); NewOps.push_back(Ext); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4db92e7..c6380d3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -32,22 +32,17 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) { } VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { - VPValue *Expanded = nullptr; if (auto *E = dyn_cast<SCEVConstant>(Expr)) - Expanded = Plan.getOrAddLiveIn(E->getValue()); - else { - auto *U = dyn_cast<SCEVUnknown>(Expr); - // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction - // value. Otherwise the value may be defined in a loop and using it directly - // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA - // form. - if (U && !isa<Instruction>(U->getValue())) { - Expanded = Plan.getOrAddLiveIn(U->getValue()); - } else { - Expanded = new VPExpandSCEVRecipe(Expr); - Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe()); - } - } + return Plan.getOrAddLiveIn(E->getValue()); + // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction + // value. Otherwise the value may be defined in a loop and using it directly + // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA + // form. + auto *U = dyn_cast<SCEVUnknown>(Expr); + if (U && !isa<Instruction>(U->getValue())) + return Plan.getOrAddLiveIn(U->getValue()); + auto *Expanded = new VPExpandSCEVRecipe(Expr); + Plan.getEntry()->appendRecipe(Expanded); return Expanded; } @@ -75,7 +70,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { B == Plan.getBackedgeTakenCount(); } -const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { +const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V, + ScalarEvolution &SE, const Loop *L) { if (V->isLiveIn()) { if (Value *LiveIn = V->getLiveInIRValue()) return SE.getSCEV(LiveIn); @@ -86,6 +82,53 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe()) .Case<VPExpandSCEVRecipe>( [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); }) + .Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) { + if (!L) + return SE.getCouldNotCompute(); + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L, + SCEV::FlagAnyWrap); + }) + .Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) { + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L); + const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L); + if (any_of(ArrayRef({Start, IV, Scale}), IsaPred<SCEVCouldNotCompute>)) + return SE.getCouldNotCompute(); + + return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()), + SE.getMulExpr(IV, SE.getTruncateOrSignExtend( + Scale, IV->getType()))); + }) + .Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) { + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L); + if (isa<SCEVCouldNotCompute>(IV) || isa<SCEVCouldNotCompute>(Step) || + !Step->isOne()) + return SE.getCouldNotCompute(); + return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()), + Step); + }) + .Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) { + if (R->getOpcode() != Instruction::GetElementPtr) + return SE.getCouldNotCompute(); + + const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L); + if (isa<SCEVCouldNotCompute>(Base)) + return SE.getCouldNotCompute(); + + SmallVector<const SCEV *> IndexExprs; + for (VPValue *Index : drop_begin(R->operands())) { + const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L); + if (isa<SCEVCouldNotCompute>(IndexExpr)) + return SE.getCouldNotCompute(); + IndexExprs.push_back(IndexExpr); + } + + Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr()) + ->getSourceElementType(); + return SE.getGEPExpr(Base, IndexExprs, SrcElementTy); + }) .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 37cd413..c21a0e7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr); /// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no /// SCEV expression could be constructed. -const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE); +const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, + const Loop *L = nullptr); /// Returns true if \p VPV is a single scalar, either because it produces the /// same value for all lanes or only has its first lane used. |
