Diffstat (limited to 'llvm/lib/Transforms')
26 files changed, 775 insertions, 282 deletions
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 5ba2167..cc53ec2 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1957,8 +1957,12 @@ Value *DataFlowSanitizer::getShadowAddress(Value *Addr,
 Value *DataFlowSanitizer::getShadowAddress(Value *Addr,
                                            BasicBlock::iterator Pos) {
   IRBuilder<> IRB(Pos->getParent(), Pos);
-  Value *ShadowOffset = getShadowOffset(Addr, IRB);
-  return getShadowAddress(Addr, Pos, ShadowOffset);
+  Value *ShadowAddr = getShadowOffset(Addr, IRB);
+  uint64_t ShadowBase = MapParams->ShadowBase;
+  if (ShadowBase != 0)
+    ShadowAddr =
+        IRB.CreateAdd(ShadowAddr, ConstantInt::get(IntptrTy, ShadowBase));
+  return getShadowAddress(Addr, Pos, ShadowAddr);
 }
 
 Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
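The new ShadowBase handling is easiest to see with the mapping restated over plain integers. A minimal sketch, not dfsan's real parameters: kAndMask and kShadowBase are invented stand-ins for the platform's MapParams, and the real getShadowOffset() applies platform-specific masking.

#include <cstdint>

// Invented stand-ins for dfsan's per-platform MapParams.
constexpr uint64_t kAndMask = 0x0fffffffffffULL;
constexpr uint64_t kShadowBase = 0x100000000000ULL;

// Mirrors the lowered IR above: compute the shadow offset, then add a
// non-zero ShadowBase, as the patched getShadowAddress() now emits.
uint64_t shadowAddress(uint64_t AppAddr) {
  uint64_t ShadowOffset = AppAddr & kAndMask; // stands in for getShadowOffset()
  if (kShadowBase != 0)                       // the new guard
    ShadowOffset += kShadowBase;              // the new IRB.CreateAdd(...)
  return ShadowOffset;
}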
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 7795cce..b5548d4 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -69,14 +69,6 @@ namespace llvm {
 // Command line option to enable vtable value profiling. Defined in
 // ProfileData/InstrProf.cpp: -enable-vtable-value-profiling=
 extern cl::opt<bool> EnableVTableValueProfiling;
-// TODO: Remove -debug-info-correlate in next LLVM release, in favor of
-// -profile-correlate=debug-info.
-cl::opt<bool> DebugInfoCorrelate(
-    "debug-info-correlate",
-    cl::desc("Use debug info to correlate profiles. (Deprecated, use "
-             "-profile-correlate=debug-info)"),
-    cl::init(false));
-
 LLVM_ABI cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate(
     "profile-correlate",
     cl::desc("Use debug info or binary file to correlate profiles."),
@@ -1047,7 +1039,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
   // in lightweight mode. We need to move the value profile pointer to the
   // Counter struct to get this working.
   assert(
-      !DebugInfoCorrelate && ProfileCorrelate == InstrProfCorrelator::NONE &&
+      ProfileCorrelate == InstrProfCorrelator::NONE &&
       "Value profiling is not yet supported with lightweight instrumentation");
   GlobalVariable *Name = Ind->getName();
   auto It = ProfileDataMap.find(Name);
@@ -1504,7 +1496,7 @@ static inline Constant *getVTableAddrForProfData(GlobalVariable *GV) {
 }
 
 void InstrLowerer::getOrCreateVTableProfData(GlobalVariable *GV) {
-  assert(!DebugInfoCorrelate &&
+  assert(ProfileCorrelate != InstrProfCorrelator::DEBUG_INFO &&
          "Value profiling is not supported with lightweight instrumentation");
   if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
     return;
@@ -1584,8 +1576,7 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc,
   // Use internal rather than private linkage so the counter variable shows up
   // in the symbol table when using debug info for correlation.
-  if ((DebugInfoCorrelate ||
-       ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) &&
+  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO &&
       TT.isOSBinFormatMachO() && Linkage == GlobalValue::PrivateLinkage)
     Linkage = GlobalValue::InternalLinkage;
 
@@ -1691,8 +1682,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
   auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts);
   PD.RegionCounters = CounterPtr;
 
-  if (DebugInfoCorrelate ||
-      ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) {
+  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) {
     LLVMContext &Ctx = M.getContext();
     Function *Fn = Inc->getParent()->getParent();
     if (auto *SP = Fn->getSubprogram()) {
@@ -1737,7 +1727,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
 void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   // When debug information is correlated to profile data, a data variable
   // is not needed.
-  if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
+  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
     return;
 
   GlobalVariable *NamePtr = Inc->getName();
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index a6ec6c1..b72d41a 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) {
   return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column);
 }
 
+static AllocationType getAllocType(const AllocationInfo *AllocInfo) {
+  return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
+                      AllocInfo->Info.getAllocCount(),
+                      AllocInfo->Info.getTotalLifetime());
+}
+
 static AllocationType addCallStack(CallStackTrie &AllocTrie,
                                    const AllocationInfo *AllocInfo,
                                    uint64_t FullStackId) {
   SmallVector<uint64_t> StackIds;
   for (const auto &StackFrame : AllocInfo->CallStack)
     StackIds.push_back(computeStackId(StackFrame));
-  auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
-                                AllocInfo->Info.getAllocCount(),
-                                AllocInfo->Info.getTotalLifetime());
+  auto AllocType = getAllocType(AllocInfo);
   std::vector<ContextTotalSize> ContextSizeInfo;
   if (recordContextSizeInfoForAnalysis()) {
     auto TotalSize = AllocInfo->Info.getTotalSize();
@@ -216,7 +220,6 @@ static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
   }
   LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
                     << Reason << ".\n");
-  return;
 }
 
 struct AllocMatchInfo {
@@ -406,22 +409,39 @@ handleAllocSite(Instruction &I, CallBase *CI,
                 const std::set<const AllocationInfo *> &AllocInfoSet,
                 std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
                     &FullStackIdToAllocMatchInfo) {
+  // TODO: Remove this once the profile creation logic deduplicates contexts
+  // that are the same other than the IsInlineFrame bool. Until then, keep the
+  // largest.
+  DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo;
+  for (auto *AllocInfo : AllocInfoSet) {
+    auto FullStackId = computeFullStackId(AllocInfo->CallStack);
+    auto [It, Inserted] =
+        UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo});
+    // If inserted entry, done.
+    if (Inserted)
+      continue;
+    // Keep the larger one, or the noncold one if they are the same size.
+    auto CurSize = It->second->Info.getTotalSize();
+    auto NewSize = AllocInfo->Info.getTotalSize();
+    if ((CurSize > NewSize) ||
+        (CurSize == NewSize &&
+         getAllocType(AllocInfo) != AllocationType::NotCold))
+      continue;
+    It->second = AllocInfo;
+  }
   // We may match this instruction's location list to multiple MIB
   // contexts. Add them to a Trie specialized for trimming the contexts to
   // the minimal needed to disambiguate contexts with unique behavior.
   CallStackTrie AllocTrie(&ORE, MaxColdSize);
   uint64_t TotalSize = 0;
   uint64_t TotalColdSize = 0;
-  for (auto *AllocInfo : AllocInfoSet) {
+  for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) {
     // Check the full inlined call stack against this one.
     // If we found and thus matched all frames on the call, include
     // this MIB.
     if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
                                            InlinedCallStack)) {
       NumOfMemProfMatchedAllocContexts++;
-      uint64_t FullStackId = 0;
-      if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis())
-        FullStackId = computeFullStackId(AllocInfo->CallStack);
       auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
       TotalSize += AllocInfo->Info.getTotalSize();
       if (AllocType == AllocationType::Cold)
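The de-duplication step above is self-contained enough to restate with standard containers. A sketch under invented names (AllocInfoLite, dedupContexts); the pass itself works on memprof's AllocationInfo and a DenseMap:

#include <cstdint>
#include <unordered_map>
#include <vector>

struct AllocInfoLite {
  uint64_t FullStackId; // hash of the full call stack, IsInlineFrame excluded
  uint64_t TotalSize;
  bool IsCold;
};

// Keep one context per FullStackId: the largest, preferring the non-cold one
// when sizes tie, matching the insert/compare logic in the hunk above.
std::unordered_map<uint64_t, AllocInfoLite>
dedupContexts(const std::vector<AllocInfoLite> &Contexts) {
  std::unordered_map<uint64_t, AllocInfoLite> Unique;
  for (const AllocInfoLite &AI : Contexts) {
    auto [It, Inserted] = Unique.insert({AI.FullStackId, AI});
    if (Inserted)
      continue;
    AllocInfoLite &Cur = It->second;
    if (Cur.TotalSize > AI.TotalSize ||
        (Cur.TotalSize == AI.TotalSize && AI.IsCold))
      continue; // the existing entry wins
    Cur = AI;   // otherwise the new context replaces it
  }
  return Unique;
}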
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 71736cf..af53fa0 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -456,7 +456,7 @@ createIRLevelProfileFlagVar(Module &M,
     ProfileVersion |= VARIANT_MASK_INSTR_ENTRY;
   if (PGOInstrumentLoopEntries)
     ProfileVersion |= VARIANT_MASK_INSTR_LOOP_ENTRIES;
-  if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
+  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
     ProfileVersion |= VARIANT_MASK_DBG_CORRELATE;
   if (PGOFunctionEntryCoverage)
     ProfileVersion |=
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 66e45ec..e84ca81 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -122,16 +122,22 @@ static cl::opt<unsigned>
                   cl::desc("Maximum cost accepted for the transformation"),
                   cl::Hidden, cl::init(50));
 
-extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
-} // namespace llvm
-
 static cl::opt<double> MaxClonedRate(
     "dfa-max-cloned-rate",
     cl::desc(
         "Maximum cloned instructions rate accepted for the transformation"),
     cl::Hidden, cl::init(7.5));
 
+static cl::opt<unsigned>
+    MaxOuterUseBlocks("dfa-max-out-use-blocks",
+                      cl::desc("Maximum unduplicated blocks with outer uses "
+                               "accepted for the transformation"),
+                      cl::Hidden, cl::init(40));
+
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
 namespace {
 class SelectInstToUnfold {
   SelectInst *SI;
@@ -965,8 +971,16 @@ private:
     // SLPVectorizer.
     // TODO: Thread the switch partially before reaching the threshold.
     uint64_t NumOrigInst = 0;
-    for (auto *BB : DuplicateMap.keys())
+    uint64_t NumOuterUseBlock = 0;
+    for (auto *BB : DuplicateMap.keys()) {
       NumOrigInst += BB->sizeWithoutDebug();
+      // Only unduplicated blocks with a single predecessor require new phi
+      // nodes.
+      for (auto *Succ : successors(BB))
+        if (!DuplicateMap.count(Succ) && Succ->getSinglePredecessor())
+          NumOuterUseBlock++;
+    }
+
     if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) {
       LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many "
                            "instructions will be cloned\n");
@@ -977,6 +991,20 @@ private:
       return false;
     }
 
+    // Too many unduplicated blocks with outer uses may cause too many
+    // insertions of phi nodes for duplicated definitions. TODO: Drop this
+    // threshold if we come up with another way to reduce the number of
+    // inserted phi nodes.
+    if (NumOuterUseBlock > MaxOuterUseBlocks) {
+      LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many "
+                           "blocks with outer uses\n");
+      ORE->emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch)
+               << "Too many blocks with outer uses.";
+      });
+      return false;
+    }
+
     InstructionCost DuplicationCost = 0;
 
     unsigned JumpTableSize = 0;
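Both caps are plain cl::opt flags, so they can be tuned when reproducing or experimenting. A usage sketch, assuming the pass's usual opt pipeline name; the values shown are just the defaults from the definitions above:

opt -passes=dfa-jump-threading -dfa-max-cloned-rate=7.5 -dfa-max-out-use-blocks=40 -S in.ll -o out.ll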
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 7ebcc21..4ba4ba3 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -162,8 +162,6 @@ class IndVarSimplify {
                                  const SCEV *ExitCount,
                                  PHINode *IndVar, SCEVExpander &Rewriter);
 
-  bool sinkUnusedInvariants(Loop *L);
-
 public:
   IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                  const DataLayout &DL, TargetLibraryInfo *TLI,
@@ -1079,85 +1077,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
   return true;
 }
 
-//===----------------------------------------------------------------------===//
-//  sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
-//===----------------------------------------------------------------------===//
-
-/// If there's a single exit block, sink any loop-invariant values that
-/// were defined in the preheader but not used inside the loop into the
-/// exit block to reduce register pressure in the loop.
-bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
-  BasicBlock *ExitBlock = L->getExitBlock();
-  if (!ExitBlock) return false;
-
-  BasicBlock *Preheader = L->getLoopPreheader();
-  if (!Preheader) return false;
-
-  bool MadeAnyChanges = false;
-  for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
-
-    // Skip BB Terminator.
-    if (Preheader->getTerminator() == &I)
-      continue;
-
-    // New instructions were inserted at the end of the preheader.
-    if (isa<PHINode>(I))
-      break;
-
-    // Don't move instructions which might have side effects, since the side
-    // effects need to complete before instructions inside the loop.  Also don't
-    // move instructions which might read memory, since the loop may modify
-    // memory. Note that it's okay if the instruction might have undefined
-    // behavior: LoopSimplify guarantees that the preheader dominates the exit
-    // block.
-    if (I.mayHaveSideEffects() || I.mayReadFromMemory())
-      continue;
-
-    // Skip debug or pseudo instructions.
-    if (I.isDebugOrPseudoInst())
-      continue;
-
-    // Skip eh pad instructions.
-    if (I.isEHPad())
-      continue;
-
-    // Don't sink alloca: we never want to sink static alloca's out of the
-    // entry block, and correctly sinking dynamic alloca's requires
-    // checks for stacksave/stackrestore intrinsics.
-    // FIXME: Refactor this check somehow?
-    if (isa<AllocaInst>(&I))
-      continue;
-
-    // Determine if there is a use in or before the loop (direct or
-    // otherwise).
-    bool UsedInLoop = false;
-    for (Use &U : I.uses()) {
-      Instruction *User = cast<Instruction>(U.getUser());
-      BasicBlock *UseBB = User->getParent();
-      if (PHINode *P = dyn_cast<PHINode>(User)) {
-        unsigned i =
-          PHINode::getIncomingValueNumForOperand(U.getOperandNo());
-        UseBB = P->getIncomingBlock(i);
-      }
-      if (UseBB == Preheader || L->contains(UseBB)) {
-        UsedInLoop = true;
-        break;
-      }
-    }
-
-    // If there is, the def must remain in the preheader.
-    if (UsedInLoop)
-      continue;
-
-    // Otherwise, sink it to the exit block.
-    I.moveBefore(ExitBlock->getFirstInsertionPt());
-    SE->forgetValue(&I);
-    MadeAnyChanges = true;
-  }
-
-  return MadeAnyChanges;
-}
-
 static void replaceExitCond(BranchInst *BI, Value *NewCond,
                             SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
   auto *OldCond = BI->getCondition();
@@ -2065,10 +1984,6 @@ bool IndVarSimplify::run(Loop *L) {
 
   // The Rewriter may not be used from this point on.
 
-  // Loop-invariant instructions in the preheader that aren't used in the
-  // loop may be sunk below the loop to reduce register pressure.
-  Changed |= sinkUnusedInvariants(L);
-
   // rewriteFirstIterationLoopExitValues does not rely on the computation of
   // trip count and therefore can further simplify exit values in addition to
   // rewriteLoopExitValues.
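The deleted subpass is not lost: the LICM change below reintroduces the same preheader-to-exit sinking as sinkUnusedInvariantsFromPreheaderToExit, with the hand-rolled safety checks replaced by canSinkOrHoistInst and the move kept MemorySSA-consistent through moveInstructionBefore.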
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index b2c526b..d13b990 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -211,9 +211,15 @@ static Instruction *cloneInstructionInExitBlock(
 static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
                              MemorySSAUpdater &MSSAU);
 
-static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
-                                  ICFLoopSafetyInfo &SafetyInfo,
-                                  MemorySSAUpdater &MSSAU, ScalarEvolution *SE);
+static void moveInstructionBefore(
+    Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo,
+    MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
+    MemorySSA::InsertionPlace Point = MemorySSA::BeforeTerminator);
+
+static bool sinkUnusedInvariantsFromPreheaderToExit(
+    Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
+    MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
+    SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE);
 
 static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L,
                                 function_ref<void(Instruction *)> Fn);
@@ -471,6 +477,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,
                                     TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE)
             : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
                          MSSAU, &SafetyInfo, Flags, ORE);
+
+  // Sink preheader defs that are unused in the loop into the unique exit
+  // block to reduce register pressure.
+  Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU,
+                                                     SE, DT, Flags, ORE);
+
   Flags.setIsSink(false);
   if (Preheader)
     Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L,
@@ -1456,19 +1468,80 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
 
 static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,
                                   ICFLoopSafetyInfo &SafetyInfo,
-                                  MemorySSAUpdater &MSSAU,
-                                  ScalarEvolution *SE) {
+                                  MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
+                                  MemorySSA::InsertionPlace Point) {
   SafetyInfo.removeInstruction(&I);
   SafetyInfo.insertInstructionTo(&I, Dest->getParent());
   I.moveBefore(*Dest->getParent(), Dest);
   if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
           MSSAU.getMemorySSA()->getMemoryAccess(&I)))
-    MSSAU.moveToPlace(OldMemAcc, Dest->getParent(),
-                      MemorySSA::BeforeTerminator);
+    MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), Point);
   if (SE)
     SE->forgetBlockAndLoopDispositions(&I);
 }
 
+// If there's a single exit block, sink any loop-invariant values that were
+// defined in the preheader but not used inside the loop into the exit block
+// to reduce register pressure in the loop.
+static bool sinkUnusedInvariantsFromPreheaderToExit(
+    Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo,
+    MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT,
+    SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) {
+  BasicBlock *ExitBlock = L->getExitBlock();
+  if (!ExitBlock)
+    return false;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader)
+    return false;
+
+  bool MadeAnyChanges = false;
+
+  for (Instruction &I :
+       llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
+
+    // Skip terminator.
+    if (Preheader->getTerminator() == &I)
+      continue;
+
+    // New instructions were inserted at the end of the preheader.
+    if (isa<PHINode>(I))
+      break;
+
+    // Don't move instructions which might have side effects, since the side
+    // effects need to complete before instructions inside the loop. Note that
+    // it's okay if the instruction might have undefined behavior: LoopSimplify
+    // guarantees that the preheader dominates the exit block.
+    if (I.mayHaveSideEffects())
+      continue;
+
+    if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr))
+      continue;
+
+    // Determine if there is a use in or before the loop (direct or
+    // otherwise).
+    bool UsedInLoopOrPreheader = false;
+    for (Use &U : I.uses()) {
+      auto *UserI = cast<Instruction>(U.getUser());
+      BasicBlock *UseBB = UserI->getParent();
+      if (auto *PN = dyn_cast<PHINode>(UserI)) {
+        UseBB = PN->getIncomingBlock(U);
+      }
+      if (UseBB == Preheader || L->contains(UseBB)) {
+        UsedInLoopOrPreheader = true;
+        break;
+      }
+    }
+    if (UsedInLoopOrPreheader)
+      continue;
+
+    moveInstructionBefore(I, ExitBlock->getFirstInsertionPt(), *SafetyInfo,
+                          MSSAU, SE, MemorySSA::Beginning);
+    MadeAnyChanges = true;
+  }
+
+  return MadeAnyChanges;
+}
+
 static Instruction *sinkThroughTriviallyReplaceablePHI(
     PHINode *TPN, Instruction *I, LoopInfo *LI,
     SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
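For intuition about what the new subpass catches, here is a source-level sketch (illustrative only, not from a regression test). T is loop-invariant, defined before the loop, and only used after it, so sinking its computation into the exit block frees a register for the loop body:

// After the transform, the multiply conceptually lives in the loop's exit
// block instead of staying live across every iteration.
long sumPlusProduct(const long *A, long N, long X, long Y) {
  long T = X * Y; // loop-invariant and unused inside the loop
  long S = 0;
  for (long I = 0; I < N; ++I)
    S += A[I];
  return S + T; // the only use is past the loop, so T can sink to the exit
}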
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 1a279b6..001215a 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1318,6 +1318,11 @@ public:
   /// the loop, in which case some special-case heuristics may be used.
   bool AllFixupsOutsideLoop = true;
 
+  /// This records whether all of the fixups using this LSRUse are
+  /// unconditional within the loop, meaning they will be executed on every
+  /// path to the loop latch. This includes fixups before early exits.
+  bool AllFixupsUnconditional = true;
+
   /// RigidFormula is set to true to guarantee that this use will be associated
   /// with a single formula--the one that initially matched. Some SCEV
   /// expressions cannot be expanded. This allows LSR to consider the registers
@@ -1421,16 +1426,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
     if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
         TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
       const SCEV *Start;
-      const SCEVConstant *Step;
-      if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
+      const APInt *Step;
+      if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
         // If the step size matches the base offset, we could use pre-indexed
         // addressing.
-        if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
-             Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
-            ((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
-             SE->isLoopInvariant(Start, L)))
+        bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
+                           F.BaseOffset.isFixed() &&
+                           *Step == F.BaseOffset.getFixedValue();
+        bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
+                            !isa<SCEVConstant>(Start) &&
+                            SE->isLoopInvariant(Start, L);
+        // We can only pre or post index when the load/store is unconditional.
+        if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
           LoopCost = 0;
+      }
     }
 
     // If the loop counts down to zero and we'll be using a hardware loop then
     // the addrec will be combined into the hardware loop instruction.
     if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
@@ -1783,6 +1794,9 @@ void LSRUse::print(raw_ostream &OS) const {
   if (AllFixupsOutsideLoop)
     OS << ", all-fixups-outside-loop";
 
+  if (AllFixupsUnconditional)
+    OS << ", all-fixups-unconditional";
+
   if (WidestFixupType)
     OS << ", widest fixup type: " << *WidestFixupType;
 }
@@ -2213,6 +2227,7 @@ class LSRInstance {
   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   void CountRegisters(const Formula &F, size_t LUIdx);
   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+  bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
 
   void CollectLoopInvariantFixupsAndFormulae();
 
@@ -3607,6 +3622,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
     LF.PostIncLoops = TmpPostIncLoops;
     LF.Offset = Offset;
     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+    LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
 
     // Create SCEV as Formula for calculating baseline cost
     if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
@@ -3680,6 +3696,14 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
   return true;
 }
 
+/// Test whether this fixup will be executed each time the corresponding IV
+/// increment instruction is executed.
+bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
+  // If the fixup block dominates the IV increment block then there is no path
+  // through the loop to the increment that doesn't pass through the fixup.
+  return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
+}
+
 /// Check for other uses of loop-invariant values which we're tracking. These
 /// other uses will pin these values in registers, making them less profitable
 /// for elimination.
@@ -3803,6 +3827,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
         LF.OperandValToReplace = U;
         LF.Offset = Offset;
         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+        LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
         if (!LU.WidestFixupType ||
             SE.getTypeSizeInBits(LU.WidestFixupType) <
             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
@@ -4940,6 +4965,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
       LLVM_DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
 
       LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+      LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;
 
       // Transfer the fixups of LU to LUThatHas.
       for (LSRFixup &Fixup : LU.Fixups) {
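For intuition about IsFixupExecutedEachIncrement: in a loop like the sketch below (illustrative only), the store is guarded, so its block does not dominate the latch, AllFixupsUnconditional becomes false for that use, and the pre/post-indexed discount above is withheld:

void scalePositives(int *A, int N, int K) {
  for (int I = 0; I < N; ++I)
    if (A[I] > 0) // the store below is not on every path to the latch
      A[I] *= K;  // a conditional fixup
}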
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index e043d07..08be5df 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1534,8 +1534,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
   bool SrcNotDom = false;
 
   auto CaptureTrackingWithModRef =
-      [&](Instruction *AI,
-          function_ref<bool(Instruction *)> ModRefCallback) -> bool {
+      [&](Instruction *AI, function_ref<bool(Instruction *)> ModRefCallback,
+          bool &AddressCaptured) -> bool {
     SmallVector<Instruction *, 8> Worklist;
     Worklist.push_back(AI);
     unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
@@ -1559,8 +1559,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
         if (!Visited.insert(&U).second)
           continue;
         UseCaptureInfo CI = DetermineUseCaptureKind(U, AI);
-        if (capturesAnything(CI.UseCC))
+        if (capturesAnyProvenance(CI.UseCC))
           return false;
+        AddressCaptured |= capturesAddress(CI.UseCC);
 
         if (UI->mayReadOrWriteMemory()) {
           if (UI->isLifetimeStartOrEnd()) {
@@ -1627,7 +1628,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
     return true;
   };
 
-  if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback))
+  bool DestAddressCaptured = false;
+  if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback,
+                                 DestAddressCaptured))
     return false;
   // Bailout if Dest may have any ModRef before Store.
   if (!ReachabilityWorklist.empty() &&
@@ -1653,7 +1656,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
     return true;
   };
 
-  if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
+  bool SrcAddressCaptured = false;
+  if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback,
+                                 SrcAddressCaptured))
+    return false;
+
+  // If both the source and destination address are captured, the fact that
+  // they are no longer two separate allocations may be observed.
+  if (DestAddressCaptured && SrcAddressCaptured)
     return false;
 
   // We can do the transformation. First, move the SrcAlloca to the start of the
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 5af6c96..bb6c879 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -81,6 +81,7 @@ STATISTIC(
 STATISTIC(NumInvariantConditionsInjected,
           "Number of invariant conditions injected and unswitched");
 
+namespace llvm {
 static cl::opt<bool> EnableNonTrivialUnswitch(
     "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
     cl::desc("Forcibly enables non-trivial loop unswitching rather than "
@@ -131,11 +132,17 @@ static cl::opt<bool> InjectInvariantConditions(
 
 static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold(
     "simple-loop-unswitch-inject-invariant-condition-hotness-threshold",
-    cl::Hidden, cl::desc("Only try to inject loop invariant conditions and "
-                         "unswitch on them to eliminate branches that are "
-                         "not-taken 1/<this option> times or less."),
+    cl::Hidden,
+    cl::desc("Only try to inject loop invariant conditions and "
+             "unswitch on them to eliminate branches that are "
+             "not-taken 1/<this option> times or less."),
     cl::init(16));
 
+static cl::opt<bool> EstimateProfile("simple-loop-unswitch-estimate-profile",
+                                     cl::Hidden, cl::init(true));
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // namespace llvm
+
 AnalysisKey ShouldRunExtraSimpleLoopUnswitch::Key;
 namespace {
 struct CompareDesc {
@@ -268,13 +275,42 @@ static bool areLoopExitPHIsLoopInvariant(const Loop &L,
   llvm_unreachable("Basic blocks should never be empty!");
 }
 
-/// Copy a set of loop invariant values \p ToDuplicate and insert them at the
+/// Copy a set of loop invariant values \p Invariants and insert them at the
 /// end of \p BB and conditionally branch on the copied condition. We only
 /// branch on a single value.
+/// We attempt to estimate the profile of the resulting conditional branch from
+/// \p ComputeProfFrom, which is the original conditional branch we're
+/// unswitching.
+/// When \p Direction is true, the \p Invariants form a disjunction, and the
+/// branch conditioned on it exits the loop on the "true" case. When \p
+/// Direction is false, the \p Invariants form a conjunction and the branch
+/// exits on the "false" case.
 static void buildPartialUnswitchConditionalBranch(
     BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction,
     BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze,
-    const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) {
+    const Instruction *I, AssumptionCache *AC, const DominatorTree &DT,
+    const BranchInst &ComputeProfFrom) {
+
+  SmallVector<uint32_t> BranchWeights;
+  bool HasBranchWeights = EstimateProfile && !ProfcheckDisableMetadataFixes &&
+                          extractBranchWeights(ComputeProfFrom, BranchWeights);
+  // If Direction is true, that means we had a disjunction and that the "true"
+  // case exits. The probability of the disjunction of the subset of terms is
+  // at most as high as the original one. So, if the probability is higher
+  // than the one we'd assign in absence of a profile (i.e. 0.5), we will use
+  // 0.5, but if it's lower, we will use the original probability.
+  // Conversely, if Direction is false, that means we had a conjunction, and
+  // the probability of exiting is captured in the second branch weight. That
+  // probability is a disjunction (of the negation of the original terms). The
+  // same reasoning applies as above.
+  // Issue #165649: should we expect BFI to conserve, and use that to calculate
+  // the branch weights?
+  if (HasBranchWeights &&
+      static_cast<double>(BranchWeights[Direction ? 0 : 1]) /
+              static_cast<double>(sum_of(BranchWeights)) >
+          0.5)
+    HasBranchWeights = false;
+
   IRBuilder<> IRB(&BB);
   IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());
 
@@ -287,8 +323,14 @@ static void buildPartialUnswitchConditionalBranch(
   Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants)
                           : IRB.CreateAnd(FrozenInvariants);
-  IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
-                   Direction ? &NormalSucc : &UnswitchedSucc);
+  auto *BR = IRB.CreateCondBr(
+      Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+      Direction ? &NormalSucc : &UnswitchedSucc,
+      HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof)
+                       : nullptr);
+  if (!HasBranchWeights)
+    setExplicitlyUnknownBranchWeightsIfProfiled(
+        *BR, *BR->getParent()->getParent(), DEBUG_TYPE);
 }
 
 /// Copy a set of loop invariant values, and conditionally branch on them.
@@ -658,7 +700,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
              " condition!");
     buildPartialUnswitchConditionalBranch(
         *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH,
-        FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT);
+        FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT, BI);
   }
 
   // Update the dominator tree with the added edge.
@@ -2477,7 +2519,7 @@ static void unswitchNontrivialInvariants(
     else {
       buildPartialUnswitchConditionalBranch(
           *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH,
-          FreezeLoopUnswitchCond, BI, &AC, DT);
+          FreezeLoopUnswitchCond, BI, &AC, DT, *BI);
     }
 
     DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
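Restated with plain types, the gating rule above keeps the original weights only while the exit probability they imply stays at or below the 0.5 that would otherwise be assumed. A sketch with std::vector standing in for the MDNode plumbing:

#include <cstdint>
#include <numeric>
#include <optional>
#include <vector>

// Returns the weights to reuse for the unswitched branch, or nullopt when
// the pass should fall back to explicitly-unknown branch weights.
std::optional<std::vector<uint32_t>>
weightsForUnswitchedBranch(std::vector<uint32_t> BranchWeights,
                           bool Direction) {
  // Direction selects which weight is the exiting case, as in the hunk above.
  uint32_t ExitWeight = BranchWeights[Direction ? 0 : 1];
  uint64_t Sum =
      std::accumulate(BranchWeights.begin(), BranchWeights.end(), uint64_t{0});
  if (static_cast<double>(ExitWeight) / static_cast<double>(Sum) > 0.5)
    return std::nullopt; // implied exit probability too high; drop the profile
  return BranchWeights;  // the subset's probability can only be lower; reuse
}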
+/// \p LCI            cycle or loop information to update
+/// \p CallBrBlock    block containing the callbr instruction
+/// \p CallBrTarget   new target block of the callbr instruction
+/// \p Succ           original target block of the callbr instruction
+template <typename TI, typename T>
+static bool updateCycleLoopInfo(TI *LCI, BasicBlock *CallBrBlock,
+                                BasicBlock *CallBrTarget, BasicBlock *Succ) {
+  static_assert(std::is_same_v<TI, CycleInfo> || std::is_same_v<TI, LoopInfo>,
+                "type must be CycleInfo or LoopInfo");
+  if (!LCI)
+    return false;
+
+  T *LC;
+  if constexpr (std::is_same_v<TI, CycleInfo>)
+    LC = LCI->getSmallestCommonCycle(CallBrBlock, Succ);
+  else
+    LC = LCI->getSmallestCommonLoop(CallBrBlock, Succ);
+  if (!LC)
+    return false;
+
+  if constexpr (std::is_same_v<TI, CycleInfo>)
+    LCI->addBlockToCycle(CallBrTarget, LC);
+  else
+    LC->addBasicBlockToLoop(CallBrTarget, *LCI);
+
+  return true;
+}
+
+BasicBlock *llvm::SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ,
+                                  unsigned SuccIdx, DomTreeUpdater *DTU,
+                                  CycleInfo *CI, LoopInfo *LI,
+                                  bool *UpdatedLI) {
+  CallBrInst *CallBr = dyn_cast<CallBrInst>(CallBrBlock->getTerminator());
+  assert(CallBr && "expected callbr terminator");
+  assert(SuccIdx < CallBr->getNumSuccessors() &&
+         Succ == CallBr->getSuccessor(SuccIdx) && "invalid successor index");
+
+  // Create a new block between callbr and the specified successor.
+  // splitBlockBefore cannot be re-used here since it cannot split if the split
+  // point is a PHI node (because BasicBlock::splitBasicBlockBefore cannot
+  // handle that). But we don't need to rewire every part of a potential PHI
+  // node. We only care about the edge between CallBrBlock and the original
+  // successor.
+  BasicBlock *CallBrTarget =
+      BasicBlock::Create(CallBrBlock->getContext(),
+                         CallBrBlock->getName() + ".target." + Succ->getName(),
+                         CallBrBlock->getParent());
+  // Rewire control flow from the new target block to the original successor.
+  Succ->replacePhiUsesWith(CallBrBlock, CallBrTarget);
+  // Rewire control flow from callbr to the new target block.
+  CallBr->setSuccessor(SuccIdx, CallBrTarget);
+  // Jump from the new target block to the original successor.
+  BranchInst::Create(Succ, CallBrTarget);
+
+  bool Updated =
+      updateCycleLoopInfo<LoopInfo, Loop>(LI, CallBrBlock, CallBrTarget, Succ);
+  if (UpdatedLI)
+    *UpdatedLI = Updated;
+  updateCycleLoopInfo<CycleInfo, Cycle>(CI, CallBrBlock, CallBrTarget, Succ);
+  if (DTU) {
+    DTU->applyUpdates({{DominatorTree::Insert, CallBrBlock, CallBrTarget}});
+    if (DTU->getDomTree().dominates(CallBrBlock, Succ))
+      DTU->applyUpdates({{DominatorTree::Delete, CallBrBlock, Succ},
+                         {DominatorTree::Insert, CallBrTarget, Succ}});
+  }
+
+  return CallBrTarget;
+}
+
 void llvm::setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) {
   if (auto *II = dyn_cast<InvokeInst>(TI))
     II->setUnwindDest(Succ);
diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
index 0046a00..287a177 100644
--- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Transforms/Utils/ControlFlowUtils.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/ValueHandle.h"
@@ -281,7 +282,9 @@ std::pair<BasicBlock *, bool> ControlFlowHub::finalize(
 
   for (auto [BB, Succ0, Succ1] : Branches) {
 #ifndef NDEBUG
-    assert(Incoming.insert(BB).second && "Duplicate entry for incoming block.");
+    assert(
+        (Incoming.insert(BB).second || isa<CallBrInst>(BB->getTerminator())) &&
+        "Duplicate entry for incoming block.");
 #endif
 
     if (Succ0)
       Outgoing.insert(Succ0);
diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
index 45e1d12..804af22 100644
--- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp
+++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
@@ -79,6 +79,53 @@
 // Limitation: The pass cannot handle switch statements and indirect
 //             branches. Both must be lowered to plain branches first.
 //
+// CallBr support: CallBr is handled as a more general branch instruction which
+// can have multiple successors. The pass redirects the edges to intermediate
+// target blocks that unconditionally branch to the original callbr target
+// blocks. This allows the control flow hub to know which of the original
+// target blocks to jump to.
+// Example input CFG:
+//                        Entry (callbr)
+//                       /     \
+//                      v       v
+//                      H ----> B
+//                      ^      /|
+//                       `----' |
+//                              v
+//                             Exit
+//
+// becomes:
+//                        Entry (callbr)
+//                       /     \
+//                      v       v
+//                 target.H   target.B
+//                      |       |
+//                      v       v
+//                      H ----> B
+//                      ^      /|
+//                       `----' |
+//                              v
+//                             Exit
+//
+// Note: the output CFG is converted to a natural loop with a new header N.
+//
+//                        Entry (callbr)
+//                       /     \
+//                      v       v
+//                 target.H   target.B
+//                      \       /
+//                       \     /
+//                        v   v
+//                          N <---.
+//                         / \     \
+//                        /   \     |
+//                       v     v    /
+//                       H --> B --'
+//                             |
+//                             v
+//                            Exit
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/FixIrreducible.h"
@@ -231,6 +278,7 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
     return false;
 
   LLVM_DEBUG(dbgs() << "Processing cycle:\n" << CI.print(&C) << "\n";);
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
 
   ControlFlowHub CHub;
   SetVector<BasicBlock *> Predecessors;
@@ -242,18 +290,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
   }
 
   for (BasicBlock *P : Predecessors) {
-    auto *Branch = cast<BranchInst>(P->getTerminator());
-    // Exactly one of the two successors is the header.
-    BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr;
-    BasicBlock *Succ1 = Succ0 ? nullptr : Header;
-    if (!Succ0)
-      assert(Branch->getSuccessor(1) == Header);
-    assert(Succ0 || Succ1);
-    CHub.addBranch(P, Succ0, Succ1);
-
-    LLVM_DEBUG(dbgs() << "Added internal branch: " << P->getName() << " -> "
-                      << (Succ0 ? Succ0->getName() : "") << " "
-                      << (Succ1 ? Succ1->getName() : "") << "\n");
+    if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator())) {
+      // Exactly one of the two successors is the header.
+      BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr;
+      BasicBlock *Succ1 = Succ0 ? nullptr : Header;
+      assert(Succ0 || Branch->getSuccessor(1) == Header);
+      assert(Succ0 || Succ1);
+      CHub.addBranch(P, Succ0, Succ1);
+
+      LLVM_DEBUG(dbgs() << "Added internal branch: " << printBasicBlock(P)
+                        << " -> " << printBasicBlock(Succ0)
+                        << (Succ0 && Succ1 ? " " : "")
+                        << printBasicBlock(Succ1) << '\n');
+    } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) {
+      for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) {
+        BasicBlock *Succ = CallBr->getSuccessor(I);
+        if (Succ != Header)
+          continue;
+        BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI);
+        CHub.addBranch(NewSucc, Succ);
+        LLVM_DEBUG(dbgs() << "Added internal branch: "
+                          << printBasicBlock(NewSucc) << " -> "
+                          << printBasicBlock(Succ) << '\n');
+      }
+    } else {
+      llvm_unreachable("unsupported block terminator");
+    }
   }
 
   // Redirect external incoming edges. This includes the edges on the header.
@@ -266,17 +328,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
   }
 
   for (BasicBlock *P : Predecessors) {
-    auto *Branch = cast<BranchInst>(P->getTerminator());
-    BasicBlock *Succ0 = Branch->getSuccessor(0);
-    Succ0 = C.contains(Succ0) ? Succ0 : nullptr;
-    BasicBlock *Succ1 =
-        Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
-    Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr;
-    CHub.addBranch(P, Succ0, Succ1);
-
-    LLVM_DEBUG(dbgs() << "Added external branch: " << P->getName() << " -> "
-                      << (Succ0 ? Succ0->getName() : "") << " "
-                      << (Succ1 ? Succ1->getName() : "") << "\n");
+    if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator()); Branch) {
+      BasicBlock *Succ0 = Branch->getSuccessor(0);
+      Succ0 = C.contains(Succ0) ? Succ0 : nullptr;
+      BasicBlock *Succ1 =
+          Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
+      Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr;
+      CHub.addBranch(P, Succ0, Succ1);
+
+      LLVM_DEBUG(dbgs() << "Added external branch: " << printBasicBlock(P)
                        << " -> " << printBasicBlock(Succ0)
+                        << (Succ0 && Succ1 ? " " : "")
+                        << printBasicBlock(Succ1) << '\n');
+    } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) {
+      for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) {
+        BasicBlock *Succ = CallBr->getSuccessor(I);
+        if (!C.contains(Succ))
+          continue;
+        BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI);
+        CHub.addBranch(NewSucc, Succ);
+        LLVM_DEBUG(dbgs() << "Added external branch: "
+                          << printBasicBlock(NewSucc) << " -> "
+                          << printBasicBlock(Succ) << '\n');
+      }
+    } else {
+      llvm_unreachable("unsupported block terminator");
+    }
   }
 
   // Redirect all the backedges through a "hub" consisting of a series
@@ -292,7 +369,6 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
   SetVector<BasicBlock *> Entries;
   Entries.insert(C.entry_rbegin(), C.entry_rend());
 
-  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
   CHub.finalize(&DTU, GuardBlocks, "irr");
 #if defined(EXPENSIVE_CHECKS)
   assert(DT.verify(DominatorTree::VerificationLevel::Full));
@@ -325,8 +401,6 @@ static bool FixIrreducibleImpl(Function &F, CycleInfo &CI, DominatorTree &DT,
   LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: "
                    << F.getName() << "\n");
 
-  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
   bool Changed = false;
   for (Cycle *TopCycle : CI.toplevel_cycles()) {
     for (Cycle *C : depth_first(TopCycle)) {
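Condensed, the callbr handling added in both predecessor loops follows one pattern: split each callbr edge into the block of interest, then hand the fresh edge to the hub. A sketch of that pattern under an invented name (splitCallBrEdgesTo); the predecessors are collected first because SplitCallBrEdge rewires them during iteration:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

static void splitCallBrEdgesTo(BasicBlock *Header, DomTreeUpdater &DTU,
                               CycleInfo &CI, LoopInfo *LI) {
  SmallVector<BasicBlock *> Preds(predecessors(Header));
  for (BasicBlock *P : Preds) {
    auto *CallBr = dyn_cast<CallBrInst>(P->getTerminator());
    if (!CallBr)
      continue;
    for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I)
      if (CallBr->getSuccessor(I) == Header)
        SplitCallBrEdge(P, Header, I, &DTU, &CI, LI); // from the diff above
  }
}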
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 4fe736a..94dfd3a 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -499,9 +499,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L);
   const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
 
-  unsigned EstimatedLoopInvocationWeight = 0;
   std::optional<unsigned> OriginalTripCount =
-      llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight);
+      llvm::getLoopEstimatedTripCount(L);
+  BranchProbability OriginalLoopProb = llvm::getLoopProbability(L);
 
   // Effectively "DCE" unrolled iterations that are beyond the max tripcount
   // and will never be executed.
@@ -592,11 +592,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
                                               : isEpilogProfitable(L);
 
   if (ULO.Runtime &&
-      !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
-                                  EpilogProfitability, ULO.UnrollRemainder,
-                                  ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
-                                  PreserveLCSSA, ULO.SCEVExpansionBudget,
-                                  ULO.RuntimeUnrollMultiExit, RemainderLoop)) {
+      !UnrollRuntimeLoopRemainder(
+          L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability,
+          ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
+          PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit,
+          RemainderLoop, OriginalTripCount, OriginalLoopProb)) {
     if (ULO.Force)
       ULO.Runtime = false;
     else {
@@ -1130,11 +1130,46 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     LI->erase(L);
     // We shouldn't try to use `L` anymore.
     L = nullptr;
-  } else if (OriginalTripCount) {
-    // Update the trip count. Note that the remainder has already logic
-    // computing it in `UnrollRuntimeLoopRemainder`.
-    setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count,
-                              EstimatedLoopInvocationWeight);
+  } else {
+    // Update metadata for the loop's branch weights and estimated trip count:
+    // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch
+    //   weights, latch branch weights, and estimated trip count of the
+    //   remainder loop it creates.  It also sets the branch weights for the
+    //   unrolled loop guard it creates.  The branch weights for the unrolled
+    //   loop latch are adjusted below.  FIXME: Handle prologue loops.
+    // - Otherwise, if unrolled loop iteration latches become unconditional,
+    //   branch weights are adjusted above.  FIXME: Actually handle such
+    //   unconditional latches.
+    // - Otherwise, the original loop's branch weights are correct for the
+    //   unrolled loop, so do not adjust them.
+    // - In all cases, the unrolled loop's estimated trip count is set below.
+    //
+    // As an example of the last case, consider what happens if the unroll
+    // count is 4 for a loop with an estimated trip count of 10 when we do not
+    // create a remainder loop and all iterations' latches remain conditional.
+    // Each unrolled iteration's latch still has the same probability of
+    // exiting the loop as it did when in the original loop, and thus it
+    // should still have the same branch weights.  Each unrolled iteration's
+    // non-zero probability of exiting already appropriately reduces the
+    // probability of reaching the remaining iterations just as it did in the
+    // original loop.  Trying to also adjust the branch weights of the final
+    // unrolled iteration's latch (i.e., the backedge for the unrolled loop as
+    // a whole) to reflect its new trip count of 3 will erroneously further
+    // reduce its block frequencies.  However, in case an analysis later needs
+    // to estimate the trip count of the unrolled loop as a whole without
+    // considering the branch weights for each unrolled iteration's latch
+    // within it, we store the new trip count as separate metadata.
+    if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) {
+      // Where p is always the probability of executing at least 1 more
+      // iteration, the probability for at least n more iterations is p^n.
+      setLoopProbability(L, OriginalLoopProb.pow(ULO.Count));
+    }
+    if (OriginalTripCount) {
+      unsigned NewTripCount = *OriginalTripCount / ULO.Count;
+      if (!ULO.Runtime && *OriginalTripCount % ULO.Count)
+        ++NewTripCount;
+      setLoopEstimatedTripCount(L, NewTripCount);
+    }
   }
 
   // LoopInfo should not be valid, confirm that.
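The rounding rule in the estimated-trip-count update is worth a worked example. Extracted into a free function for illustration (newTripCount is an invented name): with an original estimate of 10 and an unroll count of 4, a runtime-unrolled loop gets 10 / 4 = 2 because the epilog runs the 10 % 4 = 2 leftover iterations, while without a remainder loop the estimate rounds up to 3 since the leftovers still execute inside the unrolled body:

#include <cassert>

unsigned newTripCount(unsigned OriginalTripCount, unsigned Count,
                      bool Runtime) {
  unsigned NewTripCount = OriginalTripCount / Count;
  // Without a runtime remainder loop, leftover iterations still run inside
  // the unrolled loop, so round the estimate up.
  if (!Runtime && OriginalTripCount % Count)
    ++NewTripCount;
  return NewTripCount;
}

int main() {
  assert(newTripCount(10, 4, /*Runtime=*/true) == 2);
  assert(newTripCount(10, 4, /*Runtime=*/false) == 3);
}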
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 6312831..1e8f6cc 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <cmath>
 
 using namespace llvm;
 
@@ -195,6 +196,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
   }
 }
 
+/// Assume, due to our position in the remainder loop or its guard, anywhere
+/// from 0 to \p N more iterations can possibly execute.  Among such cases in
+/// the original loop (with loop probability \p OriginalLoopProb), what is the
+/// probability of executing at least one more iteration?
+static BranchProbability
+probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) {
+  // Each of these variables holds the original loop's probability that the
+  // number of iterations it will execute is some m in the specified range.
+  BranchProbability ProbOne = OriginalLoopProb;                // 1 <= m
+  BranchProbability ProbTooMany = ProbOne.pow(N + 1);          // N + 1 <= m
+  BranchProbability ProbNotTooMany = ProbTooMany.getCompl();   // 0 <= m <= N
+  BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N
+  return ProbOneNotTooMany / ProbNotTooMany;
+}
+
 /// Connect the unrolling epilog code to the original loop.
 /// The unrolling epilog code contains code to execute the
 /// 'extra' iterations if the run-time trip count modulo the
@@ -221,7 +237,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
                           BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
                           ValueToValueMapTy &VMap, DominatorTree *DT,
                           LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE,
-                          unsigned Count, AssumptionCache &AC) {
+                          unsigned Count, AssumptionCache &AC,
+                          BranchProbability OriginalLoopProb) {
   BasicBlock *Latch = L->getLoopLatch();
   assert(Latch && "Loop must have a latch");
   BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
@@ -332,12 +349,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
                          PreserveLCSSA);
   // Add the branch to the exit block (around the epilog loop)
   MDNode *BranchWeights = nullptr;
-  if (hasBranchWeightMD(*Latch->getTerminator())) {
+  if (OriginalLoopProb.isUnknown() &&
+      hasBranchWeightMD(*Latch->getTerminator())) {
     // Assume equal distribution in interval [0, Count).
     MDBuilder MDB(B.getContext());
     BranchWeights = MDB.createBranchWeights(1, Count - 1);
   }
-  B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights);
+  BranchInst *RemainderLoopGuard =
+      B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights);
+  if (!OriginalLoopProb.isUnknown()) {
+    setBranchProbability(RemainderLoopGuard,
+                         probOfNextInRemainder(OriginalLoopProb, Count - 1),
+                         /*ForFirstTarget=*/true);
+  }
   InsertPt->eraseFromParent();
   if (DT) {
     auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit);
@@ -357,14 +381,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
 /// The cloned blocks should be inserted between InsertTop and InsertBot.
 /// InsertTop should be new preheader, InsertBot new loop exit.
 /// Returns the new cloned loop that is created.
-static Loop *
-CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
-                const bool UnrollRemainder,
-                BasicBlock *InsertTop,
-                BasicBlock *InsertBot, BasicBlock *Preheader,
+static Loop *CloneLoopBlocks(Loop *L, Value *NewIter,
+                             const bool UseEpilogRemainder,
+                             const bool UnrollRemainder, BasicBlock *InsertTop,
+                             BasicBlock *InsertBot, BasicBlock *Preheader,
                              std::vector<BasicBlock *> &NewBlocks,
                             LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
-                             DominatorTree *DT, LoopInfo *LI, unsigned Count) {
+                             DominatorTree *DT, LoopInfo *LI, unsigned Count,
+                             std::optional<unsigned> OriginalTripCount,
+                             BranchProbability OriginalLoopProb) {
   StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
   BasicBlock *Header = L->getHeader();
   BasicBlock *Latch = L->getLoopLatch();
@@ -419,7 +444,8 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
           Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next");
       Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp");
       MDNode *BranchWeights = nullptr;
-      if (hasBranchWeightMD(*LatchBR)) {
+      if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) &&
+          hasBranchWeightMD(*LatchBR)) {
         uint32_t ExitWeight;
         uint32_t BackEdgeWeight;
         if (Count >= 3) {
@@ -437,7 +463,29 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
         MDBuilder MDB(Builder.getContext());
         BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
       }
-      Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights);
+      BranchInst *RemainderLoopLatch =
+          Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights);
+      if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
+        // Compute the total frequency of the original loop body from the
+        // remainder iterations.  Once we've reached them, the first of them
+        // always executes, so its frequency and probability are 1.
+        double FreqRemIters = 1;
+        if (Count > 2) {
+          BranchProbability ProbReaching = BranchProbability::getOne();
+          for (unsigned N = Count - 2; N >= 1; --N) {
+            ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N);
+            FreqRemIters += double(ProbReaching.getNumerator()) /
+                            ProbReaching.getDenominator();
+          }
+        }
+        // Solve for the loop probability that would produce that frequency.
+        // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters.
+        double ProbDouble = 1 - 1 / FreqRemIters;
+        BranchProbability Prob = BranchProbability::getBranchProbability(
+            std::round(ProbDouble * BranchProbability::getDenominator()),
+            BranchProbability::getDenominator());
+        setBranchProbability(RemainderLoopLatch, Prob, /*ForFirstTarget=*/true);
+      }
       NewIdx->addIncoming(Zero, InsertTop);
       NewIdx->addIncoming(IdxNext, NewBB);
       LatchBR->eraseFromParent();
@@ -460,25 +508,13 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
   Loop *NewLoop = NewLoops[L];
   assert(NewLoop && "L should have been cloned");
 
-  MDNode *LoopID = NewLoop->getLoopID();
-
-  // Only add loop metadata if the loop is not going to be completely
-  // unrolled.
-  if (UnrollRemainder)
-    return NewLoop;
-
-  std::optional<MDNode *> NewLoopID = makeFollowupLoopID(
-      LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
-  if (NewLoopID) {
-    NewLoop->setLoopID(*NewLoopID);
-
-    // Do not setLoopAlreadyUnrolled if loop attributes have been defined
-    // explicitly.
-    return NewLoop;
-  }
+  if (OriginalTripCount && UseEpilogRemainder)
+    setLoopEstimatedTripCount(NewLoop, *OriginalTripCount % Count);
 
   // Add unroll disable metadata to disable future unrolling for this loop.
-  NewLoop->setLoopAlreadyUnrolled();
+  if (!UnrollRemainder)
+    NewLoop->setLoopAlreadyUnrolled();
 
   return NewLoop;
 }
@@ -603,7 +639,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
     const TargetTransformInfo *TTI, bool PreserveLCSSA,
     unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
-    Loop **ResultLoop) {
+    Loop **ResultLoop, std::optional<unsigned> OriginalTripCount,
+    BranchProbability OriginalLoopProb) {
   LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
   LLVM_DEBUG(L->dump());
   LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
@@ -823,12 +860,23 @@ bool llvm::UnrollRuntimeLoopRemainder(
   BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
   // Branch to either remainder (extra iterations) loop or unrolling loop.
   MDNode *BranchWeights = nullptr;
-  if (hasBranchWeightMD(*Latch->getTerminator())) {
+  if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) &&
+      hasBranchWeightMD(*Latch->getTerminator())) {
     // Assume loop is nearly always entered.
     MDBuilder MDB(B.getContext());
     BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights);
   }
-  B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights);
+  BranchInst *UnrollingLoopGuard =
+      B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights);
+  if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
+    // The original loop's first iteration always happens.  Compute the
+    // probability of the original loop executing Count-1 iterations after
+    // that to complete the first iteration of the unrolled loop.
+    BranchProbability ProbOne = OriginalLoopProb;
+    BranchProbability ProbRest = ProbOne.pow(Count - 1);
+    setBranchProbability(UnrollingLoopGuard, ProbRest,
+                         /*ForFirstTarget=*/false);
+  }
   PreHeaderBR->eraseFromParent();
   if (DT) {
     if (UseEpilogRemainder)
@@ -855,9 +903,10 @@ bool llvm::UnrollRuntimeLoopRemainder(
   // iterations. This function adds the appropriate CFG connections.
   BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
   BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
-  Loop *remainderLoop = CloneLoopBlocks(
-      L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot,
-      NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count);
+  Loop *remainderLoop =
+      CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop,
+                      InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT,
+                      LI, Count, OriginalTripCount, OriginalLoopProb);
 
   // Insert the cloned blocks into the function.
   F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end());
@@ -956,7 +1005,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
     // Connect the epilog code to the original loop and update the
     // PHI functions.
     ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader,
-                  NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC);
+                  NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC,
+                  OriginalLoopProb);
 
     // Update counter in loop for unrolling.
     // Use an incrementing IV.  Pre-incr/post-incr is backedge/trip count.
Compute the +    // probability of the original loop executing Count-1 iterations after that +    // to complete the first iteration of the unrolled loop. +    BranchProbability ProbOne = OriginalLoopProb; +    BranchProbability ProbRest = ProbOne.pow(Count - 1); +    setBranchProbability(UnrollingLoopGuard, ProbRest, +                         /*ForFirstTarget=*/false); +  }    PreHeaderBR->eraseFromParent();    if (DT) {      if (UseEpilogRemainder) @@ -855,9 +903,10 @@ bool llvm::UnrollRuntimeLoopRemainder(    // iterations. This function adds the appropriate CFG connections.    BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;    BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; -  Loop *remainderLoop = CloneLoopBlocks( -      L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot, -      NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count); +  Loop *remainderLoop = +      CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, +                      InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, +                      LI, Count, OriginalTripCount, OriginalLoopProb);    // Insert the cloned blocks into the function.    F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end()); @@ -956,7 +1005,8 @@ bool llvm::UnrollRuntimeLoopRemainder(      // Connect the epilog code to the original loop and update the      // PHI functions.      ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, -                  NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC); +                  NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC, +                  OriginalLoopProb);      // Update counter in loop for unrolling.      // Use an incrementing IV.  Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b6ba822..8be471b 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -962,13 +962,51 @@ bool llvm::setLoopEstimatedTripCount(    if (LatchBranch->getSuccessor(0) != L->getHeader())      std::swap(BackedgeTakenWeight, LatchExitWeight); -  MDBuilder MDB(LatchBranch->getContext()); -    // Set/Update profile metadata. 
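The setBranchWeights call below and the probability helpers added after it rely on the same encoding: a two-successor branch stores a weight pair (W0, W1), and the first target's probability is W0 / (W0 + W1). A small standalone round trip, using 1 << 31 for BranchProbability::getDenominator() as in the code above:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t Denom = 1u << 31; // BranchProbability::getDenominator().
      uint64_t Num = 3 * (Denom / 8);  // P = 3/8 for the first successor.
      // setBranchProbability stores {P, P.getCompl()} as the weight pair...
      uint64_t W0 = Num, W1 = Denom - Num;
      // ...and getBranchProbability reads it back as W0 / (W0 + W1).
      std::printf("P = %f\n", double(W0) / double(W0 + W1)); // 0.375
    }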
-  LatchBranch->setMetadata(
-      LLVMContext::MD_prof,
-      MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
+  setBranchWeights(*LatchBranch, {BackedgeTakenWeight, LatchExitWeight},
+                   /*IsExpected=*/false);
+
+  return true;
+}
+
+BranchProbability llvm::getLoopProbability(Loop *L) {
+  BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+  if (!LatchBranch)
+    return BranchProbability::getUnknown();
+  bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader();
+  return getBranchProbability(LatchBranch, FirstTargetIsLoop);
+}
+bool llvm::setLoopProbability(Loop *L, BranchProbability P) {
+  BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+  if (!LatchBranch)
+    return false;
+  bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader();
+  return setBranchProbability(LatchBranch, P, FirstTargetIsLoop);
+}
+
+BranchProbability llvm::getBranchProbability(BranchInst *B,
+                                             bool ForFirstTarget) {
+  if (B->getNumSuccessors() != 2)
+    return BranchProbability::getUnknown();
+  uint64_t Weight0, Weight1;
+  if (!extractBranchWeights(*B, Weight0, Weight1))
+    return BranchProbability::getUnknown();
+  if (!ForFirstTarget)
+    std::swap(Weight0, Weight1);
+  return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1);
+}
+
+bool llvm::setBranchProbability(BranchInst *B, BranchProbability P,
+                                bool ForFirstTarget) {
+  if (B->getNumSuccessors() != 2)
+    return false;
+  BranchProbability Prob0 = P;
+  BranchProbability Prob1 = P.getCompl();
+  if (!ForFirstTarget)
+    std::swap(Prob0, Prob1);
+  setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()},
+                   /*IsExpected=*/false);
 
   return true;
 }
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index a9ab3b3..27fed73 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -809,7 +809,6 @@ public:
   void emitInstructionAnnot(const Instruction *I,
                             formatted_raw_ostream &OS) override {
     if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
-      OS << "; Has predicate info\n";
       if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
         OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
            << " Comparison:" << *PB->Condition << " Edge: [";
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 4fac5d3..7f6d779 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1866,10 +1866,19 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
   // If either of the blocks has it's address taken, then we can't do this fold,
   // because the code we'd hoist would no longer run when we jump into the block
   // by it's address.
-  for (auto *Succ : successors(BB))
-    if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+  for (auto *Succ : successors(BB)) {
+    if (Succ->hasAddressTaken())
       return false;
-
+    if (Succ->getSinglePredecessor())
+      continue;
+    // If Succ has more than one predecessor, continue to check whether Succ
+    // contains only one `unreachable` inst. Since executing an `unreachable`
+    // inst is UB, we can relax the condition based on the assumption that the
+    // program would never enter Succ and trigger such UB.
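The check continues below; read as a standalone predicate, the relaxed condition is roughly the following sketch (canHoistFromSuccessors is an illustrative name using the same LLVM APIs, not the verbatim pass code, which interleaves this with the SkipFlags bookkeeping that follows):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/CFG.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Hoisting from BB's successors is allowed only if each successor either
    // has BB as its single predecessor or starts with `unreachable`; in the
    // latter case executing the block would be UB, so it can be assumed the
    // block is never entered.
    static bool canHoistFromSuccessors(const BasicBlock *BB) {
      for (const BasicBlock *Succ : successors(BB)) {
        if (Succ->hasAddressTaken())
          return false;
        if (!Succ->getSinglePredecessor() &&
            !isa<UnreachableInst>(Succ->front()))
          return false;
      }
      return true;
    }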
+    if (isa<UnreachableInst>(*Succ->begin())) +      continue; +    return false; +  }    // The second of pair is a SkipFlags bitmask.    using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>;    SmallVector<SuccIterPair, 8> SuccIterPairs; @@ -5968,14 +5977,14 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,    }    // Prune obsolete incoming values off the successors' PHI nodes. -  for (auto BBI = Dest->begin(); isa<PHINode>(BBI); ++BBI) { +  for (auto &PHI : make_early_inc_range(Dest->phis())) {      unsigned PreviousEdges = Cases->size();      if (Dest == SI->getDefaultDest())        ++PreviousEdges;      for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I) -      cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); +      PHI.removeIncomingValue(SI->getParent());    } -  for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) { +  for (auto &PHI : make_early_inc_range(OtherDest->phis())) {      unsigned PreviousEdges = OtherCases->size();      if (OtherDest == SI->getDefaultDest())        ++PreviousEdges; @@ -5984,7 +5993,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,      if (NewBI->isUnconditional())        ++E;      for (unsigned I = 0; I != E; ++I) -      cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); +      PHI.removeIncomingValue(SI->getParent());    }    // Clean up the default block - it may have phis or other instructions before @@ -7623,7 +7632,9 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,      auto *DefaultCaseBB = SI->getDefaultDest();      BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);      auto It = OrigBB->getTerminator()->getIterator(); -    BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); +    auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); +    // BI is handling the default case for SI, and so should share its DebugLoc. +    BI->setDebugLoc(SI->getDebugLoc());      It->eraseFromParent();      addPredecessorToBlock(DefaultCaseBB, OrigBB, SplitBB); diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 9f338db..94c5c170 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -12,7 +12,11 @@  //  // Limitation: This assumes that all terminators in the CFG are direct branches  //             (the "br" instruction). The presence of any other control flow -//             such as indirectbr, switch or callbr will cause an assert. +//             such as indirectbr or switch will cause an assert. +//             The callbr terminator is supported by creating intermediate +//             target blocks that unconditionally branch to the original target +//             blocks. These intermediate target blocks can then be redirected +//             through the ControlFlowHub as usual.  //  //===----------------------------------------------------------------------===// @@ -150,25 +154,55 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {    SmallVector<BasicBlock *, 8> ExitingBlocks;    L->getExitingBlocks(ExitingBlocks); +  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); +  SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix;    // Redirect exiting edges through a control flow hub.    ControlFlowHub CHub; -  for (auto *BB : ExitingBlocks) { -    auto *Branch = cast<BranchInst>(BB->getTerminator()); -    BasicBlock *Succ0 = Branch->getSuccessor(0); -    Succ0 = L->contains(Succ0) ? 
nullptr : Succ0;
-
-    BasicBlock *Succ1 =
-        Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
-    Succ1 = L->contains(Succ1) ? nullptr : Succ1;
-    CHub.addBranch(BB, Succ0, Succ1);
-
-    LLVM_DEBUG(dbgs() << "Added exiting branch: " << BB->getName() << " -> {"
-                      << (Succ0 ? Succ0->getName() : "<none>") << ", "
-                      << (Succ1 ? Succ1->getName() : "<none>") << "}\n");
+
+  for (unsigned I = 0; I < ExitingBlocks.size(); ++I) {
+    BasicBlock *BB = ExitingBlocks[I];
+    if (BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator())) {
+      BasicBlock *Succ0 = Branch->getSuccessor(0);
+      Succ0 = L->contains(Succ0) ? nullptr : Succ0;
+
+      BasicBlock *Succ1 =
+          Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
+      Succ1 = L->contains(Succ1) ? nullptr : Succ1;
+      CHub.addBranch(BB, Succ0, Succ1);
+
+      LLVM_DEBUG(dbgs() << "Added exiting branch: " << printBasicBlock(BB)
+                        << " -> " << printBasicBlock(Succ0)
+                        << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1)
+                        << '\n');
+    } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(BB->getTerminator())) {
+      for (unsigned J = 0; J < CallBr->getNumSuccessors(); ++J) {
+        BasicBlock *Succ = CallBr->getSuccessor(J);
+        if (L->contains(Succ))
+          continue;
+        bool UpdatedLI = false;
+        BasicBlock *NewSucc =
+            SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI);
+        // Even if CallBr and Succ do not have a common parent loop, we need to
+        // add the new target block to the parent loop of the current loop.
+        if (!UpdatedLI)
+          CallBrTargetBlocksToFix.push_back(NewSucc);
+        // ExitingBlocks is later used to restore SSA, so we need to make sure
+        // that the blocks used for phi nodes in the guard blocks match the
+        // predecessors of the guard blocks, which, in the case of callbr, are
+        // the new intermediate target blocks instead of the callbr blocks
+        // themselves.
+        ExitingBlocks[I] = NewSucc;
+        CHub.addBranch(NewSucc, Succ);
+        LLVM_DEBUG(dbgs() << "Added exiting branch: "
+                          << printBasicBlock(NewSucc) << " -> "
+                          << printBasicBlock(Succ) << '\n');
+      }
+    } else {
+      llvm_unreachable("unsupported block terminator");
+    }
   }
 
   SmallVector<BasicBlock *, 8> GuardBlocks;
-  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
   BasicBlock *LoopExitBlock;
   bool ChangedCFG;
   std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(
@@ -187,10 +221,19 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
   // The guard blocks were created outside the loop, so they need to become
   // members of the parent loop.
-  if (auto ParentLoop = L->getParentLoop()) {
+  // Same goes for the callbr target blocks.  Although we try to add them to the
+  // smallest common parent loop of the callbr block and the corresponding
+  // original target block, there might not have been such a loop, in which case
+  // the newly created callbr target blocks are not part of any loop. For nested
+  // loops, this might result in them leading to a loop with multiple entry
+  // points.
+  if (auto *ParentLoop = L->getParentLoop()) {
     for (auto *G : GuardBlocks) {
       ParentLoop->addBasicBlockToLoop(G, LI);
     }
+    for (auto *C : CallBrTargetBlocksToFix) {
+      ParentLoop->addBasicBlockToLoop(C, LI);
+    }
     ParentLoop->verifyLoop();
   }
@@ -218,8 +261,6 @@ bool UnifyLoopExitsLegacyPass::runOnFunction(Function &F) {
   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
   return runImpl(LI, DT);
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f7968ab..25bf49d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3908,7 +3908,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
         continue;
       VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
-                            *CM.PSE.getSE());
+                            *CM.PSE.getSE(), OrigLoop);
       precomputeCosts(*Plan, VF, CostCtx);
       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4166,7 +4166,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       // Add on other costs that are modelled in VPlan, but not in the legacy
       // cost model.
       VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
-                            *CM.PSE.getSE());
+                            *CM.PSE.getSE(), OrigLoop);
       VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
       assert(VectorRegion && "Expected to have a vector region!");
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -5750,13 +5750,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
         UpdateMemOpUserCost(cast<LoadInst>(I));
       } else if (const auto *Group = getInterleavedAccessGroup(I)) {
-        // Scalarize an interleave group of address loads.
-        for (unsigned I = 0; I < Group->getFactor(); ++I) {
-          if (Instruction *Member = Group->getMember(I)) {
-            setWideningDecision(
-                Member, VF, CM_Scalarize,
-                (VF.getKnownMinValue() *
-                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
+        // Scalarize all members of this interleaved group when any member
+        // is used as an address. The address-used load skips scalarization
+        // overhead; other members include it.
+        for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
+          if (Instruction *Member = Group->getMember(Idx)) {
+            InstructionCost Cost =
+                AddrDefs.contains(Member)
+                    ?
(VF.getKnownMinValue() * +                       getMemoryInstructionCost(Member, +                                                ElementCount::getFixed(1))) +                    : getMemInstScalarizationCost(Member, VF); +            setWideningDecision(Member, VF, CM_Scalarize, Cost);              UpdateMemOpUserCost(cast<LoadInst>(Member));            }          } @@ -6871,7 +6876,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,  InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,                                                 ElementCount VF) const { -  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); +  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(), +                        OrigLoop);    InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);    // Now compute and add the VPlan-based cost. @@ -7105,12 +7111,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {    // case, don't trigger the assertion, as the extra simplifications may cause a    // different VF to be picked by the VPlan-based cost model.    VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, -                        *CM.PSE.getSE()); +                        *CM.PSE.getSE(), OrigLoop);    precomputeCosts(BestPlan, BestFactor.Width, CostCtx);    // Verify that the VPlan-based and legacy cost models agree, except for VPlans    // with early exits and plans with additional VPlan simplifications. The    // legacy cost model doesn't properly model costs for such loops.    assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || +          !Legal->getLAI()->getSymbolicStrides().empty() ||            planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),                                                  CostCtx, OrigLoop,                                                  BestFactor.Width) || @@ -8335,11 +8342,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(                &R) ||            (isa<VPInstruction>(&R) && !UnderlyingValue))          continue; - -      // FIXME: VPlan0, which models a copy of the original scalar loop, should -      // not use VPWidenPHIRecipe to model the phis. -      assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) && -             UnderlyingValue && "unsupported recipe"); +      assert(isa<VPInstruction>(&R) && UnderlyingValue && "unsupported recipe");        // TODO: Gradually replace uses of underlying instruction by analyses on        // VPlan. @@ -8440,7 +8443,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(    // and mulacc-reduction are implemented.    
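Returning to the setCostBasedWideningDecision change above: members that feed addresses are priced as VF copies of the plain scalar access, while the remaining members additionally pay the scalarization overhead. A toy model with made-up cost numbers (memberCost is an illustrative name; TTI's real values are target-specific):

    #include <cstdio>

    // Hypothetical costs: ScalarMemCost per scalar access, plus a fixed
    // insert/extract overhead for members that go through scalarization.
    static unsigned memberCost(bool FeedsAddress, unsigned VF,
                               unsigned ScalarMemCost, unsigned Overhead) {
      return FeedsAddress ? VF * ScalarMemCost
                          : VF * ScalarMemCost + Overhead;
    }

    int main() {
      std::printf("address member: %u\n", memberCost(true, 4, 1, 6));  // 4
      std::printf("other member:   %u\n", memberCost(false, 4, 1, 6)); // 10
    }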
if (!CM.foldTailWithEVL()) {      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, -                          *CM.PSE.getSE()); +                          *CM.PSE.getSE(), OrigLoop);      VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,                               CostCtx, Range);    } @@ -9910,7 +9913,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {      bool ForceVectorization =          Hints.getForce() == LoopVectorizeHints::FK_Enabled;      VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, -                          CM.CostKind, *CM.PSE.getSE()); +                          CM.CostKind, *CM.PSE.getSE(), L);      if (!ForceVectorization &&          !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,                                       LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 43166c0..1b55a3b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16920,7 +16920,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(        // otherwise TEPtr depends on TE.        if ((TEInsertBlock != InsertPt->getParent() ||             TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) && -          !CheckOrdering(InsertPt)) +          (!CheckOrdering(InsertPt) || +           (UseEI.UserTE->hasCopyableElements() && +            isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) && +            is_contained(UseEI.UserTE->Scalars, TEInsertPt))))          continue;        // The node is reused - exit.        if (CheckAndUseSameNode(TEPtr)) diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 2aaabd9..965426f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -350,13 +350,14 @@ struct VPCostContext {    SmallPtrSet<Instruction *, 8> SkipCostComputation;    TargetTransformInfo::TargetCostKind CostKind;    ScalarEvolution &SE; +  const Loop *L;    VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,                  const VPlan &Plan, LoopVectorizationCostModel &CM,                  TargetTransformInfo::TargetCostKind CostKind, -                ScalarEvolution &SE) +                ScalarEvolution &SE, const Loop *L)        : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), -        CostKind(CostKind), SE(SE) {} +        CostKind(CostKind), SE(SE), L(L) {}    /// Return the cost for \p UI with \p VF using the legacy cost model as    /// fallback until computing the cost of all recipes migrates to VPlan. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9a63c80..bde62dd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3167,26 +3167,30 @@ bool VPReplicateRecipe::shouldPack() const {    });  } -/// Returns true if \p Ptr is a pointer computation for which the legacy cost -/// model computes a SCEV expression when computing the address cost. -static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { +/// Returns a SCEV expression for \p Ptr if it is a pointer computation for +/// which the legacy cost model computes a SCEV expression when computing the +/// address cost. 
Computing SCEVs for VPValues is incomplete and returns +/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In +/// those cases we fall back to the legacy cost model. Otherwise return nullptr. +static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE, +                                        const Loop *L) {    auto *PtrR = Ptr->getDefiningRecipe();    if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&                    cast<VPReplicateRecipe>(PtrR)->getOpcode() ==                        Instruction::GetElementPtr) ||                   isa<VPWidenGEPRecipe>(PtrR) ||                   match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) -    return false; +    return nullptr;    // We are looking for a GEP where all indices are either loop invariant or    // inductions.    for (VPValue *Opd : drop_begin(PtrR->operands())) {      if (!Opd->isDefinedOutsideLoopRegions() &&          !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd)) -      return false; +      return nullptr;    } -  return true; +  return vputils::getSCEVExprForVPValue(Ptr, SE, L);  }  /// Returns true if \p V is used as part of the address of another load or @@ -3354,9 +3358,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,      bool IsLoad = UI->getOpcode() == Instruction::Load;      const VPValue *PtrOp = getOperand(!IsLoad); -    // TODO: Handle cases where we need to pass a SCEV to -    // getAddressComputationCost. -    if (shouldUseAddressAccessSCEV(PtrOp)) +    const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L); +    if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))        break;      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); @@ -3374,7 +3377,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,      InstructionCost ScalarCost =          ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(                                PtrTy, UsedByLoadStoreAddress ? 
nullptr : &Ctx.SE, -                              nullptr, Ctx.CostKind); +                              PtrSCEV, Ctx.CostKind);      if (isSingleScalar())        return ScalarCost; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d9ac26bb..986c801 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1419,6 +1419,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {                                            true /*IsSingleScalar*/);        Clone->insertBefore(RepOrWidenR);        RepOrWidenR->replaceAllUsesWith(Clone); +      if (isDeadRecipe(*RepOrWidenR)) +        RepOrWidenR->eraseFromParent();      }    }  } @@ -4062,7 +4064,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,  DenseMap<const SCEV *, Value *>  VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {    const DataLayout &DL = SE.getDataLayout(); -  SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true); +  SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);    auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());    BasicBlock *EntryBB = Entry->getIRBasicBlock(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4db92e7..8c23e78 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -75,7 +75,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {           B == Plan.getBackedgeTakenCount();  } -const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { +const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V, +                                           ScalarEvolution &SE, const Loop *L) {    if (V->isLiveIn()) {      if (Value *LiveIn = V->getLiveInIRValue())        return SE.getSCEV(LiveIn); @@ -86,6 +87,53 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {    return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe())        .Case<VPExpandSCEVRecipe>(            [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); }) +      .Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) { +        if (!L) +          return SE.getCouldNotCompute(); +        const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); +        return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L, +                                SCEV::FlagAnyWrap); +      }) +      .Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) { +        const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); +        const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L); +        const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L); +        if (any_of(ArrayRef({Start, IV, Scale}), IsaPred<SCEVCouldNotCompute>)) +          return SE.getCouldNotCompute(); + +        return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()), +                             SE.getMulExpr(IV, SE.getTruncateOrSignExtend( +                                                   Scale, IV->getType()))); +      }) +      .Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) { +        const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L); +        const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L); +        if (isa<SCEVCouldNotCompute>(IV) || 
isa<SCEVCouldNotCompute>(Step) ||
+            !Step->isOne())
+          return SE.getCouldNotCompute();
+        return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()),
+                             Step);
+      })
+      .Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) {
+        if (R->getOpcode() != Instruction::GetElementPtr)
+          return SE.getCouldNotCompute();
+
+        const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L);
+        if (isa<SCEVCouldNotCompute>(Base))
+          return SE.getCouldNotCompute();
+
+        SmallVector<const SCEV *> IndexExprs;
+        for (VPValue *Index : drop_begin(R->operands())) {
+          const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L);
+          if (isa<SCEVCouldNotCompute>(IndexExpr))
+            return SE.getCouldNotCompute();
+          IndexExprs.push_back(IndexExpr);
+        }
+
+        Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr())
+                                 ->getSourceElementType();
+        return SE.getGEPExpr(Base, IndexExprs, SrcElementTy);
+      })
       .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 37cd413..c21a0e7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr);
 
 /// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no
 /// SCEV expression could be constructed.
-const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
+const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE,
+                                  const Loop *L = nullptr);
 
 /// Returns true if \p VPV is a single scalar, either because it produces the
 /// same value for all lanes or only has its first lane used.
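The recipe cases added to getSCEVExprForVPValue compose SCEVs bottom-up: a canonical IV becomes the add recurrence {Start,+,1}<L>, and a derived IV evaluates to Start + IV * Scale. A back-of-the-envelope integer model of the derived-IV case (derivedIV is an illustrative name; it ignores the truncate/sign-extend steps the real code inserts to unify types):

    #include <cstdio>

    // VPDerivedIVRecipe: value at canonical-IV step IV is Start + IV * Scale.
    static long derivedIV(long Start, long IV, long Scale) {
      return Start + IV * Scale;
    }

    int main() {
      // The canonical IV {0,+,1} takes 0, 1, 2, ...; with Start = 8 and
      // Scale = 4 the derived IV is the recurrence {8,+,4}: 8, 12, 16, ...
      for (long IV = 0; IV < 3; ++IV)
        std::printf("%ld ", derivedIV(8, IV, 4));
      std::printf("\n");
    }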
