Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 60
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6
-rw-r--r-- | llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 35
-rw-r--r-- | llvm/lib/Transforms/Scalar/GVN.cpp | 1
-rw-r--r-- | llvm/lib/Transforms/Scalar/NewGVN.cpp | 1
-rw-r--r-- | llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 2
-rw-r--r-- | llvm/lib/Transforms/Utils/LoopRotationUtils.cpp | 939
-rw-r--r-- | llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 31
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 9
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1
12 files changed, 549 insertions, 553 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index aa030294..127a506 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -60,6 +60,58 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, return true; } +/// Let N = 2 * M. +/// Given an N-bit integer representing a pack of two M-bit integers, +/// we can select one of the packed integers by right-shifting by either +/// zero or M (which is the most straightforward to check if M is a power +/// of 2), and then isolating the lower M bits. In this case, we can +/// represent the shift as a select on whether the shr amount is nonzero. +static Value *simplifyShiftSelectingPackedElement(Instruction *I, + const APInt &DemandedMask, + InstCombinerImpl &IC, + unsigned Depth) { + assert(I->getOpcode() == Instruction::LShr && + "Only lshr instruction supported"); + + uint64_t ShlAmt; + Value *Upper, *Lower; + if (!match(I->getOperand(0), + m_OneUse(m_c_DisjointOr( + m_OneUse(m_Shl(m_Value(Upper), m_ConstantInt(ShlAmt))), + m_Value(Lower))))) + return nullptr; + + if (!isPowerOf2_64(ShlAmt)) + return nullptr; + + const uint64_t DemandedBitWidth = DemandedMask.getActiveBits(); + if (DemandedBitWidth > ShlAmt) + return nullptr; + + // Check that upper demanded bits are not lost from lshift. + if (Upper->getType()->getScalarSizeInBits() < ShlAmt + DemandedBitWidth) + return nullptr; + + KnownBits KnownLowerBits = IC.computeKnownBits(Lower, I, Depth); + if (!KnownLowerBits.getMaxValue().isIntN(ShlAmt)) + return nullptr; + + Value *ShrAmt = I->getOperand(1); + KnownBits KnownShrBits = IC.computeKnownBits(ShrAmt, I, Depth); + + // Verify that ShrAmt is either exactly ShlAmt (which is a power of 2) or + // zero. + if (~KnownShrBits.Zero != ShlAmt) + return nullptr; + + Value *ShrAmtZ = + IC.Builder.CreateICmpEQ(ShrAmt, Constant::getNullValue(ShrAmt->getType()), + ShrAmt->getName() + ".z"); + Value *Select = IC.Builder.CreateSelect(ShrAmtZ, Lower, Upper); + Select->takeName(I); + return Select; +} + /// Returns the bitwidth of the given scalar or pointer type. For vector types, /// returns the element type's bitwidth. static unsigned getBitWidth(Type *Ty, const DataLayout &DL) { @@ -798,9 +850,13 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, Known >>= ShiftAmt; if (ShiftAmt) Known.Zero.setHighBits(ShiftAmt); // high bits known zero. - } else { - llvm::computeKnownBits(I, Known, Q, Depth); + break; } + if (Value *V = + simplifyShiftSelectingPackedElement(I, DemandedMask, *this, Depth)) + return V; + + llvm::computeKnownBits(I, Known, Q, Depth); break; } case Instruction::AShr: { diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 048cdf4..d56a1af 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1970,12 +1970,6 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN, NewPhiValues.push_back(nullptr); OpsToMoveUseToIncomingBB.push_back(i); - // If the InVal is an invoke at the end of the pred block, then we can't - // insert a computation after it without breaking the edge. - if (isa<InvokeInst>(InVal)) - if (cast<Instruction>(InVal)->getParent() == InBB) - return nullptr; - // Do not push the operation across a loop backedge. 
This could result in // an infinite combine loop, and is generally non-profitable (especially // if the operation was originally outside the loop). diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 584cdad..e448230 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -1206,19 +1206,18 @@ private: // value for the new predecessor ClonedBB. The value will either be the same // value from BB or a cloned value. for (BasicBlock *Succ : BlocksToUpdate) { - for (auto II = Succ->begin(); PHINode *Phi = dyn_cast<PHINode>(II); - ++II) { - Value *Incoming = Phi->getIncomingValueForBlock(BB); + for (PHINode &Phi : Succ->phis()) { + Value *Incoming = Phi.getIncomingValueForBlock(BB); if (Incoming) { if (isa<Constant>(Incoming)) { - Phi->addIncoming(Incoming, ClonedBB); + Phi.addIncoming(Incoming, ClonedBB); continue; } Value *ClonedVal = VMap[Incoming]; if (ClonedVal) - Phi->addIncoming(ClonedVal, ClonedBB); + Phi.addIncoming(ClonedVal, ClonedBB); else - Phi->addIncoming(Incoming, ClonedBB); + Phi.addIncoming(Incoming, ClonedBB); } } } @@ -1313,27 +1312,19 @@ private: void cleanPhiNodes(BasicBlock *BB) { // If BB is no longer reachable, remove any remaining phi nodes if (pred_empty(BB)) { - std::vector<PHINode *> PhiToRemove; - for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) { - PhiToRemove.push_back(Phi); - } - for (PHINode *PN : PhiToRemove) { - PN->replaceAllUsesWith(PoisonValue::get(PN->getType())); - PN->eraseFromParent(); + for (PHINode &PN : make_early_inc_range(BB->phis())) { + PN.replaceAllUsesWith(PoisonValue::get(PN.getType())); + PN.eraseFromParent(); } return; } // Remove any incoming values that come from an invalid predecessor - for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) { - std::vector<BasicBlock *> BlocksToRemove; - for (BasicBlock *IncomingBB : Phi->blocks()) { - if (!isPredecessor(BB, IncomingBB)) - BlocksToRemove.push_back(IncomingBB); - } - for (BasicBlock *BB : BlocksToRemove) - Phi->removeIncomingValue(BB); - } + for (PHINode &Phi : BB->phis()) + Phi.removeIncomingValueIf([&](unsigned Index) { + BasicBlock *IncomingBB = Phi.getIncomingBlock(Index); + return !isPredecessor(BB, IncomingBB); + }); } /// Checks if BB was already cloned for a particular next state value. 
If it diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index b9b5b58..638952a 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -699,6 +699,7 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) { case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::PtrToInt: + case Instruction::PtrToAddr: case Instruction::IntToPtr: case Instruction::AddrSpaceCast: case Instruction::BitCast: diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index d6b7633..3c1a8ba 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -2066,6 +2066,7 @@ NewGVN::performSymbolicEvaluation(Instruction *I, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::PtrToInt: + case Instruction::PtrToAddr: case Instruction::IntToPtr: case Instruction::Select: case Instruction::ExtractElement: diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 60e5df0..7ffccf7 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -355,6 +355,8 @@ void SimplifyCFGPass::printPipeline( OS << (Options.ForwardSwitchCondToPhi ? "" : "no-") << "forward-switch-cond;"; OS << (Options.ConvertSwitchRangeToICmp ? "" : "no-") << "switch-range-to-icmp;"; + OS << (Options.ConvertSwitchToArithmetic ? "" : "no-") + << "switch-to-arithmetic;"; OS << (Options.ConvertSwitchToLookupTable ? "" : "no-") << "switch-to-lookup;"; OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;"; diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 7cc9ff8..0c8d6fa 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -45,12 +45,6 @@ STATISTIC(NumInstrsHoisted, "Number of instructions hoisted into loop preheader"); STATISTIC(NumInstrsDuplicated, "Number of instructions cloned into loop preheader"); -STATISTIC(NumRotated, "Number of loops rotated"); - -static cl::opt<bool> - MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden, - cl::desc("Allow loop rotation multiple times in order to reach " - "a better latch exit")); // Probability that a rotated loop has zero trip count / is never entered. static constexpr uint32_t ZeroTripCountWeights[] = {1, 127}; @@ -206,50 +200,6 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) { return false; } -// Check that latch exit is deoptimizing (which means - very unlikely to happen) -// and there is another exit from the loop which is non-deoptimizing. -// If we rotate latch to that exit our loop has a better chance of being fully -// canonical. -// -// It can give false positives in some rare cases. -static bool canRotateDeoptimizingLatchExit(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - assert(Latch && "need latch"); - BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator()); - // Need normal exiting latch. - if (!BI || !BI->isConditional()) - return false; - - BasicBlock *Exit = BI->getSuccessor(1); - if (L->contains(Exit)) - Exit = BI->getSuccessor(0); - - // Latch exit is non-deoptimizing, no need to rotate. - if (!Exit->getPostdominatingDeoptimizeCall()) - return false; - - SmallVector<BasicBlock *, 4> Exits; - L->getUniqueExitBlocks(Exits); - if (!Exits.empty()) { - // There is at least one non-deoptimizing exit. 
- // - // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact, - // as it can conservatively return false for deoptimizing exits with - // complex enough control flow down to deoptimize call. - // - // That means here we can report success for a case where - // all exits are deoptimizing but one of them has complex enough - // control flow (e.g. with loops). - // - // That should be a very rare case and false positives for this function - // have compile-time effect only. - return any_of(Exits, [](const BasicBlock *BB) { - return !BB->getPostdominatingDeoptimizeCall(); - }); - } - return false; -} - static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, bool HasConditionalPreHeader, bool SuccsSwapped) { @@ -387,506 +337,489 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. -/// -/// If -loop-rotate-multi is enabled we can do multiple rotations in one go -/// so to reach a suitable (non-deoptimizing) exit. bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; bool Rotated = false; - do { - BasicBlock *OrigHeader = L->getHeader(); - BasicBlock *OrigLatch = L->getLoopLatch(); - - BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); - if (!BI || BI->isUnconditional()) - return Rotated; - - // If the loop header is not one of the loop exiting blocks then - // either this loop is already rotated or it is not - // suitable for loop rotation transformations. - if (!L->isLoopExiting(OrigHeader)) + BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + + BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); + if (!BI || BI->isUnconditional()) + return Rotated; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return Rotated; + + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (!OrigLatch) + return Rotated; + + // Rotate if the loop latch was just simplified. Or if it makes the loop exit + // count computable. Or if we think it will be profitable. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && + !profitableToRotateLoopExitingLatch(L)) + return Rotated; + + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. + { + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO); + if (Metrics.notDuplicatable) { + LLVM_DEBUG( + dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" + << " instructions: "; + L->dump()); return Rotated; - - // If the loop latch already contains a branch that leaves the loop then the - // loop is already rotated. 
- if (!OrigLatch) + } + if (Metrics.Convergence != ConvergenceKind::None) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " + "instructions: "; + L->dump()); return Rotated; - - // Rotate if either the loop latch does *not* exit the loop, or if the loop - // latch was just simplified. Or if we think it will be profitable. - if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && - !profitableToRotateLoopExitingLatch(L) && - !canRotateDeoptimizingLatchExit(L)) + } + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" + " with invalid cost: "; + L->dump()); return Rotated; - - // Check size of original header and reject loop if it is very big or we can't - // duplicate blocks inside it. - { - SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - - CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO); - if (Metrics.notDuplicatable) { - LLVM_DEBUG( - dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" - << " instructions: "; - L->dump()); - return Rotated; - } - if (Metrics.Convergence != ConvergenceKind::None) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " - "instructions: "; - L->dump()); - return Rotated; - } - if (!Metrics.NumInsts.isValid()) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" - " with invalid cost: "; - L->dump()); - return Rotated; - } - if (Metrics.NumInsts > MaxHeaderSize) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " - << Metrics.NumInsts - << " instructions, which is more than the threshold (" - << MaxHeaderSize << " instructions): "; - L->dump()); - ++NumNotRotatedDueToHeaderSize; - return Rotated; - } - - // When preparing for LTO, avoid rotating loops with calls that could be - // inlined during the LTO stage. - if (PrepareForLTO && Metrics.NumInlineCandidates > 0) - return Rotated; } - - // Now, this loop is suitable for rotation. - BasicBlock *OrigPreheader = L->getLoopPreheader(); - - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!OrigPreheader || !L->hasDedicatedExits()) + if (Metrics.NumInsts > MaxHeaderSize) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " + << Metrics.NumInsts + << " instructions, which is more than the threshold (" + << MaxHeaderSize << " instructions): "; + L->dump()); + ++NumNotRotatedDueToHeaderSize; return Rotated; - - // Anything ScalarEvolution may know about this loop or the PHI nodes - // in its header will soon be invalidated. We should also invalidate - // all outer loops because insertion and deletion of blocks that happens - // during the rotation may violate invariants related to backedge taken - // infos in them. - if (SE) { - SE->forgetTopmostLoop(L); - // We may hoist some instructions out of loop. In case if they were cached - // as "loop variant" or "loop computable", these caches must be dropped. - // We also may fold basic blocks, so cached block dispositions also need - // to be dropped. - SE->forgetBlockAndLoopDispositions(); } - LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - - // Find new Loop header. NewHeader is a Header's one and only successor - // that is inside loop. Header's other successor is outside the - // loop. 
Otherwise loop is not suitable for rotation. - BasicBlock *Exit = BI->getSuccessor(0); - BasicBlock *NewHeader = BI->getSuccessor(1); - bool BISuccsSwapped = L->contains(Exit); - if (BISuccsSwapped) - std::swap(Exit, NewHeader); - assert(NewHeader && "Unable to determine new loop header"); - assert(L->contains(NewHeader) && !L->contains(Exit) && - "Unable to determine loop header and exit blocks"); - - // This code assumes that the new header has exactly one predecessor. - // Remove any single-entry PHI nodes in it. - assert(NewHeader->getSinglePredecessor() && - "New header doesn't have one pred!"); - FoldSingleEntryPHINodes(NewHeader); - - // Begin by walking OrigHeader and populating ValueMap with an entry for - // each Instruction. - BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); - ValueToValueMapTy ValueMap, ValueMapMSSA; - - // For PHI nodes, the value available in OldPreHeader is just the - // incoming value from OldPreHeader. - for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - InsertNewValueIntoMap(ValueMap, PN, - PN->getIncomingValueForBlock(OrigPreheader)); - - // For the rest of the instructions, either hoist to the OrigPreheader if - // possible or create a clone in the OldPreHeader if not. - Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); - - // Record all debug records preceding LoopEntryBranch to avoid - // duplication. - using DbgHash = - std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>; - auto makeHash = [](const DbgVariableRecord *D) -> DbgHash { - auto VarLocOps = D->location_ops(); - return {{hash_combine_range(VarLocOps), D->getVariable()}, - D->getExpression()}; - }; - - SmallDenseSet<DbgHash, 8> DbgRecords; - // Build DbgVariableRecord hashes for DbgVariableRecords attached to the - // terminator. - for (const DbgVariableRecord &DVR : - filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange())) - DbgRecords.insert(makeHash(&DVR)); - - // Remember the local noalias scope declarations in the header. After the - // rotation, they must be duplicated and the scope must be cloned. This - // avoids unwanted interaction across iterations. - SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions; - for (Instruction &I : *OrigHeader) - if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) - NoAliasDeclInstructions.push_back(Decl); - - Module *M = OrigHeader->getModule(); - - // Track the next DbgRecord to clone. If we have a sequence where an - // instruction is hoisted instead of being cloned: - // DbgRecord blah - // %foo = add i32 0, 0 - // DbgRecord xyzzy - // %bar = call i32 @foobar() - // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once - // attached to %foo, then when %foo his hoisted it will "fall down" onto the - // function call: - // DbgRecord blah - // DbgRecord xyzzy - // %bar = call i32 @foobar() - // causing it to appear attached to the call too. - // - // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from - // here" position to account for this behaviour. We point it at any - // DbgRecords on the next instruction, here labelled xyzzy, before we hoist - // %foo. Later, we only only clone DbgRecords from that position (xyzzy) - // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as - // a range because it gives us a natural way of testing whether - // there were DbgRecords on the next instruction before we hoisted things). - iterator_range<DbgRecord::self_iterator> NextDbgInsts = - (I != E) ? 
I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange(); - - while (I != E) { - Instruction *Inst = &*I++; - - // If the instruction's operands are invariant and it doesn't read or write - // memory, then it is safe to hoist. Doing this doesn't change the order of - // execution in the preheader, but does prevent the instruction from - // executing in each iteration of the loop. This means it is safe to hoist - // something that might trap, but isn't safe to hoist something that reads - // memory (without proving that the loop doesn't write). - if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && - !Inst->mayWriteToMemory() && !Inst->isTerminator() && - !isa<AllocaInst>(Inst) && - // It is not safe to hoist the value of these instructions in - // coroutines, as the addresses of otherwise eligible variables (e.g. - // thread-local variables and errno) may change if the coroutine is - // resumed in a different thread.Therefore, we disable this - // optimization for correctness. However, this may block other correct - // optimizations. - // FIXME: This should be reverted once we have a better model for - // memory access in coroutines. - !Inst->getFunction()->isPresplitCoroutine()) { - - if (!NextDbgInsts.empty()) { - auto DbgValueRange = - LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); - RemapDbgRecordRange(M, DbgValueRange, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - // Erase anything we've seen before. - for (DbgVariableRecord &DVR : - make_early_inc_range(filterDbgVars(DbgValueRange))) - if (DbgRecords.count(makeHash(&DVR))) - DVR.eraseFromParent(); - } - - NextDbgInsts = I->getDbgRecordRange(); - - Inst->moveBefore(LoopEntryBranch->getIterator()); + // When preparing for LTO, avoid rotating loops with calls that could be + // inlined during the LTO stage. + if (PrepareForLTO && Metrics.NumInlineCandidates > 0) + return Rotated; + } - ++NumInstrsHoisted; - continue; - } + // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!OrigPreheader || !L->hasDedicatedExits()) + return Rotated; + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. We should also invalidate + // all outer loops because insertion and deletion of blocks that happens + // during the rotation may violate invariants related to backedge taken + // infos in them. + if (SE) { + SE->forgetTopmostLoop(L); + // We may hoist some instructions out of loop. In case if they were cached + // as "loop variant" or "loop computable", these caches must be dropped. + // We also may fold basic blocks, so cached block dispositions also need + // to be dropped. + SE->forgetBlockAndLoopDispositions(); + } - // Otherwise, create a duplicate of the instruction. - Instruction *C = Inst->clone(); - if (const DebugLoc &DL = C->getDebugLoc()) - mapAtomInstance(DL, ValueMap); + LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); - C->insertBefore(LoopEntryBranch->getIterator()); + // Find new Loop header. NewHeader is a Header's one and only successor + // that is inside loop. Header's other successor is outside the + // loop. Otherwise loop is not suitable for rotation. 
+ BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); + bool BISuccsSwapped = L->contains(Exit); + if (BISuccsSwapped) + std::swap(Exit, NewHeader); + assert(NewHeader && "Unable to determine new loop header"); + assert(L->contains(NewHeader) && !L->contains(Exit) && + "Unable to determine loop header and exit blocks"); + + // This code assumes that the new header has exactly one predecessor. + // Remove any single-entry PHI nodes in it. + assert(NewHeader->getSinglePredecessor() && + "New header doesn't have one pred!"); + FoldSingleEntryPHINodes(NewHeader); + + // Begin by walking OrigHeader and populating ValueMap with an entry for + // each Instruction. + BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); + ValueToValueMapTy ValueMap, ValueMapMSSA; + + // For PHI nodes, the value available in OldPreHeader is just the + // incoming value from OldPreHeader. + for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) + InsertNewValueIntoMap(ValueMap, PN, + PN->getIncomingValueForBlock(OrigPreheader)); + + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); + + // Record all debug records preceding LoopEntryBranch to avoid + // duplication. + using DbgHash = + std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>; + auto makeHash = [](const DbgVariableRecord *D) -> DbgHash { + auto VarLocOps = D->location_ops(); + return {{hash_combine_range(VarLocOps), D->getVariable()}, + D->getExpression()}; + }; - ++NumInstrsDuplicated; + SmallDenseSet<DbgHash, 8> DbgRecords; + // Build DbgVariableRecord hashes for DbgVariableRecords attached to the + // terminator. + for (const DbgVariableRecord &DVR : + filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange())) + DbgRecords.insert(makeHash(&DVR)); + + // Remember the local noalias scope declarations in the header. After the + // rotation, they must be duplicated and the scope must be cloned. This + // avoids unwanted interaction across iterations. + SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions; + for (Instruction &I : *OrigHeader) + if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) + NoAliasDeclInstructions.push_back(Decl); + + Module *M = OrigHeader->getModule(); + + // Track the next DbgRecord to clone. If we have a sequence where an + // instruction is hoisted instead of being cloned: + // DbgRecord blah + // %foo = add i32 0, 0 + // DbgRecord xyzzy + // %bar = call i32 @foobar() + // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once + // attached to %foo, then when %foo his hoisted it will "fall down" onto the + // function call: + // DbgRecord blah + // DbgRecord xyzzy + // %bar = call i32 @foobar() + // causing it to appear attached to the call too. + // + // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from + // here" position to account for this behaviour. We point it at any + // DbgRecords on the next instruction, here labelled xyzzy, before we hoist + // %foo. Later, we only only clone DbgRecords from that position (xyzzy) + // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as + // a range because it gives us a natural way of testing whether + // there were DbgRecords on the next instruction before we hoisted things). + iterator_range<DbgRecord::self_iterator> NextDbgInsts = + (I != E) ? 
I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange(); + + while (I != E) { + Instruction *Inst = &*I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop. This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). + if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && + !Inst->mayWriteToMemory() && !Inst->isTerminator() && + !isa<AllocaInst>(Inst) && + // It is not safe to hoist the value of these instructions in + // coroutines, as the addresses of otherwise eligible variables (e.g. + // thread-local variables and errno) may change if the coroutine is + // resumed in a different thread.Therefore, we disable this + // optimization for correctness. However, this may block other correct + // optimizations. + // FIXME: This should be reverted once we have a better model for + // memory access in coroutines. + !Inst->getFunction()->isPresplitCoroutine()) { if (!NextDbgInsts.empty()) { - auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); - RemapDbgRecordRange(M, Range, ValueMap, + auto DbgValueRange = + LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); + RemapDbgRecordRange(M, DbgValueRange, ValueMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - NextDbgInsts = DbgMarker::getEmptyDbgRecordRange(); // Erase anything we've seen before. for (DbgVariableRecord &DVR : - make_early_inc_range(filterDbgVars(Range))) + make_early_inc_range(filterDbgVars(DbgValueRange))) if (DbgRecords.count(makeHash(&DVR))) DVR.eraseFromParent(); } - // Eagerly remap the operands of the instruction. - RemapInstruction(C, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - - // With the operands remapped, see if the instruction constant folds or is - // otherwise simplifyable. This commonly occurs because the entry from PHI - // nodes allows icmps and other instructions to fold. - Value *V = simplifyInstruction(C, SQ); - if (V && LI->replacementPreservesLCSSAForm(C, V)) { - // If so, then delete the temporary instruction and stick the folded value - // in the map. - InsertNewValueIntoMap(ValueMap, Inst, V); - if (!C->mayHaveSideEffects()) { - C->eraseFromParent(); - C = nullptr; - } - } else { - InsertNewValueIntoMap(ValueMap, Inst, C); - } - if (C) { - // Otherwise, stick the new instruction into the new block! - C->setName(Inst->getName()); - - if (auto *II = dyn_cast<AssumeInst>(C)) - AC->registerAssumption(II); - // MemorySSA cares whether the cloned instruction was inserted or not, and - // not whether it can be remapped to a simplified value. - if (MSSAU) - InsertNewValueIntoMap(ValueMapMSSA, Inst, C); - } - } + NextDbgInsts = I->getDbgRecordRange(); - if (!NoAliasDeclInstructions.empty()) { - // There are noalias scope declarations: - // (general): - // Original: OrigPre { OrigHeader NewHeader ... Latch } - // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } - // - // with D: llvm.experimental.noalias.scope.decl, - // U: !noalias or !alias.scope depending on D - // ... { D U1 U2 } can transform into: - // (0) : ... { D U1 U2 } // no relevant rotation for this part - // (1) : ... D' { U1 U2 D } // D is part of OrigHeader - // (2) : ... 
D' U1' { U2 D U1 } // D, U1 are part of OrigHeader - // - // We now want to transform: - // (1) -> : ... D' { D U1 U2 D'' } - // (2) -> : ... D' U1' { D U2 D'' U1'' } - // D: original llvm.experimental.noalias.scope.decl - // D', U1': duplicate with replaced scopes - // D'', U1'': different duplicate with replaced scopes - // This ensures a safe fallback to 'may_alias' introduced by the rotate, - // as U1'' and U1' scopes will not be compatible wrt to the local restrict - - // Clone the llvm.experimental.noalias.decl again for the NewHeader. - BasicBlock::iterator NewHeaderInsertionPoint = - NewHeader->getFirstNonPHIIt(); - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { - LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" - << *NAD << "\n"); - Instruction *NewNAD = NAD->clone(); - NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); - } + Inst->moveBefore(LoopEntryBranch->getIterator()); - // Scopes must now be duplicated, once for OrigHeader and once for - // OrigPreHeader'. - { - auto &Context = NewHeader->getContext(); - - SmallVector<MDNode *, 8> NoAliasDeclScopes; - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) - NoAliasDeclScopes.push_back(NAD->getScopeList()); - - LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, - "h.rot"); - LLVM_DEBUG(OrigHeader->dump()); - - // Keep the compile time impact low by only adapting the inserted block - // of instructions in the OrigPreHeader. This might result in slightly - // more aliasing between these instructions and those that were already - // present, but it will be much faster when the original PreHeader is - // large. - LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); - auto *FirstDecl = - cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); - auto *LastInst = &OrigPreheader->back(); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, - Context, "pre.rot"); - LLVM_DEBUG(OrigPreheader->dump()); - - LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); - LLVM_DEBUG(NewHeader->dump()); - } + ++NumInstrsHoisted; + continue; } - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. - for (BasicBlock *SuccBB : successors(OrigHeader)) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. - LoopEntryBranch->eraseFromParent(); - OrigPreheader->flushTerminatorDbgRecords(); - - // Update MemorySSA before the rewrite call below changes the 1:1 - // instruction:cloned_instruction_or_value mapping. - if (MSSAU) { - InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); - MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, - ValueMapMSSA); - } + // Otherwise, create a duplicate of the instruction. 
+ Instruction *C = Inst->clone(); + if (const DebugLoc &DL = C->getDebugLoc()) + mapAtomInstance(DL, ValueMap); - SmallVector<PHINode*, 2> InsertedPHIs; - // If there were any uses of instructions in the duplicated block outside the - // loop, update them, inserting PHI nodes as required - RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, - &InsertedPHIs); - - // Attach debug records to the new phis if that phi uses a value that - // previously had debug metadata attached. This keeps the debug info - // up-to-date in the loop body. - if (!InsertedPHIs.empty()) - insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); - - // NewHeader is now the header of the loop. - L->moveToHeader(NewHeader); - assert(L->getHeader() == NewHeader && "Latch block is our new header"); - - // Inform DT about changes to the CFG. - if (DT) { - // The OrigPreheader branches to the NewHeader and Exit now. Then, inform - // the DT about the removed edge to the OrigHeader (that got removed). - SmallVector<DominatorTree::UpdateType, 3> Updates = { - {DominatorTree::Insert, OrigPreheader, Exit}, - {DominatorTree::Insert, OrigPreheader, NewHeader}, - {DominatorTree::Delete, OrigPreheader, OrigHeader}}; - - if (MSSAU) { - MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true); - if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - } else { - DT->applyUpdates(Updates); - } - } + C->insertBefore(LoopEntryBranch->getIterator()); - // At this point, we've finished our major CFG changes. As part of cloning - // the loop into the preheader we've simplified instructions and the - // duplicated conditional branch may now be branching on a constant. If it is - // branching on a constant and if that constant means that we enter the loop, - // then we fold away the cond branch to an uncond branch. This simplifies the - // loop in cases important for nested loops, and it also means we don't have - // to split as many edges. - BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); - assert(PHBI->isConditional() && "Should be clone of BI condbr!"); - const Value *Cond = PHBI->getCondition(); - const bool HasConditionalPreHeader = - !isa<ConstantInt>(Cond) || - PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader; - - updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); + ++NumInstrsDuplicated; - if (HasConditionalPreHeader) { - // The conditional branch can't be folded, handle the general case. - // Split edges as necessary to preserve LoopSimplify form. - - // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. - // Split the edge to form a real preheader. - BasicBlock *NewPH = SplitCriticalEdge( - OrigPreheader, NewHeader, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - NewPH->setName(NewHeader->getName() + ".lr.ph"); - - // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. Note that Exit could be an exit block for multiple - // nested loops, causing both of the edges to now be critical and need to - // be split. - SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit)); - bool SplitLatchEdge = false; - for (BasicBlock *ExitPred : ExitPreds) { - // We only need to split loop exit edges. 
- Loop *PredLoop = LI->getLoopFor(ExitPred); - if (!PredLoop || PredLoop->contains(Exit) || - isa<IndirectBrInst>(ExitPred->getTerminator())) - continue; - SplitLatchEdge |= L->getLoopLatch() == ExitPred; - BasicBlock *ExitSplit = SplitCriticalEdge( - ExitPred, Exit, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - ExitSplit->moveBefore(Exit); + if (!NextDbgInsts.empty()) { + auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); + RemapDbgRecordRange(M, Range, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + NextDbgInsts = DbgMarker::getEmptyDbgRecordRange(); + // Erase anything we've seen before. + for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(Range))) + if (DbgRecords.count(makeHash(&DVR))) + DVR.eraseFromParent(); + } + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifyable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = simplifyInstruction(C, SQ); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + InsertNewValueIntoMap(ValueMap, Inst, V); + if (!C->mayHaveSideEffects()) { + C->eraseFromParent(); + C = nullptr; } - assert(SplitLatchEdge && - "Despite splitting all preds, failed to split latch exit?"); - (void)SplitLatchEdge; } else { - // We can fold the conditional branch in the preheader, this makes things - // simpler. The first step is to remove the extra edge to the Exit block. - Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); - BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator()); - NewBI->setDebugLoc(PHBI->getDebugLoc()); - PHBI->eraseFromParent(); + InsertNewValueIntoMap(ValueMap, Inst, C); + } + if (C) { + // Otherwise, stick the new instruction into the new block! + C->setName(Inst->getName()); + + if (auto *II = dyn_cast<AssumeInst>(C)) + AC->registerAssumption(II); + // MemorySSA cares whether the cloned instruction was inserted or not, and + // not whether it can be remapped to a simplified value. + if (MSSAU) + InsertNewValueIntoMap(ValueMapMSSA, Inst, C); + } + } - // With our CFG finalized, update DomTree if it is available. - if (DT) DT->deleteEdge(OrigPreheader, Exit); + if (!NoAliasDeclInstructions.empty()) { + // There are noalias scope declarations: + // (general): + // Original: OrigPre { OrigHeader NewHeader ... Latch } + // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } + // + // with D: llvm.experimental.noalias.scope.decl, + // U: !noalias or !alias.scope depending on D + // ... { D U1 U2 } can transform into: + // (0) : ... { D U1 U2 } // no relevant rotation for this part + // (1) : ... D' { U1 U2 D } // D is part of OrigHeader + // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader + // + // We now want to transform: + // (1) -> : ... D' { D U1 U2 D'' } + // (2) -> : ... D' U1' { D U2 D'' U1'' } + // D: original llvm.experimental.noalias.scope.decl + // D', U1': duplicate with replaced scopes + // D'', U1'': different duplicate with replaced scopes + // This ensures a safe fallback to 'may_alias' introduced by the rotate, + // as U1'' and U1' scopes will not be compatible wrt to the local restrict + + // Clone the llvm.experimental.noalias.decl again for the NewHeader. 
+ BasicBlock::iterator NewHeaderInsertionPoint = + NewHeader->getFirstNonPHIIt(); + for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { + LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" + << *NAD << "\n"); + Instruction *NewNAD = NAD->clone(); + NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); + } - // Update MSSA too, if available. - if (MSSAU) - MSSAU->removeEdge(OrigPreheader, Exit); + // Scopes must now be duplicated, once for OrigHeader and once for + // OrigPreHeader'. + { + auto &Context = NewHeader->getContext(); + + SmallVector<MDNode *, 8> NoAliasDeclScopes; + for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) + NoAliasDeclScopes.push_back(NAD->getScopeList()); + + LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); + cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, + "h.rot"); + LLVM_DEBUG(OrigHeader->dump()); + + // Keep the compile time impact low by only adapting the inserted block + // of instructions in the OrigPreHeader. This might result in slightly + // more aliasing between these instructions and those that were already + // present, but it will be much faster when the original PreHeader is + // large. + LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); + auto *FirstDecl = + cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); + auto *LastInst = &OrigPreheader->back(); + cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, + Context, "pre.rot"); + LLVM_DEBUG(OrigPreheader->dump()); + + LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); + LLVM_DEBUG(NewHeader->dump()); } + } - assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); - assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + // Along with all the other instructions, we just cloned OrigHeader's + // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's + // successors by duplicating their incoming values for OrigHeader. + for (BasicBlock *SuccBB : successors(OrigHeader)) + for (BasicBlock::iterator BI = SuccBB->begin(); + PHINode *PN = dyn_cast<PHINode>(BI); ++BI) + PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); + + // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove + // OrigPreHeader's old terminator (the original branch into the loop), and + // remove the corresponding incoming values from the PHI nodes in OrigHeader. + LoopEntryBranch->eraseFromParent(); + OrigPreheader->flushTerminatorDbgRecords(); + + // Update MemorySSA before the rewrite call below changes the 1:1 + // instruction:cloned_instruction_or_value mapping. + if (MSSAU) { + InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); + MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, + ValueMapMSSA); + } - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + SmallVector<PHINode *, 2> InsertedPHIs; + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, + &InsertedPHIs); + + // Attach debug records to the new phis if that phi uses a value that + // previously had debug metadata attached. This keeps the debug info + // up-to-date in the loop body. + if (!InsertedPHIs.empty()) + insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); + + // NewHeader is now the header of the loop. 
+ L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); + + // Inform DT about changes to the CFG. + if (DT) { + // The OrigPreheader branches to the NewHeader and Exit now. Then, inform + // the DT about the removed edge to the OrigHeader (that got removed). + SmallVector<DominatorTree::UpdateType, 3> Updates = { + {DominatorTree::Insert, OrigPreheader, Exit}, + {DominatorTree::Insert, OrigPreheader, NewHeader}, + {DominatorTree::Delete, OrigPreheader, OrigHeader}}; - // Now that the CFG and DomTree are in a consistent state again, try to merge - // the OrigHeader block into OrigLatch. This will succeed if they are - // connected by an unconditional branch. This is just a cleanup so the - // emitted code isn't too gross in this common case. - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - BasicBlock *PredBB = OrigHeader->getUniquePredecessor(); - bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); - if (DidMerge) - RemoveRedundantDbgInstrs(PredBB); + if (MSSAU) { + MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true); + if (VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + } else { + DT->applyUpdates(Updates); + } + } - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + // At this point, we've finished our major CFG changes. As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. + BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + const Value *Cond = PHBI->getCondition(); + const bool HasConditionalPreHeader = + !isa<ConstantInt>(Cond) || + PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader; + + updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); - LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + if (HasConditionalPreHeader) { + // The conditional branch can't be folded, handle the general case. + // Split edges as necessary to preserve LoopSimplify form. + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. + // Split the edge to form a real preheader. + BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. Note that Exit could be an exit block for multiple + // nested loops, causing both of the edges to now be critical and need to + // be split. + SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit)); + bool SplitLatchEdge = false; + for (BasicBlock *ExitPred : ExitPreds) { + // We only need to split loop exit edges. 
+ Loop *PredLoop = LI->getLoopFor(ExitPred); + if (!PredLoop || PredLoop->contains(Exit) || + isa<IndirectBrInst>(ExitPred->getTerminator())) + continue; + SplitLatchEdge |= L->getLoopLatch() == ExitPred; + BasicBlock *ExitSplit = SplitCriticalEdge( + ExitPred, Exit, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + ExitSplit->moveBefore(Exit); + } + assert(SplitLatchEdge && + "Despite splitting all preds, failed to split latch exit?"); + (void)SplitLatchEdge; + } else { + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. + Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator()); + NewBI->setDebugLoc(PHBI->getDebugLoc()); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. + if (DT) + DT->deleteEdge(OrigPreheader, Exit); + + // Update MSSA too, if available. + if (MSSAU) + MSSAU->removeEdge(OrigPreheader, Exit); + } - ++NumRotated; + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); - Rotated = true; - SimplifiedLatch = false; + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + BasicBlock *PredBB = OrigHeader->getUniquePredecessor(); + bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); + if (DidMerge) + RemoveRedundantDbgInstrs(PredBB); - // Check that new latch is a deoptimizing exit and then repeat rotation if possible. - // Deoptimizing latch exit is not a generally typical case, so we just loop over. - // TODO: if it becomes a performance bottleneck extend rotation algorithm - // to handle multiple rotations in one go. - } while (MultiRotate && canRotateDeoptimizingLatchExit(L)); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); return true; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index b8cfe3a..155fcc5 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6642,6 +6642,9 @@ public: /// Return true if the replacement is a lookup table. bool isLookupTable(); + /// Return true if the replacement is a bit map. + bool isBitMap(); + private: // Depending on the switch, there are different alternatives. enum { @@ -6932,6 +6935,8 @@ Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; } bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; } +bool SwitchReplacement::isBitMap() { return Kind == BitMapKind; } + static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) { // 40% is the default density for building a jump table in optsize/minsize // mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this @@ -7097,7 +7102,8 @@ static void reuseTableCompare( /// lookup tables. 
static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, DomTreeUpdater *DTU, const DataLayout &DL, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + bool ConvertSwitchToLookupTable) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); BasicBlock *BB = SI->getParent(); @@ -7262,6 +7268,8 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, bool AnyLookupTables = any_of( PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); }); + bool AnyBitMaps = any_of(PhiToReplacementMap, + [](auto &KV) { return KV.second.isBitMap(); }); // A few conditions prevent the generation of lookup tables: // 1. The target does not support lookup tables. @@ -7274,6 +7282,12 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, Fn->getFnAttribute("no-jump-tables").getValueAsBool())) return false; + // In the early optimization pipeline, disable formation of lookup tables, + // bit maps and mask checks, as they may inhibit further optimization. + if (!ConvertSwitchToLookupTable && + (AnyLookupTables || AnyBitMaps || NeedMask)) + return false; + Builder.SetInsertPoint(SI); // TableIndex is the switch condition - TableIndexOffset if we don't // use the condition directly @@ -7929,14 +7943,13 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (Options.ForwardSwitchCondToPhi && forwardSwitchConditionToPHI(SI)) return requestResimplify(); - // The conversion from switch to lookup tables results in difficult-to-analyze - // code and makes pruning branches much harder. This is a problem if the - // switch expression itself can still be restricted as a result of inlining or - // CVP. Therefore, only apply this transformation during late stages of the - // optimisation pipeline. - if (Options.ConvertSwitchToLookupTable && - simplifySwitchLookup(SI, Builder, DTU, DL, TTI)) - return requestResimplify(); + // The conversion of switches to arithmetic or lookup table is disabled in + // the early optimization pipeline, as it may lose information or make the + // resulting code harder to analyze. 
+ if (Options.ConvertSwitchToArithmetic || Options.ConvertSwitchToLookupTable) + if (simplifySwitchLookup(SI, Builder, DTU, DL, TTI, + Options.ConvertSwitchToLookupTable)) + return requestResimplify(); if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI)) return requestResimplify(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index cee08ef..3f16b03 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4264,8 +4264,8 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { if (!Legal->isReductionVariable(&Phi)) return Legal->isFixedOrderRecurrence(&Phi); - RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); - return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum; + return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( + Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind()); })) return false; @@ -7282,8 +7282,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( if (!Exit->hasPredecessors()) continue; for (VPRecipeBase &PhiR : Exit->phis()) - SE.forgetLcssaPhiWithNewPredecessor( - OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction())); + SE.forgetLcssaPhiWithNewPredecessor(OrigLoop, + &cast<VPIRPhi>(PhiR).getIRPhi()); } // Forget the original loop and block dispositions. SE.forgetLoop(OrigLoop); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 91c3d42..cfa8d27 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10621,7 +10621,8 @@ class InstructionsCompatibilityAnalysis { /// elements. static bool isSupportedOpcode(const unsigned Opcode) { return Opcode == Instruction::Add || Opcode == Instruction::LShr || - Opcode == Instruction::Shl; + Opcode == Instruction::Shl || Opcode == Instruction::SDiv || + Opcode == Instruction::UDiv; } /// Identifies the best candidate value, which represents main opcode @@ -10939,6 +10940,8 @@ public: case Instruction::Add: case Instruction::LShr: case Instruction::Shl: + case Instruction::SDiv: + case Instruction::UDiv: VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); break; default: @@ -22066,8 +22069,10 @@ bool BoUpSLP::collectValuesToDemote( auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) { assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); return all_of(E.Scalars, [&](Value *V) { - auto *I = cast<Instruction>(V); APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); + if (E.hasCopyableElements() && E.isCopyableElement(V)) + return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)); + auto *I = cast<Instruction>(V); return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) && MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)); }); diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index b36298f..81deba2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -840,8 +840,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { // TODO: Support multiple MaxNum/MinNum reductions and other reductions. 
if (RedPhiR) return false; - if (Cur->getRecurrenceKind() != RecurKind::FMaxNum && - Cur->getRecurrenceKind() != RecurKind::FMinNum) { + if (!RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( + Cur->getRecurrenceKind())) { HasUnsupportedPhi = true; continue; } @@ -861,10 +861,9 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { if (!MinMaxOp) return false; - RecurKind RedPhiRK = RedPhiR->getRecurrenceKind(); - assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) && + assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( + RedPhiR->getRecurrenceKind()) && "unsupported reduction"); - (void)RedPhiRK; /// Check if the vector loop of \p Plan can early exit and restart /// execution of last vector iteration in the scalar loop. This requires all diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 94e2628..3a9770c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1230,6 +1230,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractLane: case VPInstruction::ExtractLastElement: case VPInstruction::ExtractPenultimateElement: + case VPInstruction::ActiveLaneMask: case VPInstruction::FirstActiveLane: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: |
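
For readers skimming the InstCombineSimplifyDemanded.cpp change above: the new simplifyShiftSelectingPackedElement fold targets an lshr that merely selects one half of a packed pair of integers. A minimal hand-written C++ sketch of the source-level pattern (the names select_packed, upper, lower and amt are illustrative and do not appear in the patch):

    #include <cstdint>

    // Two 32-bit elements packed into a 64-bit value; the 'or' is disjoint
    // because lower occupies only the low 32 bits.
    uint32_t select_packed(uint32_t upper, uint32_t lower, unsigned amt) {
      uint64_t pack = ((uint64_t)upper << 32) | lower;
      // amt is assumed to be either 0 or 32, and only the low 32 bits of the
      // shifted value are used.
      return (uint32_t)(pack >> amt);
    }

    // The fold rewrites the lshr into a select between the two halves,
    // roughly: return amt == 0 ? lower : upper;

At the IR level the fold applies whenever the shl amount is a power of two, the low element is known not to overlap the shifted high element, and the shift amount is provably either zero or that same amount, as checked in the new function.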
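
Similarly, the SimplifyCFG.cpp and SimplifyCFGPass.cpp changes above split switch conversion into two knobs: the new switch-to-arithmetic option (now printed by printPipeline) allows table-free replacements, while lookup tables, bit maps and mask checks stay gated behind switch-to-lookup for the late pipeline. As an illustration only (this snippet is not from the patch), a switch whose results are a linear function of the condition is the kind of case that can be folded to arithmetic without a table:

    // The results form the linear sequence 2*x + 3 for x = 0..3.
    int f(unsigned x) {
      switch (x) {
      case 0: return 3;
      case 1: return 5;
      case 2: return 7;
      case 3: return 9;
      default: return 0;
      }
    }
    // SimplifyCFG can lower the switch to roughly: x < 4 ? 2 * x + 3 : 0

Whether a given pipeline position actually enables switch-to-arithmetic is decided by the pass builder, which is outside the files shown in this diff.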