Diffstat (limited to 'llvm/lib/Transforms/Utils')
-rw-r--r--  llvm/lib/Transforms/Utils/LoopRotationUtils.cpp | 939
-rw-r--r--  llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp |  92
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp       |  31
3 files changed, 517 insertions, 545 deletions
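For context on the LoopRotationUtils.cpp diff below: it removes the hidden -loop-rotate-multi mode together with its canRotateDeoptimizingLatchExit helper and the NumRotated statistic, flattening rotateLoop's do-while retry loop into a single straight-line rotation. As background, loop rotation clones the header's exit test into the preheader so the loop becomes bottom-tested. A minimal C++ sketch of the shape change, illustrative only (the pass operates on LLVM IR, and work() here is a placeholder):

#include <cstdio>

void work(int i) { std::printf("%d\n", i); }

// Before rotation: a top-tested loop. The header runs the exit test on
// every iteration and is also the entry point from outside the loop.
void beforeRotation(int n) {
  for (int i = 0; i < n; ++i)
    work(i);
}

// After rotation: the header test is cloned into the preheader as a
// zero-trip-count guard, and the latch performs the exit test, yielding a
// bottom-tested (do-while) loop with an exiting latch.
void afterRotation(int n) {
  int i = 0;
  if (i < n) {       // cloned header test: guards the never-entered case
    do {
      work(i);
      ++i;
    } while (i < n); // exiting latch: the branch updateBranchWeights annotates
  }
}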
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 7cc9ff8..0c8d6fa 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -45,12 +45,6 @@ STATISTIC(NumInstrsHoisted, "Number of instructions hoisted into loop preheader"); STATISTIC(NumInstrsDuplicated, "Number of instructions cloned into loop preheader"); -STATISTIC(NumRotated, "Number of loops rotated"); - -static cl::opt<bool> - MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden, - cl::desc("Allow loop rotation multiple times in order to reach " - "a better latch exit")); // Probability that a rotated loop has zero trip count / is never entered. static constexpr uint32_t ZeroTripCountWeights[] = {1, 127}; @@ -206,50 +200,6 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) { return false; } -// Check that latch exit is deoptimizing (which means - very unlikely to happen) -// and there is another exit from the loop which is non-deoptimizing. -// If we rotate latch to that exit our loop has a better chance of being fully -// canonical. -// -// It can give false positives in some rare cases. -static bool canRotateDeoptimizingLatchExit(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - assert(Latch && "need latch"); - BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator()); - // Need normal exiting latch. - if (!BI || !BI->isConditional()) - return false; - - BasicBlock *Exit = BI->getSuccessor(1); - if (L->contains(Exit)) - Exit = BI->getSuccessor(0); - - // Latch exit is non-deoptimizing, no need to rotate. - if (!Exit->getPostdominatingDeoptimizeCall()) - return false; - - SmallVector<BasicBlock *, 4> Exits; - L->getUniqueExitBlocks(Exits); - if (!Exits.empty()) { - // There is at least one non-deoptimizing exit. - // - // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact, - // as it can conservatively return false for deoptimizing exits with - // complex enough control flow down to deoptimize call. - // - // That means here we can report success for a case where - // all exits are deoptimizing but one of them has complex enough - // control flow (e.g. with loops). - // - // That should be a very rare case and false positives for this function - // have compile-time effect only. - return any_of(Exits, [](const BasicBlock *BB) { - return !BB->getPostdominatingDeoptimizeCall(); - }); - } - return false; -} - static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, bool HasConditionalPreHeader, bool SuccsSwapped) { @@ -387,506 +337,489 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. -/// -/// If -loop-rotate-multi is enabled we can do multiple rotations in one go -/// so to reach a suitable (non-deoptimizing) exit. bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. 
if (L->getBlocks().size() == 1) return false; bool Rotated = false; - do { - BasicBlock *OrigHeader = L->getHeader(); - BasicBlock *OrigLatch = L->getLoopLatch(); - - BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); - if (!BI || BI->isUnconditional()) - return Rotated; - - // If the loop header is not one of the loop exiting blocks then - // either this loop is already rotated or it is not - // suitable for loop rotation transformations. - if (!L->isLoopExiting(OrigHeader)) + BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + + BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); + if (!BI || BI->isUnconditional()) + return Rotated; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return Rotated; + + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (!OrigLatch) + return Rotated; + + // Rotate if the loop latch was just simplified. Or if it makes the loop exit + // count computable. Or if we think it will be profitable. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && + !profitableToRotateLoopExitingLatch(L)) + return Rotated; + + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. + { + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO); + if (Metrics.notDuplicatable) { + LLVM_DEBUG( + dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" + << " instructions: "; + L->dump()); return Rotated; - - // If the loop latch already contains a branch that leaves the loop then the - // loop is already rotated. - if (!OrigLatch) + } + if (Metrics.Convergence != ConvergenceKind::None) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " + "instructions: "; + L->dump()); return Rotated; - - // Rotate if either the loop latch does *not* exit the loop, or if the loop - // latch was just simplified. Or if we think it will be profitable. - if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && - !profitableToRotateLoopExitingLatch(L) && - !canRotateDeoptimizingLatchExit(L)) + } + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" + " with invalid cost: "; + L->dump()); return Rotated; - - // Check size of original header and reject loop if it is very big or we can't - // duplicate blocks inside it. 
- { - SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - - CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO); - if (Metrics.notDuplicatable) { - LLVM_DEBUG( - dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" - << " instructions: "; - L->dump()); - return Rotated; - } - if (Metrics.Convergence != ConvergenceKind::None) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " - "instructions: "; - L->dump()); - return Rotated; - } - if (!Metrics.NumInsts.isValid()) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" - " with invalid cost: "; - L->dump()); - return Rotated; - } - if (Metrics.NumInsts > MaxHeaderSize) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " - << Metrics.NumInsts - << " instructions, which is more than the threshold (" - << MaxHeaderSize << " instructions): "; - L->dump()); - ++NumNotRotatedDueToHeaderSize; - return Rotated; - } - - // When preparing for LTO, avoid rotating loops with calls that could be - // inlined during the LTO stage. - if (PrepareForLTO && Metrics.NumInlineCandidates > 0) - return Rotated; } - - // Now, this loop is suitable for rotation. - BasicBlock *OrigPreheader = L->getLoopPreheader(); - - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!OrigPreheader || !L->hasDedicatedExits()) + if (Metrics.NumInsts > MaxHeaderSize) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " + << Metrics.NumInsts + << " instructions, which is more than the threshold (" + << MaxHeaderSize << " instructions): "; + L->dump()); + ++NumNotRotatedDueToHeaderSize; return Rotated; - - // Anything ScalarEvolution may know about this loop or the PHI nodes - // in its header will soon be invalidated. We should also invalidate - // all outer loops because insertion and deletion of blocks that happens - // during the rotation may violate invariants related to backedge taken - // infos in them. - if (SE) { - SE->forgetTopmostLoop(L); - // We may hoist some instructions out of loop. In case if they were cached - // as "loop variant" or "loop computable", these caches must be dropped. - // We also may fold basic blocks, so cached block dispositions also need - // to be dropped. - SE->forgetBlockAndLoopDispositions(); } - LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - - // Find new Loop header. NewHeader is a Header's one and only successor - // that is inside loop. Header's other successor is outside the - // loop. Otherwise loop is not suitable for rotation. - BasicBlock *Exit = BI->getSuccessor(0); - BasicBlock *NewHeader = BI->getSuccessor(1); - bool BISuccsSwapped = L->contains(Exit); - if (BISuccsSwapped) - std::swap(Exit, NewHeader); - assert(NewHeader && "Unable to determine new loop header"); - assert(L->contains(NewHeader) && !L->contains(Exit) && - "Unable to determine loop header and exit blocks"); - - // This code assumes that the new header has exactly one predecessor. - // Remove any single-entry PHI nodes in it. - assert(NewHeader->getSinglePredecessor() && - "New header doesn't have one pred!"); - FoldSingleEntryPHINodes(NewHeader); - - // Begin by walking OrigHeader and populating ValueMap with an entry for - // each Instruction. 
- BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); - ValueToValueMapTy ValueMap, ValueMapMSSA; - - // For PHI nodes, the value available in OldPreHeader is just the - // incoming value from OldPreHeader. - for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - InsertNewValueIntoMap(ValueMap, PN, - PN->getIncomingValueForBlock(OrigPreheader)); - - // For the rest of the instructions, either hoist to the OrigPreheader if - // possible or create a clone in the OldPreHeader if not. - Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); - - // Record all debug records preceding LoopEntryBranch to avoid - // duplication. - using DbgHash = - std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>; - auto makeHash = [](const DbgVariableRecord *D) -> DbgHash { - auto VarLocOps = D->location_ops(); - return {{hash_combine_range(VarLocOps), D->getVariable()}, - D->getExpression()}; - }; - - SmallDenseSet<DbgHash, 8> DbgRecords; - // Build DbgVariableRecord hashes for DbgVariableRecords attached to the - // terminator. - for (const DbgVariableRecord &DVR : - filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange())) - DbgRecords.insert(makeHash(&DVR)); - - // Remember the local noalias scope declarations in the header. After the - // rotation, they must be duplicated and the scope must be cloned. This - // avoids unwanted interaction across iterations. - SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions; - for (Instruction &I : *OrigHeader) - if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) - NoAliasDeclInstructions.push_back(Decl); - - Module *M = OrigHeader->getModule(); - - // Track the next DbgRecord to clone. If we have a sequence where an - // instruction is hoisted instead of being cloned: - // DbgRecord blah - // %foo = add i32 0, 0 - // DbgRecord xyzzy - // %bar = call i32 @foobar() - // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once - // attached to %foo, then when %foo his hoisted it will "fall down" onto the - // function call: - // DbgRecord blah - // DbgRecord xyzzy - // %bar = call i32 @foobar() - // causing it to appear attached to the call too. - // - // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from - // here" position to account for this behaviour. We point it at any - // DbgRecords on the next instruction, here labelled xyzzy, before we hoist - // %foo. Later, we only only clone DbgRecords from that position (xyzzy) - // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as - // a range because it gives us a natural way of testing whether - // there were DbgRecords on the next instruction before we hoisted things). - iterator_range<DbgRecord::self_iterator> NextDbgInsts = - (I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange(); - - while (I != E) { - Instruction *Inst = &*I++; - - // If the instruction's operands are invariant and it doesn't read or write - // memory, then it is safe to hoist. Doing this doesn't change the order of - // execution in the preheader, but does prevent the instruction from - // executing in each iteration of the loop. This means it is safe to hoist - // something that might trap, but isn't safe to hoist something that reads - // memory (without proving that the loop doesn't write). 
- if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && - !Inst->mayWriteToMemory() && !Inst->isTerminator() && - !isa<AllocaInst>(Inst) && - // It is not safe to hoist the value of these instructions in - // coroutines, as the addresses of otherwise eligible variables (e.g. - // thread-local variables and errno) may change if the coroutine is - // resumed in a different thread.Therefore, we disable this - // optimization for correctness. However, this may block other correct - // optimizations. - // FIXME: This should be reverted once we have a better model for - // memory access in coroutines. - !Inst->getFunction()->isPresplitCoroutine()) { - - if (!NextDbgInsts.empty()) { - auto DbgValueRange = - LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); - RemapDbgRecordRange(M, DbgValueRange, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - // Erase anything we've seen before. - for (DbgVariableRecord &DVR : - make_early_inc_range(filterDbgVars(DbgValueRange))) - if (DbgRecords.count(makeHash(&DVR))) - DVR.eraseFromParent(); - } - - NextDbgInsts = I->getDbgRecordRange(); - - Inst->moveBefore(LoopEntryBranch->getIterator()); + // When preparing for LTO, avoid rotating loops with calls that could be + // inlined during the LTO stage. + if (PrepareForLTO && Metrics.NumInlineCandidates > 0) + return Rotated; + } - ++NumInstrsHoisted; - continue; - } + // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!OrigPreheader || !L->hasDedicatedExits()) + return Rotated; + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. We should also invalidate + // all outer loops because insertion and deletion of blocks that happens + // during the rotation may violate invariants related to backedge taken + // infos in them. + if (SE) { + SE->forgetTopmostLoop(L); + // We may hoist some instructions out of loop. In case if they were cached + // as "loop variant" or "loop computable", these caches must be dropped. + // We also may fold basic blocks, so cached block dispositions also need + // to be dropped. + SE->forgetBlockAndLoopDispositions(); + } - // Otherwise, create a duplicate of the instruction. - Instruction *C = Inst->clone(); - if (const DebugLoc &DL = C->getDebugLoc()) - mapAtomInstance(DL, ValueMap); + LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); - C->insertBefore(LoopEntryBranch->getIterator()); + // Find new Loop header. NewHeader is a Header's one and only successor + // that is inside loop. Header's other successor is outside the + // loop. Otherwise loop is not suitable for rotation. + BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); + bool BISuccsSwapped = L->contains(Exit); + if (BISuccsSwapped) + std::swap(Exit, NewHeader); + assert(NewHeader && "Unable to determine new loop header"); + assert(L->contains(NewHeader) && !L->contains(Exit) && + "Unable to determine loop header and exit blocks"); + + // This code assumes that the new header has exactly one predecessor. + // Remove any single-entry PHI nodes in it. 
+ assert(NewHeader->getSinglePredecessor() && + "New header doesn't have one pred!"); + FoldSingleEntryPHINodes(NewHeader); + + // Begin by walking OrigHeader and populating ValueMap with an entry for + // each Instruction. + BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); + ValueToValueMapTy ValueMap, ValueMapMSSA; + + // For PHI nodes, the value available in OldPreHeader is just the + // incoming value from OldPreHeader. + for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) + InsertNewValueIntoMap(ValueMap, PN, + PN->getIncomingValueForBlock(OrigPreheader)); + + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); + + // Record all debug records preceding LoopEntryBranch to avoid + // duplication. + using DbgHash = + std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>; + auto makeHash = [](const DbgVariableRecord *D) -> DbgHash { + auto VarLocOps = D->location_ops(); + return {{hash_combine_range(VarLocOps), D->getVariable()}, + D->getExpression()}; + }; - ++NumInstrsDuplicated; + SmallDenseSet<DbgHash, 8> DbgRecords; + // Build DbgVariableRecord hashes for DbgVariableRecords attached to the + // terminator. + for (const DbgVariableRecord &DVR : + filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange())) + DbgRecords.insert(makeHash(&DVR)); + + // Remember the local noalias scope declarations in the header. After the + // rotation, they must be duplicated and the scope must be cloned. This + // avoids unwanted interaction across iterations. + SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions; + for (Instruction &I : *OrigHeader) + if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) + NoAliasDeclInstructions.push_back(Decl); + + Module *M = OrigHeader->getModule(); + + // Track the next DbgRecord to clone. If we have a sequence where an + // instruction is hoisted instead of being cloned: + // DbgRecord blah + // %foo = add i32 0, 0 + // DbgRecord xyzzy + // %bar = call i32 @foobar() + // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once + // attached to %foo, then when %foo is hoisted it will "fall down" onto the + // function call: + // DbgRecord blah + // DbgRecord xyzzy + // %bar = call i32 @foobar() + // causing it to appear attached to the call too. + // + // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from + // here" position to account for this behaviour. We point it at any + // DbgRecords on the next instruction, here labelled xyzzy, before we hoist + // %foo. Later, we only clone DbgRecords from that position (xyzzy) + // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as + // a range because it gives us a natural way of testing whether + // there were DbgRecords on the next instruction before we hoisted things). + iterator_range<DbgRecord::self_iterator> NextDbgInsts = + (I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange(); + + while (I != E) { + Instruction *Inst = &*I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop.
This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). + if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && + !Inst->mayWriteToMemory() && !Inst->isTerminator() && + !isa<AllocaInst>(Inst) && + // It is not safe to hoist the value of these instructions in + // coroutines, as the addresses of otherwise eligible variables (e.g. + // thread-local variables and errno) may change if the coroutine is + // resumed in a different thread. Therefore, we disable this + // optimization for correctness. However, this may block other correct + // optimizations. + // FIXME: This should be reverted once we have a better model for + // memory access in coroutines. + !Inst->getFunction()->isPresplitCoroutine()) { if (!NextDbgInsts.empty()) { auto DbgValueRange = - LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); - RemapDbgRecordRange(M, DbgValueRange, ValueMap, + LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); + RemapDbgRecordRange(M, DbgValueRange, ValueMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); // Erase anything we've seen before. for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(DbgValueRange))) if (DbgRecords.count(makeHash(&DVR))) DVR.eraseFromParent(); } NextDbgInsts = I->getDbgRecordRange(); Inst->moveBefore(LoopEntryBranch->getIterator()); ++NumInstrsHoisted; continue; } // Otherwise, create a duplicate of the instruction.
D' U1' { D U2 D'' U1'' } - // D: original llvm.experimental.noalias.scope.decl - // D', U1': duplicate with replaced scopes - // D'', U1'': different duplicate with replaced scopes - // This ensures a safe fallback to 'may_alias' introduced by the rotate, - // as U1'' and U1' scopes will not be compatible wrt to the local restrict - - // Clone the llvm.experimental.noalias.decl again for the NewHeader. - BasicBlock::iterator NewHeaderInsertionPoint = - NewHeader->getFirstNonPHIIt(); - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { - LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" - << *NAD << "\n"); - Instruction *NewNAD = NAD->clone(); - NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); - } + Inst->moveBefore(LoopEntryBranch->getIterator()); - // Scopes must now be duplicated, once for OrigHeader and once for - // OrigPreHeader'. - { - auto &Context = NewHeader->getContext(); - - SmallVector<MDNode *, 8> NoAliasDeclScopes; - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) - NoAliasDeclScopes.push_back(NAD->getScopeList()); - - LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, - "h.rot"); - LLVM_DEBUG(OrigHeader->dump()); - - // Keep the compile time impact low by only adapting the inserted block - // of instructions in the OrigPreHeader. This might result in slightly - // more aliasing between these instructions and those that were already - // present, but it will be much faster when the original PreHeader is - // large. - LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); - auto *FirstDecl = - cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); - auto *LastInst = &OrigPreheader->back(); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, - Context, "pre.rot"); - LLVM_DEBUG(OrigPreheader->dump()); - - LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); - LLVM_DEBUG(NewHeader->dump()); - } + ++NumInstrsHoisted; + continue; } - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. - for (BasicBlock *SuccBB : successors(OrigHeader)) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. - LoopEntryBranch->eraseFromParent(); - OrigPreheader->flushTerminatorDbgRecords(); - - // Update MemorySSA before the rewrite call below changes the 1:1 - // instruction:cloned_instruction_or_value mapping. - if (MSSAU) { - InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); - MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, - ValueMapMSSA); - } + // Otherwise, create a duplicate of the instruction. 
+ Instruction *C = Inst->clone(); + if (const DebugLoc &DL = C->getDebugLoc()) + mapAtomInstance(DL, ValueMap); C->insertBefore(LoopEntryBranch->getIterator()); ++NumInstrsDuplicated; if (!NextDbgInsts.empty()) { auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); RemapDbgRecordRange(M, Range, ValueMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); NextDbgInsts = DbgMarker::getEmptyDbgRecordRange(); // Erase anything we've seen before. for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(Range))) if (DbgRecords.count(makeHash(&DVR))) DVR.eraseFromParent(); } + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifiable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = simplifyInstruction(C, SQ); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + InsertNewValueIntoMap(ValueMap, Inst, V); + if (!C->mayHaveSideEffects()) { + C->eraseFromParent(); + C = nullptr; } + } else { + InsertNewValueIntoMap(ValueMap, Inst, C); + } + if (C) { + // Otherwise, stick the new instruction into the new block! + C->setName(Inst->getName()); + + if (auto *II = dyn_cast<AssumeInst>(C)) + AC->registerAssumption(II); + // MemorySSA cares whether the cloned instruction was inserted or not, and + // not whether it can be remapped to a simplified value. + if (MSSAU) + InsertNewValueIntoMap(ValueMapMSSA, Inst, C); + } + } if (!NoAliasDeclInstructions.empty()) { // There are noalias scope declarations: // (general): // Original: OrigPre { OrigHeader NewHeader ... Latch } // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } // // with D: llvm.experimental.noalias.scope.decl, // U: !noalias or !alias.scope depending on D // ... { D U1 U2 } can transform into: // (0) : ... { D U1 U2 } // no relevant rotation for this part // (1) : ... D' { U1 U2 D } // D is part of OrigHeader // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader // // We now want to transform: // (1) -> : ... D' { D U1 U2 D'' } // (2) -> : ... D' U1' { D U2 D'' U1'' } // D: original llvm.experimental.noalias.scope.decl // D', U1': duplicate with replaced scopes // D'', U1'': different duplicate with replaced scopes // This ensures a safe fallback to 'may_alias' introduced by the rotate, // as U1'' and U1' scopes will not be compatible wrt to the local restrict // Clone the llvm.experimental.noalias.decl again for the NewHeader. - BasicBlock::iterator NewHeaderInsertionPoint = - NewHeader->getFirstNonPHIIt(); - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { - LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" - << *NAD << "\n"); - Instruction *NewNAD = NAD->clone(); - NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); - } - - // Scopes must now be duplicated, once for OrigHeader and once for - // OrigPreHeader'. - { - auto &Context = NewHeader->getContext(); - - SmallVector<MDNode *, 8> NoAliasDeclScopes; - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) - NoAliasDeclScopes.push_back(NAD->getScopeList()); - - LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, - "h.rot"); - LLVM_DEBUG(OrigHeader->dump()); - - // Keep the compile time impact low by only adapting the inserted block - // of instructions in the OrigPreHeader. This might result in slightly - // more aliasing between these instructions and those that were already - // present, but it will be much faster when the original PreHeader is - // large. - LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); - auto *FirstDecl = - cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); - auto *LastInst = &OrigPreheader->back(); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, - Context, "pre.rot"); - LLVM_DEBUG(OrigPreheader->dump()); - - LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); - LLVM_DEBUG(NewHeader->dump()); - } - } - - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. - for (BasicBlock *SuccBB : successors(OrigHeader)) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. - LoopEntryBranch->eraseFromParent(); - OrigPreheader->flushTerminatorDbgRecords(); - - // Update MemorySSA before the rewrite call below changes the 1:1 - // instruction:cloned_instruction_or_value mapping. - if (MSSAU) { - InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); - MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, - ValueMapMSSA); - } - - SmallVector<PHINode*, 2> InsertedPHIs; - // If there were any uses of instructions in the duplicated block outside the - // loop, update them, inserting PHI nodes as required - RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, - &InsertedPHIs); - - // Attach debug records to the new phis if that phi uses a value that - // previously had debug metadata attached. This keeps the debug info - // up-to-date in the loop body. - if (!InsertedPHIs.empty()) - insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); - - // NewHeader is now the header of the loop. - L->moveToHeader(NewHeader); - assert(L->getHeader() == NewHeader && "Latch block is our new header"); - - // Inform DT about changes to the CFG. - if (DT) { - // The OrigPreheader branches to the NewHeader and Exit now. Then, inform - // the DT about the removed edge to the OrigHeader (that got removed). - SmallVector<DominatorTree::UpdateType, 3> Updates = { - {DominatorTree::Insert, OrigPreheader, Exit}, - {DominatorTree::Insert, OrigPreheader, NewHeader}, - {DominatorTree::Delete, OrigPreheader, OrigHeader}}; - - if (MSSAU) { - MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true); - if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - } else { - DT->applyUpdates(Updates); - } - } - - // At this point, we've finished our major CFG changes. As part of cloning - // the loop into the preheader we've simplified instructions and the - // duplicated conditional branch may now be branching on a constant. If it is - // branching on a constant and if that constant means that we enter the loop, - // then we fold away the cond branch to an uncond branch. This simplifies the - // loop in cases important for nested loops, and it also means we don't have - // to split as many edges. - BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); - assert(PHBI->isConditional() && "Should be clone of BI condbr!"); - const Value *Cond = PHBI->getCondition(); - const bool HasConditionalPreHeader = - !isa<ConstantInt>(Cond) || - PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader; - - updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); - - if (HasConditionalPreHeader) { - // The conditional branch can't be folded, handle the general case. - // Split edges as necessary to preserve LoopSimplify form. - - // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. - // Split the edge to form a real preheader. - BasicBlock *NewPH = SplitCriticalEdge( - OrigPreheader, NewHeader, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - NewPH->setName(NewHeader->getName() + ".lr.ph"); - - // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. Note that Exit could be an exit block for multiple - // nested loops, causing both of the edges to now be critical and need to - // be split. - SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit)); - bool SplitLatchEdge = false; - for (BasicBlock *ExitPred : ExitPreds) { - // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred); - if (!PredLoop || PredLoop->contains(Exit) || - isa<IndirectBrInst>(ExitPred->getTerminator())) - continue; - SplitLatchEdge |= L->getLoopLatch() == ExitPred; - BasicBlock *ExitSplit = SplitCriticalEdge( - ExitPred, Exit, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - ExitSplit->moveBefore(Exit); + if (!NextDbgInsts.empty()) { + auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); + RemapDbgRecordRange(M, Range, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + NextDbgInsts = DbgMarker::getEmptyDbgRecordRange(); + // Erase anything we've seen before. + for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(Range))) + if (DbgRecords.count(makeHash(&DVR))) + DVR.eraseFromParent(); + } + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifyable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = simplifyInstruction(C, SQ); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + InsertNewValueIntoMap(ValueMap, Inst, V); + if (!C->mayHaveSideEffects()) { + C->eraseFromParent(); + C = nullptr; } - assert(SplitLatchEdge && - "Despite splitting all preds, failed to split latch exit?"); - (void)SplitLatchEdge; } else { - // We can fold the conditional branch in the preheader, this makes things - // simpler. The first step is to remove the extra edge to the Exit block. - Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); - BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator()); - NewBI->setDebugLoc(PHBI->getDebugLoc()); - PHBI->eraseFromParent(); + InsertNewValueIntoMap(ValueMap, Inst, C); + } + if (C) { + // Otherwise, stick the new instruction into the new block! + C->setName(Inst->getName()); + + if (auto *II = dyn_cast<AssumeInst>(C)) + AC->registerAssumption(II); + // MemorySSA cares whether the cloned instruction was inserted or not, and + // not whether it can be remapped to a simplified value. + if (MSSAU) + InsertNewValueIntoMap(ValueMapMSSA, Inst, C); + } + } - // With our CFG finalized, update DomTree if it is available. - if (DT) DT->deleteEdge(OrigPreheader, Exit); + if (!NoAliasDeclInstructions.empty()) { + // There are noalias scope declarations: + // (general): + // Original: OrigPre { OrigHeader NewHeader ... Latch } + // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } + // + // with D: llvm.experimental.noalias.scope.decl, + // U: !noalias or !alias.scope depending on D + // ... { D U1 U2 } can transform into: + // (0) : ... { D U1 U2 } // no relevant rotation for this part + // (1) : ... D' { U1 U2 D } // D is part of OrigHeader + // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader + // + // We now want to transform: + // (1) -> : ... D' { D U1 U2 D'' } + // (2) -> : ... D' U1' { D U2 D'' U1'' } + // D: original llvm.experimental.noalias.scope.decl + // D', U1': duplicate with replaced scopes + // D'', U1'': different duplicate with replaced scopes + // This ensures a safe fallback to 'may_alias' introduced by the rotate, + // as U1'' and U1' scopes will not be compatible wrt to the local restrict + + // Clone the llvm.experimental.noalias.decl again for the NewHeader. 
+ BasicBlock::iterator NewHeaderInsertionPoint = + NewHeader->getFirstNonPHIIt(); + for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { + LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" + << *NAD << "\n"); + Instruction *NewNAD = NAD->clone(); + NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); + } - // Update MSSA too, if available. - if (MSSAU) - MSSAU->removeEdge(OrigPreheader, Exit); + // Scopes must now be duplicated, once for OrigHeader and once for + // OrigPreHeader'. + { + auto &Context = NewHeader->getContext(); + + SmallVector<MDNode *, 8> NoAliasDeclScopes; + for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) + NoAliasDeclScopes.push_back(NAD->getScopeList()); + + LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); + cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, + "h.rot"); + LLVM_DEBUG(OrigHeader->dump()); + + // Keep the compile time impact low by only adapting the inserted block + // of instructions in the OrigPreHeader. This might result in slightly + // more aliasing between these instructions and those that were already + // present, but it will be much faster when the original PreHeader is + // large. + LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); + auto *FirstDecl = + cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); + auto *LastInst = &OrigPreheader->back(); + cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, + Context, "pre.rot"); + LLVM_DEBUG(OrigPreheader->dump()); + + LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); + LLVM_DEBUG(NewHeader->dump()); } + } - assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); - assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + // Along with all the other instructions, we just cloned OrigHeader's + // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's + // successors by duplicating their incoming values for OrigHeader. + for (BasicBlock *SuccBB : successors(OrigHeader)) + for (BasicBlock::iterator BI = SuccBB->begin(); + PHINode *PN = dyn_cast<PHINode>(BI); ++BI) + PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); + + // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove + // OrigPreHeader's old terminator (the original branch into the loop), and + // remove the corresponding incoming values from the PHI nodes in OrigHeader. + LoopEntryBranch->eraseFromParent(); + OrigPreheader->flushTerminatorDbgRecords(); + + // Update MemorySSA before the rewrite call below changes the 1:1 + // instruction:cloned_instruction_or_value mapping. + if (MSSAU) { + InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); + MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, + ValueMapMSSA); + } - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + SmallVector<PHINode *, 2> InsertedPHIs; + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, + &InsertedPHIs); + + // Attach debug records to the new phis if that phi uses a value that + // previously had debug metadata attached. This keeps the debug info + // up-to-date in the loop body. + if (!InsertedPHIs.empty()) + insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); + + // NewHeader is now the header of the loop. 
+ L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); + + // Inform DT about changes to the CFG. + if (DT) { + // The OrigPreheader branches to the NewHeader and Exit now. Then, inform + // the DT about the removed edge to the OrigHeader (that got removed). + SmallVector<DominatorTree::UpdateType, 3> Updates = { + {DominatorTree::Insert, OrigPreheader, Exit}, + {DominatorTree::Insert, OrigPreheader, NewHeader}, + {DominatorTree::Delete, OrigPreheader, OrigHeader}}; - // Now that the CFG and DomTree are in a consistent state again, try to merge - // the OrigHeader block into OrigLatch. This will succeed if they are - // connected by an unconditional branch. This is just a cleanup so the - // emitted code isn't too gross in this common case. - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - BasicBlock *PredBB = OrigHeader->getUniquePredecessor(); - bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); - if (DidMerge) - RemoveRedundantDbgInstrs(PredBB); + if (MSSAU) { + MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true); + if (VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + } else { + DT->applyUpdates(Updates); + } + } - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + // At this point, we've finished our major CFG changes. As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. + BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + const Value *Cond = PHBI->getCondition(); + const bool HasConditionalPreHeader = + !isa<ConstantInt>(Cond) || + PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader; + + updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); - LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + if (HasConditionalPreHeader) { + // The conditional branch can't be folded, handle the general case. + // Split edges as necessary to preserve LoopSimplify form. + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. + // Split the edge to form a real preheader. + BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. Note that Exit could be an exit block for multiple + // nested loops, causing both of the edges to now be critical and need to + // be split. + SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit)); + bool SplitLatchEdge = false; + for (BasicBlock *ExitPred : ExitPreds) { + // We only need to split loop exit edges. 
+ Loop *PredLoop = LI->getLoopFor(ExitPred); + if (!PredLoop || PredLoop->contains(Exit) || + isa<IndirectBrInst>(ExitPred->getTerminator())) + continue; + SplitLatchEdge |= L->getLoopLatch() == ExitPred; + BasicBlock *ExitSplit = SplitCriticalEdge( + ExitPred, Exit, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + ExitSplit->moveBefore(Exit); + } + assert(SplitLatchEdge && + "Despite splitting all preds, failed to split latch exit?"); + (void)SplitLatchEdge; + } else { + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. + Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator()); + NewBI->setDebugLoc(PHBI->getDebugLoc()); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. + if (DT) + DT->deleteEdge(OrigPreheader, Exit); + + // Update MSSA too, if available. + if (MSSAU) + MSSAU->removeEdge(OrigPreheader, Exit); + } - ++NumRotated; + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); - Rotated = true; - SimplifiedLatch = false; + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + BasicBlock *PredBB = OrigHeader->getUniquePredecessor(); + bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); + if (DidMerge) + RemoveRedundantDbgInstrs(PredBB); - // Check that new latch is a deoptimizing exit and then repeat rotation if possible. - // Deoptimizing latch exit is not a generally typical case, so we just loop over. - // TODO: if it becomes a performance bottleneck extend rotation algorithm - // to handle multiple rotations in one go. - } while (MultiRotate && canRotateDeoptimizingLatchExit(L)); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); return true; } diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index bf882d7..6312831 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -201,18 +201,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// unroll count is non-zero. /// /// This function performs the following: -/// - Update PHI nodes at the unrolling loop exit and epilog loop exit -/// - Create PHI nodes at the unrolling loop exit to combine -/// values that exit the unrolling loop code and jump around it. +/// - Update PHI nodes at the epilog loop exit +/// - Create PHI nodes at the unrolling loop exit and epilog preheader to +/// combine values that exit the unrolling loop code and jump around it. /// - Update PHI operands in the epilog loop by the new PHI nodes -/// - Branch around the epilog loop if extra iters (ModVal) is zero. +/// - At the unrolling loop exit, branch around the epilog loop if extra iters +// (ModVal) is zero. +/// - At the epilog preheader, add an llvm.assume call that extra iters is +/// non-zero. 
If the unrolling loop exit is the predecessor, the above new +/// branch guarantees that assumption. If the unrolling loop preheader is the +/// predecessor, then the required first iteration from the original loop has +/// yet to be executed, so it must be executed in the epilog loop. If we +/// later unroll the epilog loop, that llvm.assume call somehow enables +/// ScalarEvolution to compute an epilog loop maximum trip count, which enables +/// eliminating the branch at the end of the final unrolled epilog iteration. /// static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, BasicBlock *Exit, BasicBlock *PreHeader, BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE, - unsigned Count) { + unsigned Count, AssumptionCache &AC) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]); @@ -231,7 +240,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, // EpilogLatch // Exit (EpilogPN) - // Update PHI nodes at NewExit and Exit. + // Update PHI nodes at Exit. for (PHINode &PN : NewExit->phis()) { // PN should be used in another PHI located in Exit block as // Exit was split by SplitBlockPredecessors into Exit and NewExit @@ -246,15 +255,11 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, // epilogue edges have already been added. // // There is EpilogPreHeader incoming block instead of NewExit as - // NewExit was spilt 1 more time to get EpilogPreHeader. + // NewExit was split 1 more time to get EpilogPreHeader. assert(PN.hasOneUse() && "The phi should have 1 use"); PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser()); assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block"); - // Add incoming PreHeader from branch around the Loop - PN.addIncoming(PoisonValue::get(PN.getType()), PreHeader); - SE.forgetValue(&PN); - Value *V = PN.getIncomingValueForBlock(Latch); Instruction *I = dyn_cast<Instruction>(V); if (I && L->contains(I)) @@ -271,35 +276,52 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, NewExit); // Now PHIs should look like: // NewExit: - // PN = PHI [I, Latch], [poison, PreHeader] + // PN = PHI [I, Latch] // ... // Exit: // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch] } - // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader). - // Update corresponding PHI nodes in epilog loop. + // Create PHI nodes at NewExit (from the unrolling loop Latch) and at + // EpilogPreHeader (from PreHeader and NewExit). Update corresponding PHI + // nodes in epilog loop. for (BasicBlock *Succ : successors(Latch)) { // Skip this as we already updated phis in exit blocks. if (!L->contains(Succ)) continue; + + // Succ here appears to always be just L->getHeader(). Otherwise, how do we + // know its corresponding epilog block (from VMap) is EpilogHeader and thus + // EpilogPreHeader is the right incoming block for VPN, as set below? + // TODO: Can we thus avoid the enclosing loop over successors? + assert(Succ == L->getHeader() && + "Expect the only in-loop successor of latch to be the loop header"); + for (PHINode &PN : Succ->phis()) { - // Add new PHI nodes to the loop exit block and update epilog - // PHIs with the new PHI values.
- PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr"); - NewPN->insertBefore(NewExit->getFirstNonPHIIt()); - // Adding a value to the new PHI node from the unrolling loop preheader. - NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader); - // Adding a value to the new PHI node from the unrolling loop latch. - NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch); + // Add new PHI nodes to the loop exit block. + PHINode *NewPN0 = PHINode::Create(PN.getType(), /*NumReservedValues=*/1, + PN.getName() + ".unr"); + NewPN0->insertBefore(NewExit->getFirstNonPHIIt()); + // Add value to the new PHI node from the unrolling loop latch. + NewPN0->addIncoming(PN.getIncomingValueForBlock(Latch), Latch); + + // Add new PHI nodes to EpilogPreHeader. + PHINode *NewPN1 = PHINode::Create(PN.getType(), /*NumReservedValues=*/2, + PN.getName() + ".epil.init"); + NewPN1->insertBefore(EpilogPreHeader->getFirstNonPHIIt()); + // Add value to the new PHI node from the unrolling loop preheader. + NewPN1->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader); + // Add value to the new PHI node from the epilog loop guard. + NewPN1->addIncoming(NewPN0, NewExit); // Update the existing PHI node operand with the value from the new PHI // node. Corresponding instruction in epilog loop should be PHI. PHINode *VPN = cast<PHINode>(VMap[&PN]); - VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN); + VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN1); } } + // In NewExit, branch around the epilog loop if no extra iters. Instruction *InsertPt = NewExit->getTerminator(); IRBuilder<> B(InsertPt); Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod"); @@ -308,7 +330,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, SmallVector<BasicBlock*, 4> Preds(predecessors(Exit)); SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr, PreserveLCSSA); - // Add the branch to the exit block (around the unrolling loop) + // Add the branch to the exit block (around the epilog loop) MDNode *BranchWeights = nullptr; if (hasBranchWeightMD(*Latch->getTerminator())) { // Assume equal distribution in interval [0, Count). @@ -322,10 +344,11 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, DT->changeImmediateDominator(Exit, NewDom); } - // Split the main loop exit to maintain canonicalization guarantees. - SmallVector<BasicBlock*, 4> NewExitPreds{Latch}; - SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr, - PreserveLCSSA); + // In EpilogPreHeader, assume extra iters is non-zero. + IRBuilder<> B2(EpilogPreHeader, EpilogPreHeader->getFirstNonPHIIt()); + Value *ModIsNotNull = B2.CreateIsNotNull(ModVal, "lcmp.mod"); + AssumeInst *AI = cast<AssumeInst>(B2.CreateAssumption(ModIsNotNull)); + AC.registerAssumption(AI); } /// Create a clone of the blocks in a loop and connect them together. A new @@ -795,7 +818,8 @@ bool llvm::UnrollRuntimeLoopRemainder( ConstantInt::get(BECount->getType(), Count - 1)) : B.CreateIsNotNull(ModVal, "lcmp.mod"); - BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader; + BasicBlock *RemainderLoop = + UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; // Branch to either remainder (extra iterations) loop or unrolling loop. 
MDNode *BranchWeights = nullptr; @@ -808,7 +832,7 @@ bool llvm::UnrollRuntimeLoopRemainder( PreHeaderBR->eraseFromParent(); if (DT) { if (UseEpilogRemainder) - DT->changeImmediateDominator(NewExit, PreHeader); + DT->changeImmediateDominator(EpilogPreHeader, PreHeader); else DT->changeImmediateDominator(PrologExit, PreHeader); } @@ -880,7 +904,8 @@ bool llvm::UnrollRuntimeLoopRemainder( // from both the original loop and the remainder code reaching the exit // blocks. While the IDom of these exit blocks were from the original loop, // now the IDom is the preheader (which decides whether the original loop or - // remainder code should run). + // remainder code should run) unless the block still has just the original + // predecessor (such as NewExit in the case of an epilog remainder). if (DT && !L->getExitingBlock()) { SmallVector<BasicBlock *, 16> ChildrenToUpdate; // NB! We have to examine the dom children of all loop blocks, not just @@ -891,7 +916,8 @@ bool llvm::UnrollRuntimeLoopRemainder( auto *DomNodeBB = DT->getNode(BB); for (auto *DomChild : DomNodeBB->children()) { auto *DomChildBB = DomChild->getBlock(); - if (!L->contains(LI->getLoopFor(DomChildBB))) + if (!L->contains(LI->getLoopFor(DomChildBB)) && + DomChildBB->getUniquePredecessor() != BB) ChildrenToUpdate.push_back(DomChildBB); } } @@ -930,7 +956,7 @@ bool llvm::UnrollRuntimeLoopRemainder( // Connect the epilog code to the original loop and update the // PHI functions. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index b8cfe3a..155fcc5 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6642,6 +6642,9 @@ public: /// Return true if the replacement is a lookup table. bool isLookupTable(); + /// Return true if the replacement is a bit map. + bool isBitMap(); + private: // Depending on the switch, there are different alternatives. enum { @@ -6932,6 +6935,8 @@ Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; } bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; } +bool SwitchReplacement::isBitMap() { return Kind == BitMapKind; } + static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) { // 40% is the default density for building a jump table in optsize/minsize // mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this @@ -7097,7 +7102,8 @@ static void reuseTableCompare( /// lookup tables. static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, DomTreeUpdater *DTU, const DataLayout &DL, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + bool ConvertSwitchToLookupTable) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); BasicBlock *BB = SI->getParent(); @@ -7262,6 +7268,8 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, bool AnyLookupTables = any_of( PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); }); + bool AnyBitMaps = any_of(PhiToReplacementMap, + [](auto &KV) { return KV.second.isBitMap(); }); // A few conditions prevent the generation of lookup tables: // 1. The target does not support lookup tables. 
@@ -7274,6 +7282,12 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, Fn->getFnAttribute("no-jump-tables").getValueAsBool())) return false; + // In the early optimization pipeline, disable formation of lookup tables, + // bit maps and mask checks, as they may inhibit further optimization. + if (!ConvertSwitchToLookupTable && + (AnyLookupTables || AnyBitMaps || NeedMask)) + return false; + Builder.SetInsertPoint(SI); // TableIndex is the switch condition - TableIndexOffset if we don't // use the condition directly @@ -7929,14 +7943,13 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (Options.ForwardSwitchCondToPhi && forwardSwitchConditionToPHI(SI)) return requestResimplify(); - // The conversion from switch to lookup tables results in difficult-to-analyze - // code and makes pruning branches much harder. This is a problem if the - // switch expression itself can still be restricted as a result of inlining or - // CVP. Therefore, only apply this transformation during late stages of the - // optimisation pipeline. - if (Options.ConvertSwitchToLookupTable && - simplifySwitchLookup(SI, Builder, DTU, DL, TTI)) - return requestResimplify(); + // The conversion of switches to arithmetic or lookup table is disabled in + // the early optimization pipeline, as it may lose information or make the + // resulting code harder to analyze. + if (Options.ConvertSwitchToArithmetic || Options.ConvertSwitchToLookupTable) + if (simplifySwitchLookup(SI, Builder, DTU, DL, TTI, + Options.ConvertSwitchToLookupTable)) + return requestResimplify(); if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI)) return requestResimplify(); |
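For context on the SimplifyCFG.cpp hunks above: simplifySwitchLookup can replace a dense switch over constants with a load from a constant table (or a bit map for small domains), and the Options.ConvertSwitchToArithmetic / Options.ConvertSwitchToLookupTable split keeps those opaque forms out of the early pipeline. A rough C++ sketch of the before/after shapes, with made-up values (the pass itself rewrites IR, not source):

// A dense switch mapping small integers to constants...
int beforeLowering(unsigned x) {
  switch (x) {
  case 0: return 7;
  case 1: return 13;
  case 2: return 7;
  case 3: return 42;
  default: return 0;
  }
}

// ...can become a bounds check plus a table load. The table hides the
// per-case structure, which is why lookup-table formation is deferred to the
// late pipeline: passes such as correlated-value propagation can still
// reason about individual cases while the switch is intact.
int afterLowering(unsigned x) {
  static const int Table[4] = {7, 13, 7, 42};
  return x < 4 ? Table[x] : 0;
}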