Diffstat (limited to 'llvm/lib')
68 files changed, 3029 insertions, 718 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 299ea33..b3f5b12 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -866,21 +866,6 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, Type *IntIdxTy = DL.getIndexType(Ptr->getType()); - // If this is "gep i8* Ptr, (sub 0, V)", fold this as: - // "inttoptr (sub (ptrtoint Ptr), V)" - if (Ops.size() == 2 && ResElemTy->isIntegerTy(8)) { - auto *CE = dyn_cast<ConstantExpr>(Ops[1]); - assert((!CE || CE->getType() == IntIdxTy) && - "CastGEPIndices didn't canonicalize index types!"); - if (CE && CE->getOpcode() == Instruction::Sub && - CE->getOperand(0)->isNullValue()) { - Constant *Res = ConstantExpr::getPtrToInt(Ptr, CE->getType()); - Res = ConstantExpr::getSub(Res, CE->getOperand(1)); - Res = ConstantExpr::getIntToPtr(Res, ResTy); - return ConstantFoldConstant(Res, DL, TLI); - } - } - for (unsigned i = 1, e = Ops.size(); i != e; ++i) if (!isa<ConstantInt>(Ops[i])) return nullptr; @@ -1336,6 +1321,19 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, DL, BaseOffset, /*AllowNonInbounds=*/true)); if (Base->isNullValue()) { FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset); + } else { + // ptrtoint (gep i8, Ptr, (sub 0, V)) -> sub (ptrtoint Ptr), V + if (GEP->getNumIndices() == 1 && + GEP->getSourceElementType()->isIntegerTy(8)) { + auto *Ptr = cast<Constant>(GEP->getPointerOperand()); + auto *Sub = dyn_cast<ConstantExpr>(GEP->getOperand(1)); + Type *IntIdxTy = DL.getIndexType(Ptr->getType()); + if (Sub && Sub->getType() == IntIdxTy && + Sub->getOpcode() == Instruction::Sub && + Sub->getOperand(0)->isNullValue()) + FoldedValue = ConstantExpr::getSub( + ConstantExpr::getPtrToInt(Ptr, IntIdxTy), Sub->getOperand(1)); + } } } if (FoldedValue) { @@ -3038,7 +3036,7 @@ static Constant *ConstantFoldFixedVectorCall( // Gather a column of constants. for (unsigned J = 0, JE = Operands.size(); J != JE; ++J) { // Some intrinsics use a scalar type for certain arguments. - if (hasVectorIntrinsicScalarOpd(IntrinsicID, J)) { + if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, J)) { Lane[J] = Operands[J]; continue; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index e03cf6c..e4d706a 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -227,12 +227,10 @@ static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst, return true; } -bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, - Loop *TheLoop, FastMathFlags FuncFMF, - RecurrenceDescriptor &RedDes, - DemandedBits *DB, - AssumptionCache *AC, - DominatorTree *DT) { +bool RecurrenceDescriptor::AddReductionVar( + PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF, + RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE) { if (Phi->getNumIncomingValues() != 2) return false; @@ -249,6 +247,12 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = nullptr; + + // Variable to keep last visited store instruction. By the end of the + // algorithm this variable will be either empty or having intermediate + // reduction value stored in invariant address. 
+ StoreInst *IntermediateStore = nullptr; + // Indicates that we found a reduction operation in our scan. bool FoundReduxOp = false; @@ -314,6 +318,10 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // - By instructions outside of the loop (safe). // * One value may have several outside users, but all outside // uses must be of the same value. + // - By store instructions with a loop invariant address (safe with + // the following restrictions): + // * If there are several stores, all must have the same address. + // * Final value should be stored in that loop invariant address. // - By an instruction that is not part of the reduction (not safe). // This is either: // * An instruction type other than PHI or the reduction operation. @@ -321,6 +329,43 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, while (!Worklist.empty()) { Instruction *Cur = Worklist.pop_back_val(); + // Store instructions are allowed iff it is the store of the reduction + // value to the same loop invariant memory location. + if (auto *SI = dyn_cast<StoreInst>(Cur)) { + if (!SE) { + LLVM_DEBUG(dbgs() << "Store instructions are not processed without " + << "Scalar Evolution Analysis\n"); + return false; + } + + const SCEV *PtrScev = SE->getSCEV(SI->getPointerOperand()); + // Check it is the same address as previous stores + if (IntermediateStore) { + const SCEV *OtherScev = + SE->getSCEV(IntermediateStore->getPointerOperand()); + + if (OtherScev != PtrScev) { + LLVM_DEBUG(dbgs() << "Storing reduction value to different addresses " + << "inside the loop: " << *SI->getPointerOperand() + << " and " + << *IntermediateStore->getPointerOperand() << '\n'); + return false; + } + } + + // Check the pointer is loop invariant + if (!SE->isLoopInvariant(PtrScev, TheLoop)) { + LLVM_DEBUG(dbgs() << "Storing reduction value to non-uniform address " + << "inside the loop: " << *SI->getPointerOperand() + << '\n'); + return false; + } + + // IntermediateStore is always the last store in the loop. + IntermediateStore = SI; + continue; + } + // No Users. // If the instruction has no users then this is a broken chain and can't be // a reduction variable. @@ -443,10 +488,17 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // reductions which are represented as a cmp followed by a select. InstDesc IgnoredVal(false, nullptr); if (VisitedInsts.insert(UI).second) { - if (isa<PHINode>(UI)) + if (isa<PHINode>(UI)) { PHIs.push_back(UI); - else + } else { + StoreInst *SI = dyn_cast<StoreInst>(UI); + if (SI && SI->getPointerOperand() == Cur) { + // Reduction variable chain can only be stored somewhere but it + // can't be used as an address. + return false; + } NonPHIs.push_back(UI); + } } else if (!isa<PHINode>(UI) && ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) && !isa<SelectInst>(UI)) || @@ -474,6 +526,32 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1) return false; + if (IntermediateStore) { + // Check that stored value goes to the phi node again. This way we make sure + // that the value stored in IntermediateStore is indeed the final reduction + // value. 
+ if (!is_contained(Phi->operands(), IntermediateStore->getValueOperand())) { + LLVM_DEBUG(dbgs() << "Not a final reduction value stored: " + << *IntermediateStore << '\n'); + return false; + } + + // If there is an exit instruction it's value should be stored in + // IntermediateStore + if (ExitInstruction && + IntermediateStore->getValueOperand() != ExitInstruction) { + LLVM_DEBUG(dbgs() << "Last store Instruction of reduction value does not " + "store last calculated value of the reduction: " + << *IntermediateStore << '\n'); + return false; + } + + // If all uses are inside the loop (intermediate stores), then the + // reduction value after the loop will be the one used in the last store. + if (!ExitInstruction) + ExitInstruction = cast<Instruction>(IntermediateStore->getValueOperand()); + } + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; @@ -535,9 +613,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // is saved as part of the RecurrenceDescriptor. // Save the description of this reduction variable. - RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst, - RecurrenceType, IsSigned, IsOrdered, CastInsts, - MinWidthCastToRecurrenceType); + RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind, + FMF, ExactFPMathInst, RecurrenceType, IsSigned, + IsOrdered, CastInsts, MinWidthCastToRecurrenceType); RedDes = RD; return true; @@ -761,7 +839,8 @@ bool RecurrenceDescriptor::hasMultipleUsesOf( bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, - DominatorTree *DT) { + DominatorTree *DT, + ScalarEvolution *SE) { BasicBlock *Header = TheLoop->getHeader(); Function &F = *Header->getParent(); FastMathFlags FMF; @@ -770,72 +849,85 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, FMF.setNoSignedZeros( F.getFnAttribute("no-signed-zeros-fp-math").getValueAsBool()); - if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI." 
<< *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + DT, SE)) { LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + DT, SE)) { LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI." << " PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, - DT)) { + if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); return true; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index b1773db..d0276df 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1993,9 +1993,12 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, for (StoreInst *ST : Stores) { Value *Ptr = ST->getPointerOperand(); - if (isUniform(Ptr)) + if (isUniform(Ptr)) { + // Record store instructions to loop invariant addresses + StoresToInvariantAddresses.push_back(ST); HasDependenceInvolvingLoopInvariantAddress |= !UniformStores.insert(Ptr).second; + } // If we did *not* see this pointer before, insert it to the read-write // list. At this phase it is only a 'write' list. 
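The IVDescriptors and LoopAccessAnalysis changes above teach reduction detection to tolerate stores of the running reduction value to a loop-invariant address. A minimal source-level sketch of the kind of loop this targets (the function and variable names here are illustrative, not taken from the patch or its tests):

// A sum reduction whose running value is also written out on every iteration.
// All stores go to the same loop-invariant address (*out), and the value
// stored last is the final reduction value, so the phi can still be treated
// as a reduction under the checks added above.
void sum_to_invariant_address(const int *a, int n, int *out) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    sum += a[i];
    *out = sum; // intermediate store to a loop-invariant address
  }
}

If the stores went to several different addresses, or to an address that varies with the induction variable, the new checks reject the phi as a reduction, as the LLVM_DEBUG messages above describe.
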
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp index b7806b3..eacd2621 100644 --- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp +++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -103,14 +103,24 @@ static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize, return StepRec == &ElemSize; } -/// Compute the trip count for the given loop \p L. Return the SCEV expression -/// for the trip count or nullptr if it cannot be computed. -static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) { +/// Compute the trip count for the given loop \p L or assume a default value if +/// it is not a compile time constant. Return the SCEV expression for the trip +/// count. +static const SCEV *computeTripCount(const Loop &L, const SCEV &ElemSize, + ScalarEvolution &SE) { const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L); - if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) || - !isa<SCEVConstant>(BackedgeTakenCount)) - return nullptr; - return SE.getTripCountFromExitCount(BackedgeTakenCount); + const SCEV *TripCount = (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && + isa<SCEVConstant>(BackedgeTakenCount)) + ? SE.getTripCountFromExitCount(BackedgeTakenCount) + : nullptr; + + if (!TripCount) { + LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName() + << " could not be computed, using DefaultTripCount\n"); + TripCount = SE.getConstant(ElemSize.getType(), DefaultTripCount); + } + + return TripCount; } //===----------------------------------------------------------------------===// @@ -274,22 +284,18 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L, return 1; } - const SCEV *TripCount = computeTripCount(L, SE); - if (!TripCount) { - LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName() - << " could not be computed, using DefaultTripCount\n"); - const SCEV *ElemSize = Sizes.back(); - TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount); - } + const SCEV *TripCount = computeTripCount(L, *Sizes.back(), SE); + assert(TripCount && "Expecting valid TripCount"); LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n"); - // If the indexed reference is 'consecutive' the cost is - // (TripCount*Stride)/CLS, otherwise the cost is TripCount. - const SCEV *RefCost = TripCount; - + const SCEV *RefCost = nullptr; if (isConsecutive(L, CLS)) { + // If the indexed reference is 'consecutive' the cost is + // (TripCount*Stride)/CLS. const SCEV *Coeff = getLastCoefficient(); const SCEV *ElemSize = Sizes.back(); + assert(Coeff->getType() == ElemSize->getType() && + "Expecting the same type"); const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize); Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType()); const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS); @@ -303,10 +309,33 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L, LLVM_DEBUG(dbgs().indent(4) << "Access is consecutive: RefCost=(TripCount*Stride)/CLS=" << *RefCost << "\n"); - } else + } else { + // If the indexed reference is not 'consecutive' the cost is proportional to + // the trip count and the depth of the dimension which the subject loop + // subscript is accessing. We try to estimate this by multiplying the cost + // by the trip counts of loops corresponding to the inner dimensions. For + // example, given the indexed reference 'A[i][j][k]', and assuming the + // i-loop is in the innermost position, the cost would be equal to the + // iterations of the i-loop multiplied by iterations of the j-loop. 
+ RefCost = TripCount; + + int Index = getSubscriptIndex(L); + assert(Index >= 0 && "Cound not locate a valid Index"); + + for (unsigned I = Index + 1; I < getNumSubscripts() - 1; ++I) { + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(I)); + assert(AR && AR->getLoop() && "Expecting valid loop"); + const SCEV *TripCount = + computeTripCount(*AR->getLoop(), *Sizes.back(), SE); + Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType()); + RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType), + SE.getNoopOrAnyExtend(TripCount, WiderType)); + } + LLVM_DEBUG(dbgs().indent(4) - << "Access is not consecutive: RefCost=TripCount=" << *RefCost - << "\n"); + << "Access is not consecutive: RefCost=" << *RefCost << "\n"); + } + assert(RefCost && "Expecting a valid RefCost"); // Attempt to fold RefCost into a constant. if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost)) @@ -481,6 +510,16 @@ bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const { return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize); } +int IndexedReference::getSubscriptIndex(const Loop &L) const { + for (auto Idx : seq<int>(0, getNumSubscripts())) { + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(Idx)); + if (AR && AR->getLoop() == &L) { + return Idx; + } + } + return -1; +} + const SCEV *IndexedReference::getLastCoefficient() const { const SCEV *LastSubscript = getLastSubscript(); auto *AR = cast<SCEVAddRecExpr>(LastSubscript); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 75381f5..0144ce4 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -282,6 +282,20 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, match(LHS, m_c_And(m_Specific(M), m_Value()))) return true; } + + // X op (Y & ~X) + if (match(RHS, m_c_And(m_Not(m_Specific(LHS)), m_Value())) || + match(LHS, m_c_And(m_Not(m_Specific(RHS)), m_Value()))) + return true; + + // X op ((X & Y) ^ Y) -- this is the canonical form of the previous pattern + // for constant Y. + Value *Y; + if (match(RHS, + m_c_Xor(m_c_And(m_Specific(LHS), m_Value(Y)), m_Deferred(Y))) || + match(LHS, m_c_Xor(m_c_And(m_Specific(RHS), m_Value(Y)), m_Deferred(Y)))) + return true; + // Look for: (A & B) op ~(A | B) { Value *A, *B; diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 5f8fa13..a53b216 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -40,7 +40,7 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor( /// Return true if all of the intrinsic's arguments and return type are scalars /// for the scalar form of the intrinsic, and vectors for the vector form of the /// intrinsic (except operands that are marked as always being scalar by -/// hasVectorIntrinsicScalarOpd). +/// isVectorIntrinsicWithScalarOpAtArg). bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { switch (ID) { case Intrinsic::abs: // Begin integer bit-manipulation. @@ -89,6 +89,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::fmuladd: case Intrinsic::powi: case Intrinsic::canonicalize: + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: return true; default: return false; @@ -96,8 +98,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { } /// Identifies if the vector form of the intrinsic has a scalar operand. 
-bool llvm::hasVectorIntrinsicScalarOpd(Intrinsic::ID ID, - unsigned ScalarOpdIdx) { +bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, + unsigned ScalarOpdIdx) { switch (ID) { case Intrinsic::abs: case Intrinsic::ctlz: @@ -114,11 +116,14 @@ bool llvm::hasVectorIntrinsicScalarOpd(Intrinsic::ID ID, } } -bool llvm::hasVectorIntrinsicOverloadedScalarOpd(Intrinsic::ID ID, - unsigned ScalarOpdIdx) { +bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, + unsigned OpdIdx) { switch (ID) { + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: + return OpdIdx == 0; case Intrinsic::powi: - return (ScalarOpdIdx == 1); + return OpdIdx == 1; default: return false; } diff --git a/llvm/lib/BinaryFormat/Magic.cpp b/llvm/lib/BinaryFormat/Magic.cpp index 5d999a90..d48adb1 100644 --- a/llvm/lib/BinaryFormat/Magic.cpp +++ b/llvm/lib/BinaryFormat/Magic.cpp @@ -225,6 +225,11 @@ file_magic llvm::identify_magic(StringRef Magic) { if (startswith(Magic, "--- !tapi") || startswith(Magic, "---\narchs:")) return file_magic::tapi_file; break; + + case 'D': // DirectX container file - DXBC + if (startswith(Magic, "DXBC") && Magic.size() == 4) + return file_magic::dxcontainer_object; + break; default: break; diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 06830e8..f2b0024 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -1033,7 +1033,32 @@ void MachineFunction::substituteDebugValuesForInst(const MachineInstr &Old, } } -auto MachineFunction::salvageCopySSA(MachineInstr &MI) +auto MachineFunction::salvageCopySSA( + MachineInstr &MI, DenseMap<Register, DebugInstrOperandPair> &DbgPHICache) + -> DebugInstrOperandPair { + const TargetInstrInfo &TII = *getSubtarget().getInstrInfo(); + + // Check whether this copy-like instruction has already been salvaged into + // an operand pair. + Register Dest; + if (auto CopyDstSrc = TII.isCopyInstr(MI)) { + Dest = CopyDstSrc->Destination->getReg(); + } else { + assert(MI.isSubregToReg()); + Dest = MI.getOperand(0).getReg(); + } + + auto CacheIt = DbgPHICache.find(Dest); + if (CacheIt != DbgPHICache.end()) + return CacheIt->second; + + // Calculate the instruction number to use, or install a DBG_PHI. + auto OperandPair = salvageCopySSAImpl(MI); + DbgPHICache.insert({Dest, OperandPair}); + return OperandPair; +} + +auto MachineFunction::salvageCopySSAImpl(MachineInstr &MI) -> DebugInstrOperandPair { MachineRegisterInfo &MRI = getRegInfo(); const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); @@ -1189,6 +1214,7 @@ void MachineFunction::finalizeDebugInstrRefs() { MI.getOperand(1).ChangeToRegister(0, false); }; + DenseMap<Register, DebugInstrOperandPair> ArgDbgPHIs; for (auto &MBB : *this) { for (auto &MI : MBB) { if (!MI.isDebugRef() || !MI.getOperand(0).isReg()) @@ -1211,7 +1237,7 @@ void MachineFunction::finalizeDebugInstrRefs() { // instruction that defines the source value, see salvageCopySSA docs // for why this is important. 
if (DefMI.isCopyLike() || TII->isCopyInstr(DefMI)) { - auto Result = salvageCopySSA(DefMI); + auto Result = salvageCopySSA(DefMI, ArgDbgPHIs); MI.getOperand(0).ChangeToImmediate(Result.first); MI.getOperand(1).setImm(Result.second); } else { diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 6887347..87b8ac5 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -109,7 +109,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, auto *ArgType = Arg.value()->getType(); // Vector calls to intrinsics can still have // scalar operands for specific arguments. - if (hasVectorIntrinsicScalarOpd(IntrinsicID, Arg.index())) { + if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) { ScalarTypes.push_back(ArgType); } else { // The argument in this place should be a vector if diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index e139cf6..e483c3a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -519,7 +519,9 @@ namespace { SDValue XformToShuffleWithZero(SDNode *N); bool reassociationCanBreakAddressingModePattern(unsigned Opc, - const SDLoc &DL, SDValue N0, + const SDLoc &DL, + SDNode *N, + SDValue N0, SDValue N1); SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1); @@ -996,6 +998,7 @@ static bool canSplitIdx(LoadSDNode *LD) { bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, const SDLoc &DL, + SDNode *N, SDValue N0, SDValue N1) { // Currently this only tries to ensure we don't undo the GEP splits done by @@ -1025,7 +1028,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return false; const int64_t CombinedValue = CombinedValueIntVal.getSExtValue(); - for (SDNode *Node : N0->uses()) { + for (SDNode *Node : N->uses()) { auto LoadStore = dyn_cast<MemSDNode>(Node); if (LoadStore) { // Is x[offset2] already not a legal addressing mode? If so then @@ -2447,7 +2450,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { return NewSel; // reassociate add - if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N0, N1)) { + if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N, N0, N1)) { if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags())) return RADD; @@ -15527,7 +15530,7 @@ static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { // This means this is also safe for a signed input and unsigned output, since // a negative input would lead to undefined behavior. 
unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned; - unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned; + unsigned OutputSize = (int)VT.getScalarSizeInBits(); unsigned ActualSize = std::min(InputSize, OutputSize); const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType()); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d667988..90e4b5d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4684,26 +4684,33 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { return false; } +static bool haveNoCommonBitsSetCommutative(SDValue A, SDValue B) { + // Match masked merge pattern (X & ~M) op (Y & M) + // Including degenerate case (X & ~M) op M + auto MatchNoCommonBitsPattern = [&](SDValue NotM, SDValue Other) { + if (isBitwiseNot(NotM, true)) { + SDValue NotOperand = NotM->getOperand(0); + if (Other == NotOperand) + return true; + if (Other->getOpcode() == ISD::AND) + return NotOperand == Other->getOperand(0) || + NotOperand == Other->getOperand(1); + } + return false; + }; + if (A->getOpcode() == ISD::AND) + return MatchNoCommonBitsPattern(A->getOperand(0), B) || + MatchNoCommonBitsPattern(A->getOperand(1), B); + return false; +} + // FIXME: unify with llvm::haveNoCommonBitsSet. bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { assert(A.getValueType() == B.getValueType() && "Values must have the same type"); - // Match masked merge pattern (X & ~M) op (Y & M) - if (A->getOpcode() == ISD::AND && B->getOpcode() == ISD::AND) { - auto MatchNoCommonBitsPattern = [&](SDValue NotM, SDValue And) { - if (isBitwiseNot(NotM, true)) { - SDValue NotOperand = NotM->getOperand(0); - return NotOperand == And->getOperand(0) || - NotOperand == And->getOperand(1); - } - return false; - }; - if (MatchNoCommonBitsPattern(A->getOperand(0), B) || - MatchNoCommonBitsPattern(A->getOperand(1), B) || - MatchNoCommonBitsPattern(B->getOperand(0), A) || - MatchNoCommonBitsPattern(B->getOperand(1), A)) - return true; - } + if (haveNoCommonBitsSetCommutative(A, B) || + haveNoCommonBitsSetCommutative(B, A)) + return true; return KnownBits::haveNoCommonBitsSet(computeKnownBits(A), computeKnownBits(B)); } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 9732a17..b209aecf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9044,7 +9044,9 @@ void TargetLowering::expandUADDSUBO( if (IsAdd && isOneConstant(RHS)) { // Special case: uaddo X, 1 overflowed if X+1 is 0. This potential reduces // the live range of X. We assume comparing with 0 is cheap. - // TODO: This generalizes to (X + C) < C. + // The general case (X + C) < C is not necessarily beneficial. Although we + // reduce the live range of X, we may introduce the materialization of + // constant C. SetCC = DAG.getSetCC(dl, SetCCType, Result, DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETEQ); diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 3cade80..1aa2d44 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -2029,3 +2029,14 @@ void AttributeFuncs::mergeAttributesForOutlining(Function &Base, // that aspect in the merged function. 
mergeFnAttrs(Base, ToMerge); } + +void AttributeFuncs::updateMinLegalVectorWidthAttr(Function &Fn, + uint64_t Width) { + Attribute Attr = Fn.getFnAttribute("min-legal-vector-width"); + if (Attr.isValid()) { + uint64_t OldWidth; + Attr.getValueAsString().getAsInteger(0, OldWidth); + if (Width > OldWidth) + Fn.addFnAttr("min-legal-vector-width", llvm::utostr(Width)); + } +} diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 5dcf1ba..c182513 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -2068,6 +2068,17 @@ Constant *ConstantExpr::getTruncOrBitCast(Constant *C, Type *Ty) { return getTrunc(C, Ty); } +Constant *ConstantExpr::getSExtOrTrunc(Constant *C, Type *Ty) { + assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() && + "Can only sign extend/truncate integers!"); + Type *CTy = C->getType(); + if (CTy->getScalarSizeInBits() < Ty->getScalarSizeInBits()) + return getSExt(C, Ty); + if (CTy->getScalarSizeInBits() > Ty->getScalarSizeInBits()) + return getTrunc(C, Ty); + return C; +} + Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) { assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) && diff --git a/llvm/lib/Object/Binary.cpp b/llvm/lib/Object/Binary.cpp index 67ed44a..1703f76 100644 --- a/llvm/lib/Object/Binary.cpp +++ b/llvm/lib/Object/Binary.cpp @@ -84,6 +84,7 @@ Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer, case file_magic::unknown: case file_magic::cuda_fatbinary: case file_magic::coff_cl_gl_object: + case file_magic::dxcontainer_object: // Unrecognized object file format. return errorCodeToError(object_error::invalid_file_type); case file_magic::minidump: diff --git a/llvm/lib/Object/CMakeLists.txt b/llvm/lib/Object/CMakeLists.txt index 0825210..ba612e3 100644 --- a/llvm/lib/Object/CMakeLists.txt +++ b/llvm/lib/Object/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_component_library(LLVMObject COFFModuleDefinition.cpp COFFObjectFile.cpp Decompressor.cpp + DXContainer.cpp ELF.cpp ELFObjectFile.cpp Error.cpp diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp new file mode 100644 index 0000000..e1aea562 --- /dev/null +++ b/llvm/lib/Object/DXContainer.cpp @@ -0,0 +1,44 @@ +//===- DXContainer.cpp - DXContainer object file implementation -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/DXContainer.h" +#include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Object/Error.h" + +using namespace llvm; +using namespace llvm::object; + +static Error parseFailed(const Twine &Msg) { + return make_error<GenericBinaryError>(Msg.str(), object_error::parse_failed); +} + +template <typename T> +static Error readStruct(StringRef Buffer, const char *P, T &Struct) { + // Don't read before the beginning or past the end of the file + if (P < Buffer.begin() || P + sizeof(T) > Buffer.end()) + return parseFailed("Reading structure out of file bounds"); + + memcpy(&Struct, P, sizeof(T)); + // DXContainer is always BigEndian + if (sys::IsBigEndianHost) + Struct.byteSwap(); + return Error::success(); +} + +DXContainer::DXContainer(MemoryBufferRef O) : Data(O) {} + +Error DXContainer::parseHeader() { + return readStruct(Data.getBuffer(), Data.getBuffer().data(), Header); +} + +Expected<DXContainer> DXContainer::create(MemoryBufferRef Object) { + DXContainer Container(Object); + if (Error Err = Container.parseHeader()) + return std::move(Err); + return Container; +} diff --git a/llvm/lib/Object/ObjectFile.cpp b/llvm/lib/Object/ObjectFile.cpp index fed6726..609dfae 100644 --- a/llvm/lib/Object/ObjectFile.cpp +++ b/llvm/lib/Object/ObjectFile.cpp @@ -147,6 +147,7 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, file_magic Type, case file_magic::minidump: case file_magic::goff_object: case file_magic::cuda_fatbinary: + case file_magic::dxcontainer_object: return errorCodeToError(object_error::invalid_file_type); case file_magic::tapi_file: return errorCodeToError(object_error::invalid_file_type); diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 4c92502..3e5fff9 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -918,21 +918,34 @@ static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) { return I - 1; } -// Windows treats whitespace, double quotes, and backslashes specially. +// Windows treats whitespace, double quotes, and backslashes specially, except +// when parsing the first token of a full command line, in which case +// backslashes are not special. static bool isWindowsSpecialChar(char C) { return isWhitespaceOrNull(C) || C == '\\' || C == '\"'; } +static bool isWindowsSpecialCharInCommandName(char C) { + return isWhitespaceOrNull(C) || C == '\"'; +} // Windows tokenization implementation. The implementation is designed to be // inlined and specialized for the two user entry points. -static inline void -tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, - function_ref<void(StringRef)> AddToken, - bool AlwaysCopy, function_ref<void()> MarkEOL) { +static inline void tokenizeWindowsCommandLineImpl( + StringRef Src, StringSaver &Saver, function_ref<void(StringRef)> AddToken, + bool AlwaysCopy, function_ref<void()> MarkEOL, bool InitialCommandName) { SmallString<128> Token; + // Sometimes, this function will be handling a full command line including an + // executable pathname at the start. In that situation, the initial pathname + // needs different handling from the following arguments, because when + // CreateProcess or cmd.exe scans the pathname, it doesn't treat \ as + // escaping the quote character, whereas when libc scans the rest of the + // command line, it does. 
+ bool CommandName = InitialCommandName; + // Try to do as much work inside the state machine as possible. enum { INIT, UNQUOTED, QUOTED } State = INIT; + for (size_t I = 0, E = Src.size(); I < E; ++I) { switch (State) { case INIT: { @@ -947,19 +960,29 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, if (I >= E) break; size_t Start = I; - while (I < E && !isWindowsSpecialChar(Src[I])) - ++I; + if (CommandName) { + while (I < E && !isWindowsSpecialCharInCommandName(Src[I])) + ++I; + } else { + while (I < E && !isWindowsSpecialChar(Src[I])) + ++I; + } StringRef NormalChars = Src.slice(Start, I); if (I >= E || isWhitespaceOrNull(Src[I])) { // No special characters: slice out the substring and start the next // token. Copy the string if the caller asks us to. AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars); - if (I < E && Src[I] == '\n') + if (I < E && Src[I] == '\n') { MarkEOL(); + CommandName = InitialCommandName; + } else { + CommandName = false; + } } else if (Src[I] == '\"') { Token += NormalChars; State = QUOTED; } else if (Src[I] == '\\') { + assert(!CommandName && "or else we'd have treated it as a normal char"); Token += NormalChars; I = parseBackslash(Src, I, Token); State = UNQUOTED; @@ -976,12 +999,16 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, // token. AddToken(Saver.save(Token.str())); Token.clear(); - if (Src[I] == '\n') + if (Src[I] == '\n') { + CommandName = InitialCommandName; MarkEOL(); + } else { + CommandName = false; + } State = INIT; } else if (Src[I] == '\"') { State = QUOTED; - } else if (Src[I] == '\\') { + } else if (Src[I] == '\\' && !CommandName) { I = parseBackslash(Src, I, Token); } else { Token.push_back(Src[I]); @@ -999,7 +1026,7 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, // Otherwise, end the quoted portion and return to the unquoted state. 
State = UNQUOTED; } - } else if (Src[I] == '\\') { + } else if (Src[I] == '\\' && !CommandName) { I = parseBackslash(Src, I, Token); } else { Token.push_back(Src[I]); @@ -1008,7 +1035,7 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, } } - if (State == UNQUOTED) + if (State != INIT) AddToken(Saver.save(Token.str())); } @@ -1021,7 +1048,7 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver, NewArgv.push_back(nullptr); }; tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, - /*AlwaysCopy=*/true, OnEOL); + /*AlwaysCopy=*/true, OnEOL, false); } void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver, @@ -1029,7 +1056,19 @@ void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver, auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); }; auto OnEOL = []() {}; tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false, - OnEOL); + OnEOL, false); +} + +void cl::TokenizeWindowsCommandLineFull(StringRef Src, StringSaver &Saver, + SmallVectorImpl<const char *> &NewArgv, + bool MarkEOLs) { + auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); }; + auto OnEOL = [&]() { + if (MarkEOLs) + NewArgv.push_back(nullptr); + }; + tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, + /*AlwaysCopy=*/true, OnEOL, true); } void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver, diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index 98272bb..976599f 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -296,6 +296,12 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { } } + if (Implementer == "0xc0") { // Ampere Computing + return StringSwitch<const char *>(Part) + .Case("0xac3", "ampere1") + .Default("generic"); + } + return "generic"; } diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc index dfaab16..e415674 100644 --- a/llvm/lib/Support/Windows/Process.inc +++ b/llvm/lib/Support/Windows/Process.inc @@ -247,7 +247,7 @@ windows::GetCommandLineArguments(SmallVectorImpl<const char *> &Args, SmallVector<const char *, 20> TmpArgs; StringSaver Saver(Alloc); - cl::TokenizeWindowsCommandLine(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false); + cl::TokenizeWindowsCommandLineFull(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false); for (const char *Arg : TmpArgs) { EC = WildcardExpand(Arg, Args, Saver); @@ -255,6 +255,9 @@ windows::GetCommandLineArguments(SmallVectorImpl<const char *> &Args, return EC; } + if (Args.size() == 0) + return std::make_error_code(std::errc::invalid_argument); + SmallVector<char, MAX_PATH> Arg0(Args[0], Args[0] + strlen(Args[0])); SmallVector<char, MAX_PATH> Filename; sys::path::remove_filename(Arg0); diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index bd6deb5..2682b9b 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -567,6 +567,7 @@ include "AArch64Schedule.td" include "AArch64InstrInfo.td" include "AArch64SchedPredicates.td" include "AArch64SchedPredExynos.td" +include "AArch64SchedPredAmpere.td" include "AArch64Combine.td" def AArch64InstrInfo : InstrInfo; @@ -636,6 +637,7 @@ include "AArch64SchedThunderX2T99.td" include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" +include "AArch64SchedAmpere1.td" def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors">; @@ -956,6 +958,16 @@ def TuneTSV110 : 
SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110", FeatureFuseAES, FeaturePostRAScheduler]>; +def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", + "Ampere Computing Ampere-1 processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureLSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals]>; def ProcessorFeatures { list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, @@ -1067,6 +1079,8 @@ def ProcessorFeatures { list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureFP16FML, FeatureDotProd]; + list<SubtargetFeature> Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, + FeatureMTE, FeatureSSBS]; // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. The extensions do not @@ -1205,6 +1219,10 @@ def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX, def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel, [TuneCarmel]>; +// Ampere Computing +def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, + [TuneAmpere1]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index c367d2d..71911b6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5092,12 +5092,19 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &OffImm) { const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root); const DataLayout &DL = CurDAG->getDataLayout(); + const MachineFrameInfo &MFI = MF->getFrameInfo(); if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); - return true; + // We can only encode VL scaled offsets, so only fold in frame indexes + // referencing SVE objects. + if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) { + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); + return true; + } + + return false; } if (MemVT == EVT()) @@ -5124,7 +5131,10 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + // We can only encode VL scaled offsets, so only fold in frame indexes + // referencing SVE objects. + if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); } OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64); diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td new file mode 100644 index 0000000..32f7299 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td @@ -0,0 +1,1136 @@ +//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Ampere Computing Ampere-1 to +// support instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +// The Ampere-1 core is an out-of-order micro-architecture. The front +// end has branch prediction, with a 10-cycle recovery time from a +// mispredicted branch. Instructions coming out of the front end are +// decoded into internal micro-ops (uops). + +def Ampere1Model : SchedMachineModel { + let IssueWidth = 4; // 4-way decode and dispatch + let MicroOpBufferSize = 174; // micro-op re-order buffer size + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 10; // Branch mispredict penalty + let LoopMicroOpBufferSize = 32; // Instruction queue size + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F); +} + +let SchedModel = Ampere1Model in { + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Ampere-1. +// Ampere-1 has 12 pipelines that 8 independent scheduler (4 integer, 2 FP, +// and 2 memory) issue into. The integer and FP schedulers can each issue +// one uop per cycle, while the memory schedulers can each issue one load +// and one store address calculation per cycle. + +def Ampere1UnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w +def Ampere1UnitB : ProcResource<2>; // integer single-cycle, and complex shifts +def Ampere1UnitBS : ProcResource<1>; // integer multi-cycle +def Ampere1UnitL : ProcResource<2>; // load +def Ampere1UnitS : ProcResource<2>; // store address calculation +def Ampere1UnitX : ProcResource<1>; // FP and vector operations, and flag write +def Ampere1UnitY : ProcResource<1>; // FP and vector operations, and crypto +def Ampere1UnitZ : ProcResource<1>; // FP store data and FP-to-integer moves + +def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>; +def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>; + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Ampere-1. 
+ +def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS, + Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 4; + 
let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 1; +} + +def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 7; + let NumMicroOps = 1; +} + +def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_2L_2XY : 
SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 4; +} + +def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA, + Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 5; +} + +def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 16; +} + +def Ampere1Write_10cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 6; +} + +def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 10; + let 
NumMicroOps = 3; +} + +def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> { + let Latency = 11; + let NumMicroOps = 2; +} + +def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 3; +} + +def Ampere1Write_12cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 4; +} + +def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 18; + let NumMicroOps = 1; +} + +def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 19; + let NumMicroOps = 1; +} + +def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 25; + let NumMicroOps = 1; +} + +def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 32; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 39; + let NumMicroOps = 1; +} + +def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 62; + let NumMicroOps = 1; +} + +// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4), +// which are a single uop, and for extended registers, which have full flexibility +// across Unit A or B for both uops. +def Ampere1Write_Arith : SchedWriteVariant<[ + SchedVar<RegExtendedPred, [Ampere1Write_2cyc_2AB]>, + SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1AB]>, + SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1AB]>]>; + +def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[ + SchedVar<RegExtendedPred, [Ampere1Write_2cyc_1AB_1A]>, + SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1A]>, + SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1A]>]>; + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latencies for Ampere-1. +// This provides a coarse model, which is then specialised below. 
+
+def : WriteRes<WriteImm, [Ampere1UnitAB]>;  // MOVN, MOVZ
+def : WriteRes<WriteI, [Ampere1UnitAB]>;  // ALU
+def : WriteRes<WriteISReg, [Ampere1UnitB, Ampere1UnitA]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}  // ALU of Shifted-Reg
+def : WriteRes<WriteIEReg, [Ampere1UnitAB, Ampere1UnitA]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}  // ALU of Extended-Reg
+def : WriteRes<WriteExtr, [Ampere1UnitB]>;  // EXTR shifts a reg pair
+def : WriteRes<WriteIS, [Ampere1UnitB]>;  // Shift/Scale
+def : WriteRes<WriteID32, [Ampere1UnitBS]> {
+  let Latency = 18;
+}  // 32-bit Divide
+def : WriteRes<WriteID64, [Ampere1UnitBS]> {
+  let Latency = 34;
+}  // 64-bit Divide
+def : WriteRes<WriteIM32, [Ampere1UnitBS]> {
+  let Latency = 3;
+}  // 32-bit Multiply
+def : WriteRes<WriteIM64, [Ampere1UnitBS]> {
+  let Latency = 3;
+}  // 64-bit Multiply
+def : WriteRes<WriteBr, [Ampere1UnitA]>;
+def : WriteRes<WriteBrReg, [Ampere1UnitA, Ampere1UnitA]>;
+def : WriteRes<WriteLD, [Ampere1UnitL]> {
+  let Latency = 4;
+}  // Load from base addr plus immediate offset
+def : WriteRes<WriteST, [Ampere1UnitS]> {
+  let Latency = 1;
+}  // Store to base addr plus immediate offset
+def : WriteRes<WriteSTP, [Ampere1UnitS, Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}  // Store a register pair.
+def : WriteRes<WriteAdr, [Ampere1UnitAB]>;
+def : WriteRes<WriteLDIdx, [Ampere1UnitAB, Ampere1UnitS]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}  // Load from a register index (maybe scaled).
+def : WriteRes<WriteSTIdx, [Ampere1UnitS, Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}  // Store to a register index (maybe scaled).
+def : WriteRes<WriteF, [Ampere1UnitXY]> {
+  let Latency = 2;
+}  // General floating-point ops.
+def : WriteRes<WriteFCmp, [Ampere1UnitX]> {
+  let Latency = 5;
+}  // Floating-point compare.
+def : WriteRes<WriteFCvt, [Ampere1UnitXY]> {
+  let Latency = 6;
+}  // Float conversion.
+def : WriteRes<WriteFCopy, [Ampere1UnitXY]> {
+}  // Float-int register copy.
+def : WriteRes<WriteFImm, [Ampere1UnitXY]> {
+  let Latency = 2;
+}  // Floating-point immediate.
+def : WriteRes<WriteFMul, [Ampere1UnitXY]> {
+  let Latency = 5;
+}  // Floating-point multiply.
+def : WriteRes<WriteFDiv, [Ampere1UnitXY]> {
+  let Latency = 34;
+}  // Floating-point division.
+def : WriteRes<WriteVd, [Ampere1UnitXY]> {
+  let Latency = 3;
+}  // 64bit Vector D ops.
+def : WriteRes<WriteVq, [Ampere1UnitXY]> {
+  let Latency = 3;
+}  // 128bit Vector Q ops.
+def : WriteRes<WriteVLD, [Ampere1UnitL, Ampere1UnitL]> {
+  let Latency = 5;
+}  // Vector loads.
+def : WriteRes<WriteVST, [Ampere1UnitS, Ampere1UnitZ]> {
+  let Latency = 2;
+}  // Vector stores.
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> {
+  let Latency = 4;
+}  // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
+
+// Forwarding logic.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 1, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadST, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+//===----------------------------------------------------------------------===//
+// Specialising the scheduling model further for Ampere-1.
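The coarse WriteRes defaults above and the per-instruction InstRW overrides that follow are merged by TableGen into the generated Ampere1Model, and consumers read the resolved numbers through the usual scheduling API rather than from either table directly. A minimal sketch of such a query, assuming only the public TargetSchedModel interface (the function and variable names are illustrative, not part of this patch):

// Sketch: read the latency that the Ampere1 model (WriteRes defaults plus
// any InstRW override) assigns to a machine instruction.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetSchedule.h"

unsigned resolvedLatency(const llvm::MachineFunction &MF,
                         const llvm::MachineInstr &MI) {
  llvm::TargetSchedModel SchedModel;
  SchedModel.init(&MF.getSubtarget()); // picks up Ampere1Model for -mcpu=ampere1
  // An InstRW override (e.g. Ampere1Write_4cyc_1X for SHA256H) takes
  // precedence over the coarse WriteRes mapping when both match.
  return SchedModel.computeInstrLatency(&MI);
}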
+ +def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>; + +// Branch instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>; + +// Cryptography instructions +// -- AES encryption/decryption +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>; +// -- Polynomial multiplication +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; +// -- SHA-256 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>; +// -- SHA-256 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>; +// -- SHA-3 instructions +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; +// -- SHA-512 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>; +// -- SHA-512 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>; +// -- SHA1 choose/majority/parity +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>; +// -- SHA1 hash/schedule update +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>; +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>; + +// FP and vector load instructions +// -- Load 1-element structure to one/all lanes +// ---- all lanes +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// ---- one lane +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1i(8|16|32|64)")>; +// -- Load 1-element structure to one/all lanes, 1D size +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Rv1d")>; +// -- Load 1-element structures to 1 register +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 2 registers +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 3 registers +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 2-element structure to all lanes of 2 registers, 1D size +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD2Rv1d")>; +// -- Load 2-element structure to all lanes of 2 registers, other sizes +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 2-element structure to one lane of 2 registers +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2i(8|16|32|64)")>; +// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Twov(16b|8h|4s|2d)")>; +// -- Load 2-element structures to 2 registers, 8B/4H/2S size +def : InstRW<[Ampere1Write_9cyc_2L_3XY], + (instregex "^LD2Twov(8b|4h|2s)")>; +// -- Load 3-element structure to all lanes of 3 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD3Rv1d")>; +// -- Load 3-element structure to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element structure to one lane of 3 registers +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes +def : 
InstRW<[Ampere1Write_9cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_10cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1Write_9cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1Write_12cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1Write_11cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 2 registers +def : InstRW<[Ampere1Write_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1Write_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1Write_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 3-element structures from 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4i(8|16|32|64)")>; +// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures 
from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1Write_4cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>; +def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY], + (instregex "^FCSEL(S|D)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>; +def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : 
InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1Write_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>; +def : InstRW<[Ampere1Write_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(ADC|SBC)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1Write_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1Write_4cyc_1BS], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : InstRW<[Ampere1Write_4cyc_2L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_5cyc_1AB_1L], + (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1L], + (instrs PRFMl, PRFUMi, PRFUMi)>; +def : InstRW<[Ampere1Write_2cyc_1AB_1L], + (instrs PRFMroW, PRFMroX)>; + +// Integer miscellaneous instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex 
"EXTR(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; +def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>; +def : InstRW<[Ampere1Write_1cyc_1AB], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; + +// Integer store instructions +def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>; +def : InstRW<[Ampere1Write_2cyc_1B_1S], + (instrs STPWi, STPXi)>; +def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB], + (instregex "STP(W|X)(pre|post)")>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instregex "STUR(BB|HH|X|W)i", + "STR(X|W)ui", + "STUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>; +def : InstRW<[Ampere1Write_2cyc_1AB_2S], (instrs STRWroW, STRXroW)>; + +// Pointer authentication +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>; +def : InstRW<[Ampere1Write_8cyc_1BS_1A], + (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; +def : InstRW<[Ampere1Write_8cyc_1BS_2A], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>; +def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>; +def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>; + +// Vector integer instructions +// -- absolute difference +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", + "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; +// -- arithmetic +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", + "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", + "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; +// -- arithmetic, horizontal, 16B +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^[SU](MIN|MAX)Vv16i8v")>; +// -- arithmetic, horizontal, 4H/4S +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; +// -- arithmetic, horizontal, 8B/8H +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; +// -- arithmetic, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; +// -- arithmetic, pairwise +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; +// -- arithmetic, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>; +// -- 
dot product +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1Write_6cyc_2XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1Write_2cyc_1XY], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1Write_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1Write_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1Write_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; + +} // SchedModel = Ampere1Model diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td new file mode 100644 index 0000000..8552c07 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td @@ -0,0 +1,25 @@ +//===- AArch64SchedPredAmpere.td - AArch64 Sched Preds 
-----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are used by the +// AArch64 Ampere Computing processors. +// +//===----------------------------------------------------------------------===// + +// Auxiliary predicates. + +// Check for a LSL shift <= 4 +def AmpereCheapLSL : MCSchedPredicate< + CheckAny<[CheckShiftBy0, + CheckAll< + [CheckShiftLSL, + CheckAny< + [CheckShiftBy1, + CheckShiftBy2, + CheckShiftBy3, + CheckShiftBy4]>]>]>>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td index 5402b8b..4473f3a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td @@ -53,7 +53,7 @@ let FunctionMapper = "AArch64_AM::getShiftType" in { } // Check for shifting in arithmetic and logic instructions. -foreach I = {0-3, 8} in { +foreach I = {0-4, 8} in { let FunctionMapper = "AArch64_AM::getShiftValue" in def CheckShiftBy#I : CheckImmOperand<3, I>; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index b3eb65d..f9b7ca8 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -238,6 +238,12 @@ void AArch64Subtarget::initializeProperties() { // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; + case Ampere1: + CacheLineSize = 64; + PrefFunctionLogAlignment = 6; + PrefLoopLogAlignment = 6; + MaxInterleaveFactor = 4; + break; } } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index e919263..d7878c4 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -40,6 +40,7 @@ public: enum ARMProcFamilyEnum : uint8_t { Others, A64FX, + Ampere1, AppleA7, AppleA10, AppleA11, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a184159..135b94d 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -371,6 +371,49 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return Entry->Cost; break; } + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: { + if (ICA.getArgTypes().empty()) + break; + bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; + auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); + EVT MTy = TLI->getValueType(DL, RetTy); + // Check for the legal types, which are where the size of the input and the + // output are the same, or we are using cvt f64->i32 or f32->i64. 
+ if ((LT.second == MVT::f32 || LT.second == MVT::f64 || + LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || + LT.second == MVT::v2f64) && + (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || + (LT.second == MVT::f64 && MTy == MVT::i32) || + (LT.second == MVT::f32 && MTy == MVT::i64))) + return LT.first; + // Similarly for fp16 sizes + if (ST->hasFullFP16() && + ((LT.second == MVT::f16 && MTy == MVT::i32) || + ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && + (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) + return LT.first; + + // Otherwise we use a legal convert followed by a min+max + if ((LT.second.getScalarType() == MVT::f32 || + LT.second.getScalarType() == MVT::f64 || + (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && + LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { + Type *LegalTy = + Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); + if (LT.second.isVector()) + LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); + InstructionCost Cost = 1; + IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, + LegalTy, {LegalTy, LegalTy}); + Cost += getIntrinsicInstrCost(Attrs1, CostKind); + IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, + LegalTy, {LegalTy, LegalTy}); + Cost += getIntrinsicInstrCost(Attrs2, CostKind); + return LT.first * Cost; + } + break; + } default: break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 805b6c7..bfe2e9b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -126,7 +126,6 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16( LLT::scalar(64)); const LLT S32 = LLT::scalar(32); - B.setMBB(*MI.getParent()); B.setInstrAndDebugLoc(MI); auto Unmerge = B.buildUnmerge(S32, Src); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 76c8edc..a8310c2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -737,7 +737,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Custom); } } @@ -4772,6 +4772,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SCALAR_TO_VECTOR: + return lowerSCALAR_TO_VECTOR(Op, DAG); case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); case ISD::FP_ROUND: @@ -5768,14 +5770,11 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, EVT EltVT = VecVT.getVectorElementType(); unsigned VecSize = VecVT.getSizeInBits(); unsigned EltSize = EltVT.getSizeInBits(); + SDLoc SL(Op); - - assert(VecSize <= 64); - + // Specially handle the case of v4i16 with static indexing. 
unsigned NumElts = VecVT.getVectorNumElements(); - SDLoc SL(Op); auto KIdx = dyn_cast<ConstantSDNode>(Idx); - if (NumElts == 4 && EltSize == 16 && KIdx) { SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec); @@ -5803,35 +5802,41 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); } + // Static indexing does not lower to stack access, and hence there is no need + // for special custom lowering to avoid stack access. if (isa<ConstantSDNode>(Idx)) return SDValue(); - MVT IntVT = MVT::getIntegerVT(VecSize); - - // Avoid stack access for dynamic indexing. + // Avoid stack access for dynamic indexing by custom lowering to // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - // Create a congruent vector with the target value in each element so that - // the required element can be masked and ORed into the target vector. - SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, - DAG.getSplatBuildVector(VecVT, SL, InsVal)); + assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits"); + MVT IntVT = MVT::getIntegerVT(VecSize); + + // Convert vector index to bit-index and get the required bit mask. assert(isPowerOf2_32(EltSize)); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); - - // Convert vector index to bit-index. SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); - - SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, DAG.getConstant(0xffff, SL, IntVT), ScaledIdx); + // 1. Create a congruent vector with the target value in each element. + SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, + DAG.getSplatBuildVector(VecVT, SL, InsVal)); + + // 2. Mask off all other indicies except the required index within (1). SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); + + // 3. Mask off the required index within the target vector. + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec); + // 4. Get (2) and (3) ORed into the target vector. SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS); + return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); } @@ -5954,6 +5959,22 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces); } +SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDValue SVal = Op.getOperand(0); + EVT ResultVT = Op.getValueType(); + EVT SValVT = SVal.getValueType(); + SDValue UndefVal = DAG.getUNDEF(SValVT); + SDLoc SL(Op); + + SmallVector<SDValue, 8> VElts; + VElts.push_back(SVal); + for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I) + VElts.push_back(UndefVal); + + return DAG.getBuildVector(ResultVT, SL, VElts); +} + SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -10661,39 +10682,64 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); } -SDValue SITargetLowering::performAddCombine(SDNode *N, +// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z). 
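Before the refactored combine that follows, a scalar restatement of the identity it relies on; this helper is purely illustrative and not part of the AMDGPU backend:

// Scalar model of what V_MAD_U64_U32 computes: a 32x32->64 multiply fused
// with a 64-bit add. This is why the combine first proves both multiply
// operands fit in 32 bits (numBitsUnsigned/numBitsSigned <= 32) before
// emitting the MAD node.
#include <cstdint>

uint64_t mad_u64_u32(uint32_t X, uint32_t Y, uint64_t Z) {
  return static_cast<uint64_t>(X) * Y + Z; // promoted to a full 64-bit product
}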
+SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { + assert(N->getOpcode() == ISD::ADD); + SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); SDLoc SL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) - && Subtarget->hasMad64_32() && - !VT.isVector() && VT.getScalarSizeInBits() > 32 && - VT.getScalarSizeInBits() <= 64) { - if (LHS.getOpcode() != ISD::MUL) - std::swap(LHS, RHS); + if (VT.isVector()) + return SDValue(); - SDValue MulLHS = LHS.getOperand(0); - SDValue MulRHS = LHS.getOperand(1); - SDValue AddRHS = RHS; + unsigned NumBits = VT.getScalarSizeInBits(); + if (NumBits <= 32 || NumBits > 64) + return SDValue(); - // TODO: Maybe restrict if SGPR inputs. - if (numBitsUnsigned(MulLHS, DAG) <= 32 && - numBitsUnsigned(MulRHS, DAG) <= 32) { - MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32); - MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32); - AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64); - return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false); - } + if (LHS.getOpcode() != ISD::MUL) { + assert(RHS.getOpcode() == ISD::MUL); + std::swap(LHS, RHS); + } + + SDValue MulLHS = LHS.getOperand(0); + SDValue MulRHS = LHS.getOperand(1); + SDValue AddRHS = RHS; + + // TODO: Maybe restrict if SGPR inputs. + if (numBitsUnsigned(MulLHS, DAG) <= 32 && + numBitsUnsigned(MulRHS, DAG) <= 32) { + MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32); + MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32); + AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64); + return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false); + } + + if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) { + MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32); + MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32); + AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64); + return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true); + } + + return SDValue(); +} + +SDValue SITargetLowering::performAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); - if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) { - MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32); - MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32); - AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64); - return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true); + if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; } return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 72a00c1..18bb9fb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -151,6 +151,7 @@ private: SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; @@ -197,6 +198,7 @@ private: SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) 
const; + SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index c2cc302..b2765b2 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2562,8 +2562,9 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Make sure the LiveIns are still sorted and unique. MBB->sortUniqueLiveIns(); // Replace the edges to PrologueMBB by edges to the sequences - // we are about to add. - MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]); + // we are about to add, but only update for immediate predecessors. + if (MBB->isSuccessor(&PrologueMBB)) + MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]); } // The required stack size that is aligned to ARM constant criterion. diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index bbd5f5f..44a323d 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -376,7 +376,8 @@ def ProcessorFeatures { FeaturePartwordAtomic, FeatureQuadwordAtomic, FeaturePredictableSelectIsExpensive, - FeatureISA2_07 + FeatureISA2_07, + FeatureCRBits ]; list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion, diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 9d8290f..5b28f0d 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -32,7 +32,7 @@ class PassRegistry; bool lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP); -bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, +bool lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &MCOp, const AsmPrinter &AP); FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM); diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 7d1ec2a..5b2a247 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -63,7 +63,7 @@ public: // Wrapper needed for tblgenned pseudo lowering. bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { - return LowerRISCVMachineOperandToMCOperand(MO, MCOp, *this); + return lowerRISCVMachineOperandToMCOperand(MO, MCOp, *this); } void emitStartOfAsmFile(Module &M) override; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index de9c151..7fae031 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1121,16 +1121,15 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { SDValue V0 = CurDAG->getRegister(RISCV::V0, VT); // Otherwise use - // vmslt{u}.vx vd, va, x, v0.t; if mask policy is agnostic. + // vmslt{u}.vx vd, va, x, v0.t; vmxor.mm vd, vd, v0 + // The result is mask undisturbed. + // We use the same instructions to emulate mask agnostic behavior, because + // the agnostic result can be either undisturbed or all 1. SDValue Cmp = SDValue( CurDAG->getMachineNode(VMSLTMaskOpcode, DL, VT, {MaskedOff, Src1, Src2, V0, VL, SEW, Glue}), 0); - if (MaskedOff.isUndef()) { - ReplaceNode(Node, Cmp.getNode()); - return; - } - // Need vmxor.mm vd, vd, v0 to assign inactive value. + // vmxor.mm vd, vd, v0 is used to update active value. 
ReplaceNode(Node, CurDAG->getMachineNode(VMXOROpcode, DL, VT, {Cmp, Mask, VL, MaskSEW})); return; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4cb3188..ff63b22 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -6918,7 +6918,10 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SDValue Overflow; if (IsAdd && isOneConstant(RHS)) { // Special case uaddo X, 1 overflowed if the addition result is 0. - // FIXME: We can do this for any constant RHS by using (X + C) < C. + // The general case (X + C) < C is not necessarily beneficial. Although we + // reduce the live range of X, we may introduce the materialization of + // constant C, especially when the setcc result is used by branch. We have + // no compare with constant and branch instructions. Overflow = DAG.getSetCC(DL, N->getValueType(1), Res, DAG.getConstant(0, DL, MVT::i64), ISD::SETEQ); } else { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index f7a1998..3831dc5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -114,11 +114,11 @@ defm : FPFMADynFrmAlias_m<FNMSUB_D, "fnmsub.d", DINX>; defm : FPFMADynFrmAlias_m<FNMADD_D, "fnmadd.d", DINX>; let SchedRW = [WriteFALU64, ReadFALU64, ReadFALU64] in { -defm FADD_D : FPALU_rr_frm_m<0b0000001, "fadd.d", DINX>; +defm FADD_D : FPALU_rr_frm_m<0b0000001, "fadd.d", DINX, /*Commutable*/1>; defm FSUB_D : FPALU_rr_frm_m<0b0000101, "fsub.d", DINX>; } let SchedRW = [WriteFMul64, ReadFMul64, ReadFMul64] in -defm FMUL_D : FPALU_rr_frm_m<0b0001001, "fmul.d", DINX>; +defm FMUL_D : FPALU_rr_frm_m<0b0001001, "fmul.d", DINX, /*Commutable*/1>; let SchedRW = [WriteFDiv64, ReadFDiv64, ReadFDiv64] in defm FDIV_D : FPALU_rr_frm_m<0b0001101, "fdiv.d", DINX>; @@ -140,8 +140,8 @@ defm FSGNJX_D : FPALU_rr_m<0b0010001, 0b010, "fsgnjx.d", DINX>; } let SchedRW = [WriteFMinMax64, ReadFMinMax64, ReadFMinMax64] in { -defm FMIN_D : FPALU_rr_m<0b0010101, 0b000, "fmin.d", DINX>; -defm FMAX_D : FPALU_rr_m<0b0010101, 0b001, "fmax.d", DINX>; +defm FMIN_D : FPALU_rr_m<0b0010101, 0b000, "fmin.d", DINX, /*Commutable*/1>; +defm FMAX_D : FPALU_rr_m<0b0010101, 0b001, "fmax.d", DINX, /*Commutable*/1>; } defm FCVT_S_D : FPUnaryOp_r_frm_m<0b0100000, 0b00001, FDINX, "fcvt.s.d">, @@ -152,7 +152,7 @@ defm FCVT_D_S : FPUnaryOp_r_m<0b0100001, 0b00000, 0b000, DFINX, "fcvt.d.s">, Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]>; let SchedRW = [WriteFCmp64, ReadFCmp64, ReadFCmp64] in { -defm FEQ_D : FPCmp_rr_m<0b1010001, 0b010, "feq.d", DINX>; +defm FEQ_D : FPCmp_rr_m<0b1010001, 0b010, "feq.d", DINX, /*Commutable*/1>; defm FLT_D : FPCmp_rr_m<0b1010001, 0b001, "flt.d", DINX>; defm FLE_D : FPCmp_rr_m<0b1010001, 0b000, "fle.d", DINX>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index a2cd4a0..b1077ae 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -187,28 +187,32 @@ multiclass FPFMADynFrmAlias_m<FPFMA_rrr_frm Inst, string OpcodeStr, let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in class FPALU_rr<bits<7> funct7, bits<3> funct3, string opcodestr, - DAGOperand rty> + DAGOperand rty, bit Commutable> : RVInstR<funct7, funct3, OPC_OP_FP, (outs rty:$rd), - (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2">; + (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let isCommutable = 
Commutable; +} multiclass FPALU_rr_m<bits<7> funct7, bits<3> funct3, string opcodestr, - list<ExtInfo_r> Exts> { + list<ExtInfo_r> Exts, bit Commutable = 0> { foreach Ext = Exts in let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in - def Ext.Suffix : FPALU_rr<funct7, funct3, opcodestr, Ext.Reg>; + def Ext.Suffix : FPALU_rr<funct7, funct3, opcodestr, Ext.Reg, Commutable>; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, UseNamedOperandTable = 1, hasPostISelHook = 1 in -class FPALU_rr_frm<bits<7> funct7, string opcodestr, DAGOperand rty> +class FPALU_rr_frm<bits<7> funct7, string opcodestr, DAGOperand rty, + bit Commutable> : RVInstRFrm<funct7, OPC_OP_FP, (outs rty:$rd), (ins rty:$rs1, rty:$rs2, frmarg:$frm), opcodestr, - "$rd, $rs1, $rs2, $frm">; - + "$rd, $rs1, $rs2, $frm"> { + let isCommutable = Commutable; +} multiclass FPALU_rr_frm_m<bits<7> funct7, string opcodestr, - list<ExtInfo_r> Exts> { + list<ExtInfo_r> Exts, bit Commutable = 0> { foreach Ext = Exts in let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in - def Ext.Suffix : FPALU_rr_frm<funct7, opcodestr, Ext.Reg>; + def Ext.Suffix : FPALU_rr_frm<funct7, opcodestr, Ext.Reg, Commutable>; } class FPALUDynFrmAlias<FPALU_rr_frm Inst, string OpcodeStr, @@ -269,14 +273,16 @@ multiclass FPUnaryOpDynFrmAlias_m<FPUnaryOp_r_frm Inst, string OpcodeStr, let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in class FPCmp_rr<bits<7> funct7, bits<3> funct3, string opcodestr, - DAGOperand rty> + DAGOperand rty, bit Commutable> : RVInstR<funct7, funct3, OPC_OP_FP, (outs GPR:$rd), - (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2">; + (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let isCommutable = Commutable; +} multiclass FPCmp_rr_m<bits<7> funct7, bits<3> funct3, string opcodestr, - list<ExtInfo_r> Exts> { + list<ExtInfo_r> Exts, bit Commutable = 0> { foreach Ext = Exts in let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in - def Ext.Suffix : FPCmp_rr<funct7, funct3, opcodestr, Ext.Reg>; + def Ext.Suffix : FPCmp_rr<funct7, funct3, opcodestr, Ext.Reg, Commutable>; } //===----------------------------------------------------------------------===// @@ -305,11 +311,11 @@ defm : FPFMADynFrmAlias_m<FNMSUB_S, "fnmsub.s", FINX>; defm : FPFMADynFrmAlias_m<FNMADD_S, "fnmadd.s", FINX>; let SchedRW = [WriteFALU32, ReadFALU32, ReadFALU32] in { -defm FADD_S : FPALU_rr_frm_m<0b0000000, "fadd.s", FINX>; +defm FADD_S : FPALU_rr_frm_m<0b0000000, "fadd.s", FINX, /*Commutable*/1>; defm FSUB_S : FPALU_rr_frm_m<0b0000100, "fsub.s", FINX>; } let SchedRW = [WriteFMul32, ReadFMul32, ReadFMul32] in -defm FMUL_S : FPALU_rr_frm_m<0b0001000, "fmul.s", FINX>; +defm FMUL_S : FPALU_rr_frm_m<0b0001000, "fmul.s", FINX, /*Commutable*/1>; let SchedRW = [WriteFDiv32, ReadFDiv32, ReadFDiv32] in defm FDIV_S : FPALU_rr_frm_m<0b0001100, "fdiv.s", FINX>; @@ -331,8 +337,8 @@ defm FSGNJX_S : FPALU_rr_m<0b0010000, 0b010, "fsgnjx.s", FINX>; } let SchedRW = [WriteFMinMax32, ReadFMinMax32, ReadFMinMax32] in { -defm FMIN_S : FPALU_rr_m<0b0010100, 0b000, "fmin.s", FINX>; -defm FMAX_S : FPALU_rr_m<0b0010100, 0b001, "fmax.s", FINX>; +defm FMIN_S : FPALU_rr_m<0b0010100, 0b000, "fmin.s", FINX, /*Commutable*/1>; +defm FMAX_S : FPALU_rr_m<0b0010100, 0b001, "fmax.s", FINX, /*Commutable*/1>; } defm FCVT_W_S : FPUnaryOp_r_frm_m<0b1100000, 0b00000, XFINX, "fcvt.w.s">, @@ -348,7 +354,7 @@ def FMV_X_W : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR32, "fmv.x.w">, Sched<[WriteFMovF32ToI32, 
ReadFMovF32ToI32]>; let SchedRW = [WriteFCmp32, ReadFCmp32, ReadFCmp32] in { -defm FEQ_S : FPCmp_rr_m<0b1010000, 0b010, "feq.s", FINX>; +defm FEQ_S : FPCmp_rr_m<0b1010000, 0b010, "feq.s", FINX, /*Commutable*/1>; defm FLT_S : FPCmp_rr_m<0b1010000, 0b001, "flt.s", FINX>; defm FLE_S : FPCmp_rr_m<0b1010000, 0b000, "fle.s", FINX>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index edaf158..835a0f5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -109,11 +109,11 @@ defm : FPFMADynFrmAlias_m<FNMSUB_H, "fnmsub.h", HINX>; defm : FPFMADynFrmAlias_m<FNMADD_H, "fnmadd.h", HINX>; let SchedRW = [WriteFALU16, ReadFALU16, ReadFALU16] in { -defm FADD_H : FPALU_rr_frm_m<0b0000010, "fadd.h", HINX>; +defm FADD_H : FPALU_rr_frm_m<0b0000010, "fadd.h", HINX, /*Commutable*/1>; defm FSUB_H : FPALU_rr_frm_m<0b0000110, "fsub.h", HINX>; } let SchedRW = [WriteFMul16, ReadFMul16, ReadFMul16] in -defm FMUL_H : FPALU_rr_frm_m<0b0001010, "fmul.h", HINX>; +defm FMUL_H : FPALU_rr_frm_m<0b0001010, "fmul.h", HINX, /*Commutable*/1>; let SchedRW = [WriteFDiv16, ReadFDiv16, ReadFDiv16] in defm FDIV_H : FPALU_rr_frm_m<0b0001110, "fdiv.h", HINX>; @@ -135,8 +135,8 @@ defm FSGNJX_H : FPALU_rr_m<0b0010010, 0b010, "fsgnjx.h", HINX>; } let SchedRW = [WriteFMinMax16, ReadFMinMax16, ReadFMinMax16] in { -defm FMIN_H : FPALU_rr_m<0b0010110, 0b000, "fmin.h", HINX>; -defm FMAX_H : FPALU_rr_m<0b0010110, 0b001, "fmax.h", HINX>; +defm FMIN_H : FPALU_rr_m<0b0010110, 0b000, "fmin.h", HINX, /*Commutable*/1>; +defm FMAX_H : FPALU_rr_m<0b0010110, 0b001, "fmax.h", HINX, /*Commutable*/1>; } defm FCVT_W_H : FPUnaryOp_r_frm_m<0b1100010, 0b00000, XHINX, "fcvt.w.h">, @@ -173,7 +173,7 @@ def FMV_H_X : FPUnaryOp_r<0b1111010, 0b00000, 0b000, FPR16, GPR, "fmv.h.x">, } // Predicates = [HasStdExtZfhOrZfhmin] let SchedRW = [WriteFCmp16, ReadFCmp16, ReadFCmp16] in { -defm FEQ_H : FPCmp_rr_m<0b1010010, 0b010, "feq.h", HINX>; +defm FEQ_H : FPCmp_rr_m<0b1010010, 0b010, "feq.h", HINX, /*Commutable*/1>; defm FLT_H : FPCmp_rr_m<0b1010010, 0b001, "flt.h", HINX>; defm FLE_H : FPCmp_rr_m<0b1010010, 0b000, "fle.h", HINX>; } diff --git a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp index c167c09..4b34bba 100644 --- a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp +++ b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp @@ -87,7 +87,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, return MCOperand::createExpr(ME); } -bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, +bool llvm::lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &MCOp, const AsmPrinter &AP) { switch (MO.getType()) { @@ -214,7 +214,7 @@ bool llvm::lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; - if (LowerRISCVMachineOperandToMCOperand(MO, MCOp, AP)) + if (lowerRISCVMachineOperandToMCOperand(MO, MCOp, AP)) OutMI.addOperand(MCOp); } diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 4f37acc..60e1b05 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -1590,9 +1590,11 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, if (getParser().parseExpression(Expr)) return MatchOperand_NoMatch; - auto isOutOfRangeConstant = [&](const 
MCExpr *E) -> bool { + auto isOutOfRangeConstant = [&](const MCExpr *E, bool Negate) -> bool { if (auto *CE = dyn_cast<MCConstantExpr>(E)) { int64_t Value = CE->getValue(); + if (Negate) + Value = -Value; if ((Value & 1) || Value < MinVal || Value > MaxVal) return true; } @@ -1606,7 +1608,7 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, Error(StartLoc, "Expected PC-relative expression"); return MatchOperand_ParseFail; } - if (isOutOfRangeConstant(CE)) { + if (isOutOfRangeConstant(CE, false)) { Error(StartLoc, "offset out of range"); return MatchOperand_ParseFail; } @@ -1621,8 +1623,9 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, // For consistency with the GNU assembler, conservatively assume that a // constant offset must by itself be within the given size range. if (const auto *BE = dyn_cast<MCBinaryExpr>(Expr)) - if (isOutOfRangeConstant(BE->getLHS()) || - isOutOfRangeConstant(BE->getRHS())) { + if (isOutOfRangeConstant(BE->getLHS(), false) || + isOutOfRangeConstant(BE->getRHS(), + BE->getOpcode() == MCBinaryExpr::Sub)) { Error(StartLoc, "offset out of range"); return MatchOperand_ParseFail; } diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index d825981..368b05e 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -48,14 +48,18 @@ let Predicates = [HasAMXTILE, In64BitMode] in { VEX, T8XD; // Pseduo instruction for RA. + let mayLoad = 1 in def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src), [(int_x86_ldtilecfg_internal addr:$src)]>; + let mayLoad = 1 in def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; + let mayLoad = 1 in def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; + let mayStore = 1 in def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, TILE:$src4), []>; @@ -67,9 +71,12 @@ let Predicates = [HasAMXTILE, In64BitMode] in { let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. 
// To be translated to the actual instructions in X86ISelLowering.cpp + let mayLoad = 1 in def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + let mayLoad = 1 in def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + let mayStore = 1 in def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>; def PTILEZERO : PseudoI<(outs), (ins u8imm:$src), [(int_x86_tilezero timm:$src)]>; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 0ad3e6c..81f258d 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -964,7 +964,7 @@ static void combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) { static bool combineLdSt(SmallVectorImpl<Instruction *> &Casts) { bool Change = false; for (auto *Cast : Casts) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(Cast); + auto *II = cast<IntrinsicInst>(Cast); // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector(x86_amx %42) // store <256 x i32> %43, <256 x i32>* %p, align 64 // --> @@ -984,7 +984,7 @@ static bool combineLdSt(SmallVectorImpl<Instruction *> &Casts) { Store->eraseFromParent(); } else { // x86_cast_vector_to_tile SmallVector<Instruction *, 2> DeadLoads; - LoadInst *Load = dyn_cast<LoadInst>(Cast->getOperand(0)); + auto *Load = dyn_cast<LoadInst>(Cast->getOperand(0)); if (!Load || !Load->hasOneUse()) continue; // %65 = load <256 x i32>, <256 x i32>* %p, align 64 diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 67ed1d8..05364e3 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -935,8 +935,7 @@ def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)", - "MOVZX(16|32|64)rm(8|16)", - "(V?)MOVDDUPrm")>; // TODO: Should this be SKLWriteResGroup67? + "MOVZX(16|32|64)rm(8|16)")>; def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> { let Latency = 5; @@ -993,7 +992,8 @@ def: InstRW<[SKLWriteResGroup67], (instrs VBROADCASTSSrm, VPBROADCASTDrm, VPBROADCASTQrm)>; def: InstRW<[SKLWriteResGroup67], (instregex "(V?)MOVSHDUPrm", - "(V?)MOVSLDUPrm")>; + "(V?)MOVSLDUPrm", + "(V?)MOVDDUPrm")>; def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> { let Latency = 6; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 0189acd..b682b51 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -1055,8 +1055,7 @@ def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> { let ResourceCycles = [1]; } def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)", - "MOVZX(16|32|64)rm(8|16)", - "(V?)MOVDDUPrm")>; // TODO: Should this be SKXWriteResGroup71? 
+ "MOVZX(16|32|64)rm(8|16)")>; def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> { let Latency = 5; @@ -1159,11 +1158,10 @@ def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> { } def: InstRW<[SKXWriteResGroup71], (instrs VBROADCASTSSrm, VPBROADCASTDrm, - VPBROADCASTQrm, - VMOVSHDUPrm, - VMOVSLDUPrm, - MOVSHDUPrm, - MOVSLDUPrm)>; + VPBROADCASTQrm)>; +def: InstRW<[SKXWriteResGroup71], (instregex "(V?)MOVSHDUPrm", + "(V?)MOVSLDUPrm", + "(V?)MOVDDUPrm")>; def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> { let Latency = 6; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td index 2103169..5051d4c 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -485,12 +485,6 @@ defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>; def Zn2WriteMicrocoded : SchedWriteRes<[]> { let Latency = 100; } -defm : Zn2WriteResPair<WriteDPPS, [], 15>; -defm : Zn2WriteResPair<WriteFHAdd, [], 7>; -defm : Zn2WriteResPair<WriteFHAddY, [], 7>; -defm : Zn2WriteResPair<WritePHAdd, [], 3>; -defm : Zn2WriteResPair<WritePHAddX, [], 3>; -defm : Zn2WriteResPair<WritePHAddY, [], 3>; def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>; def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>; @@ -1108,6 +1102,14 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>; //-- Arithmetic instructions --// +// HADD, HSUB PS/PD +// PHADD|PHSUB (S) W/D. +defm : Zn2WriteResPair<WriteFHAdd, [], 7>; +defm : Zn2WriteResPair<WriteFHAddY, [], 7>; +defm : Zn2WriteResPair<WritePHAdd, [], 3>; +defm : Zn2WriteResPair<WritePHAddX, [], 3>; +defm : Zn2WriteResPair<WritePHAddY, [], 3>; + // PCMPGTQ. def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>; def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; @@ -1478,6 +1480,7 @@ def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>; // DPPS. // x,x,i / v,v,v,i. +defm : Zn2WriteResPair<WriteDPPS, [], 15>; def : SchedAlias<WriteDPPSY, Zn2WriteMicrocoded>; // x,m,i / v,v,m,i. diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 7f4ca3a..a0bde8dc 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3833,10 +3833,21 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, } } - // TODO: Use default extraction for now, but we should investigate extending this - // to handle repeated subvector extraction. - if (Extract) + if (Extract) { + // vXi1 can be efficiently extracted with MOVMSK. + // TODO: AVX512 predicate mask handling. + // NOTE: This doesn't work well for roundtrip scalarization. + if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { + unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements(); + unsigned MaxElts = ST->hasAVX2() ? 32 : 16; + unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; + return MOVMSKCost; + } + + // TODO: Use default extraction for now, but we should investigate extending + // this to handle repeated subvector extraction. 
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract); + } return Cost; } diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 65bc392..a33cb0b 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -218,10 +218,17 @@ static Function *doPromotion( LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" << "From: " << *F); + uint64_t LargestVectorWidth = 0; + for (auto *I : Params) + if (auto *VT = dyn_cast<llvm::VectorType>(I)) + LargestVectorWidth = std::max( + LargestVectorWidth, VT->getPrimitiveSizeInBits().getKnownMinSize()); + // Recompute the parameter attributes list based on the new arguments for // the function. NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrVec)); + AttributeFuncs::updateMinLegalVectorWidthAttr(*NF, LargestVectorWidth); ArgAttrVec.clear(); F->getParent()->getFunctionList().insert(F->getIterator(), NF); @@ -313,6 +320,9 @@ static Function *doPromotion( Args.clear(); ArgAttrVec.clear(); + AttributeFuncs::updateMinLegalVectorWidthAttr(*CB.getCaller(), + LargestVectorWidth); + // Update the callgraph to know that the callsite has been transformed. if (ReplaceCallSite) (*ReplaceCallSite)(CB, *NewCS); diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 247da10..40dd6ee 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -2486,6 +2486,12 @@ ChangeStatus Attributor::rewriteFunctionSignatures( } } + uint64_t LargestVectorWidth = 0; + for (auto *I : NewArgumentTypes) + if (auto *VT = dyn_cast<llvm::VectorType>(I)) + LargestVectorWidth = std::max( + LargestVectorWidth, VT->getPrimitiveSizeInBits().getKnownMinSize()); + FunctionType *OldFnTy = OldFn->getFunctionType(); Type *RetTy = OldFnTy->getReturnType(); @@ -2515,6 +2521,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures( NewFn->setAttributes(AttributeList::get( Ctx, OldFnAttributeList.getFnAttrs(), OldFnAttributeList.getRetAttrs(), NewArgumentAttributes)); + AttributeFuncs::updateMinLegalVectorWidthAttr(*NewFn, LargestVectorWidth); // Since we have now created the new function, splice the body of the old // function right into the new function, leaving the old rotting hulk of the @@ -2592,6 +2599,9 @@ ChangeStatus Attributor::rewriteFunctionSignatures( Ctx, OldCallAttributeList.getFnAttrs(), OldCallAttributeList.getRetAttrs(), NewArgOperandAttributes)); + AttributeFuncs::updateMinLegalVectorWidthAttr(*NewCB->getCaller(), + LargestVectorWidth); + CallSitePairs.push_back({OldCB, NewCB}); return true; }; diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 72b94cd..157cb27e 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" @@ -904,7 +905,7 @@ OptimizeGlobalAddressOfAllocation(GlobalVariable *GV, CallInst *CI, } } - SmallPtrSet<Constant *, 1> RepValues; + SmallSetVector<Constant *, 1> RepValues; RepValues.insert(NewGV); // If there is a comparison against null, we will insert a global bool to diff --git a/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp 
b/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp index 6ec3c61..76f8f1a 100644 --- a/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -29,7 +29,7 @@ static bool inferAllPrototypeAttributes( // explicitly visited by CGSCC passes in the new pass manager.) if (F.isDeclaration() && !F.hasOptNone()) { if (!F.hasFnAttribute(Attribute::NoBuiltin)) - Changed |= inferLibFuncAttributes(F, GetTLI(F)); + Changed |= inferNonMandatoryLibFuncAttrs(F, GetTLI(F)); Changed |= inferAttributesFromOthers(F); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 4a62afc..39a32e5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -759,7 +759,7 @@ getAsConstantIndexedAddress(Type *ElemTy, Value *V, const DataLayout &DL) { V = GEP->getOperand(0); Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1)); Index = ConstantExpr::getAdd( - Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType)); + Index, ConstantExpr::getSExtOrTrunc(GEPIndex, IndexType)); continue; } break; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index ab35698..b044b8a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1298,6 +1298,8 @@ static Instruction *foldFDivPowDivisor(BinaryOperator &I, } Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { + Module *M = I.getModule(); + if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) @@ -1363,8 +1365,8 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) && match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X))); - if ((IsTan || IsCot) && - hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) { + if ((IsTan || IsCot) && hasFloatFn(M, &TLI, I.getType(), LibFunc_tan, + LibFunc_tanf, LibFunc_tanl)) { IRBuilder<> B(&I); IRBuilder<>::FastMathFlagGuard FMFGuard(B); B.setFastMathFlags(I.getFastMathFlags()); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index 0e65a44..780a446 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -408,6 +409,25 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { if (Access.Addr->isSwiftError()) return None; + // Peel off GEPs and BitCasts. + auto *Addr = Access.Addr->stripInBoundsOffsets(); + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { + // Do not instrument PGO counter updates. + if (GV->hasSection()) { + StringRef SectionName = GV->getSection(); + // Check if the global is in the PGO counters section. + auto OF = Triple(I->getModule()->getTargetTriple()).getObjectFormat(); + if (SectionName.endswith( + getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false))) + return None; + } + + // Do not instrument accesses to LLVM internal variables. 
+ if (GV->getName().startswith("__llvm")) + return None; + } + const DataLayout &DL = I->getModule()->getDataLayout(); Access.TypeSize = DL.getTypeStoreSizeInBits(Access.AccessTy); return Access; @@ -613,8 +633,6 @@ bool MemProfiler::instrumentFunction(Function &F) { initializeCallbacks(*F.getParent()); - FunctionModified |= insertDynamicShadowAtFunctionEntry(F); - SmallVector<Instruction *, 16> ToInstrument; // Fill the set of memory operations to instrument. @@ -625,6 +643,15 @@ bool MemProfiler::instrumentFunction(Function &F) { } } + if (ToInstrument.empty()) { + LLVM_DEBUG(dbgs() << "MEMPROF done instrumenting: " << FunctionModified + << " " << F << "\n"); + + return FunctionModified; + } + + FunctionModified |= insertDynamicShadowAtFunctionEntry(F); + int NumInstrumented = 0; for (auto *Inst : ToInstrument) { if (ClDebugMin < 0 || ClDebugMax < 0 || diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index 5338032..2610ef1 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -774,12 +774,9 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { unsigned NumOrigPreds = Preds.size(); // We can only sink instructions through unconditional branches. - for (auto I = Preds.begin(); I != Preds.end();) { - if ((*I)->getTerminator()->getNumSuccessors() != 1) - I = Preds.erase(I); - else - ++I; - } + llvm::erase_if(Preds, [](BasicBlock *BB) { + return BB->getTerminator()->getNumSuccessors() != 1; + }); LockstepReverseIterator LRI(Preds); SmallVector<SinkingInstructionCandidate, 4> Candidates; diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 11c756c..87202f7 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1100,6 +1100,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) { + Module *M = TheStore->getModule(); Value *SplatValue = isBytewiseValue(StoredVal, *DL); Constant *PatternValue = nullptr; @@ -1182,15 +1183,14 @@ bool LoopIdiomRecognize::processLoopStridedStore( NewCall = Builder.CreateMemSet( BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment), /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); - } else { + } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; - Module *M = TheStore->getModule(); StringRef FuncName = "memset_pattern16"; - FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(), - Int8PtrTy, Int8PtrTy, IntIdxTy); - inferLibFuncAttributes(M, FuncName, *TLI); + FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16, + Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy); + inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. 
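The processLoopStridedStore() change above only forms the memset_pattern16 call when the new isLibFuncEmittable() check passes, and routes the declaration through getOrInsertLibFunc() plus inferNonMandatoryLibFuncAttrs(). A minimal caller-side sketch of that pattern follows; it is illustrative only, not code from the patch, and the helper name and locals are assumptions:

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/BuildLibCalls.h"
    using namespace llvm;

    // Only emit memset_pattern16 if the target has it and the module does not
    // already define a clashing symbol with an incompatible prototype.
    static FunctionCallee tryGetMemSetPattern16(Module *M, IRBuilderBase &B,
                                                const TargetLibraryInfo &TLI) {
      if (!isLibFuncEmittable(M, &TLI, LibFunc_memset_pattern16))
        return nullptr;
      const DataLayout &DL = M->getDataLayout();
      Type *Int8PtrTy = B.getInt8PtrTy();
      Type *IntPtrTy = DL.getIntPtrType(M->getContext());
      // getOrInsertLibFunc also attaches any mandatory ABI attributes (e.g.
      // signext/zeroext on i32 arguments) that a plain getOrInsertFunction
      // call would not add.
      FunctionCallee MSP = getOrInsertLibFunc(M, TLI, LibFunc_memset_pattern16,
                                              B.getVoidTy(), Int8PtrTy,
                                              Int8PtrTy, IntPtrTy);
      inferNonMandatoryLibFuncAttrs(M, TLI.getName(LibFunc_memset_pattern16),
                                    TLI);
      return MSP;
    }

Centralizing the mandatory argument-extension attributes in getOrInsertLibFunc() is what lets the per-call-site setArgExtAttr() handling be dropped from the attribute inference code later in this patch.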
@@ -1201,7 +1201,9 @@ bool LoopIdiomRecognize::processLoopStridedStore( GV->setAlignment(Align(16)); Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); - } + } else + return Changed; + NewCall->setDebugLoc(TheStore->getDebugLoc()); if (MSSAU) { diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index cff8f51..344f89e 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -575,9 +575,11 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { if (OpI->getType()->isVectorTy()) { Scattered[I] = scatter(&CI, OpI); assert(Scattered[I].size() == NumElems && "mismatched call operands"); + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) + Tys.push_back(OpI->getType()->getScalarType()); } else { ScalarOperands[I] = OpI; - if (hasVectorIntrinsicOverloadedScalarOpd(ID, I)) + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) Tys.push_back(OpI->getType()); } } @@ -593,7 +595,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { ScalarCallOps.clear(); for (unsigned J = 0; J != NumArgs; ++J) { - if (hasVectorIntrinsicScalarOpd(ID, J)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) ScalarCallOps.push_back(ScalarOperands[J]); else ScalarCallOps.push_back(Scattered[J][Elem]); diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index b3f1229..14c1fed 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -204,13 +204,19 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, /// branch on a single value. static void buildPartialUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, - BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze) { + BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, + Instruction *I, AssumptionCache *AC, DominatorTree &DT) { IRBuilder<> IRB(&BB); - Value *Cond = Direction ? IRB.CreateOr(Invariants) : - IRB.CreateAnd(Invariants); - if (InsertFreeze) - Cond = IRB.CreateFreeze(Cond, Cond->getName() + ".fr"); + SmallVector<Value *> FrozenInvariants; + for (Value *Inv : Invariants) { + if (InsertFreeze && !isGuaranteedNotToBeUndefOrPoison(Inv, AC, I, &DT)) + Inv = IRB.CreateFreeze(Inv, Inv->getName() + ".fr"); + FrozenInvariants.push_back(Inv); + } + + Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants) + : IRB.CreateAnd(FrozenInvariants); IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, Direction ? &NormalSucc : &UnswitchedSucc); } @@ -572,10 +578,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, " condition!"); buildPartialUnswitchConditionalBranch( *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH, - FreezeLoopUnswitchCond && any_of(Invariants, [&](Value *C) { - return !isGuaranteedNotToBeUndefOrPoison(C, nullptr, - OldPH->getTerminator(), &DT); - })); + FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT); } // Update the dominator tree with the added edge. 
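The buildPartialUnswitchConditionalBranch() change above moves the freeze from the combined condition onto each individual invariant, and only freezes values that are not already guaranteed to be neither undef nor poison. In isolation the idiom looks like the following sketch; the function name and parameters are assumed for illustration, not taken from the patch:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Freeze only the operands that may be undef or poison, then combine them.
    // Freezing per operand keeps each invariant independently well-defined,
    // instead of freezing the whole or/and once at the end.
    static Value *buildFrozenDisjunction(IRBuilderBase &IRB,
                                         ArrayRef<Value *> Invariants,
                                         AssumptionCache *AC, Instruction *CtxI,
                                         DominatorTree &DT) {
      SmallVector<Value *, 4> Frozen;
      for (Value *Inv : Invariants) {
        if (!isGuaranteedNotToBeUndefOrPoison(Inv, AC, CtxI, &DT))
          Inv = IRB.CreateFreeze(Inv, Inv->getName() + ".fr");
        Frozen.push_back(Inv);
      }
      return IRB.CreateOr(Frozen);
    }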
@@ -2318,11 +2321,9 @@ static void unswitchNontrivialInvariants( buildPartialInvariantUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); else { - buildPartialUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, - InsertFreeze && any_of(Invariants, [&](Value *C) { - return !isGuaranteedNotToBeUndefOrPoison(C, &AC, BI, &DT); - })); + buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, + *ClonedPH, *LoopPH, InsertFreeze, + BI, &AC, DT); } DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 1f4f1c9..40fd407 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -39,7 +39,6 @@ STATISTIC(NumInaccessibleMemOrArgMemOnly, STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind"); STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture"); STATISTIC(NumWriteOnlyArg, "Number of arguments inferred as writeonly"); -STATISTIC(NumExtArg, "Number of arguments inferred as signext/zeroext."); STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly"); STATISTIC(NumNoAlias, "Number of function returns inferred as noalias"); STATISTIC(NumNoUndef, "Number of function returns inferred as noundef returns"); @@ -147,16 +146,6 @@ static bool setOnlyWritesMemory(Function &F, unsigned ArgNo) { return true; } -static bool setArgExtAttr(Function &F, unsigned ArgNo, - const TargetLibraryInfo &TLI, bool Signed = true) { - Attribute::AttrKind ExtAttr = TLI.getExtAttrForI32Param(Signed); - if (ExtAttr == Attribute::None || F.hasParamAttribute(ArgNo, ExtAttr)) - return false; - F.addParamAttr(ArgNo, ExtAttr); - ++NumExtArg; - return true; -} - static bool setRetNoUndef(Function &F) { if (!F.getReturnType()->isVoidTy() && !F.hasRetAttribute(Attribute::NoUndef)) { @@ -231,6 +220,13 @@ static bool setAlignedAllocParam(Function &F, unsigned ArgNo) { return true; } +static bool setAllocatedPointerParam(Function &F, unsigned ArgNo) { + if (F.hasParamAttribute(ArgNo, Attribute::AllocatedPointer)) + return false; + F.addParamAttr(ArgNo, Attribute::AllocatedPointer); + return true; +} + static bool setAllocSize(Function &F, unsigned ElemSizeArg, Optional<unsigned> NumElemsArg) { if (F.hasFnAttribute(Attribute::AllocSize)) @@ -240,15 +236,23 @@ static bool setAllocSize(Function &F, unsigned ElemSizeArg, return true; } -bool llvm::inferLibFuncAttributes(Module *M, StringRef Name, - const TargetLibraryInfo &TLI) { +static bool setAllocFamily(Function &F, StringRef Family) { + if (F.hasFnAttribute("alloc-family")) + return false; + F.addFnAttr("alloc-family", Family); + return true; +} + +bool llvm::inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, + const TargetLibraryInfo &TLI) { Function *F = M->getFunction(Name); if (!F) return false; - return inferLibFuncAttributes(*F, TLI); + return inferNonMandatoryLibFuncAttrs(*F, TLI); } -bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { +bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, + const TargetLibraryInfo &TLI) { LibFunc TheLibFunc; if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc))) return false; @@ -376,6 +380,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setArgNoUndef(F, 1); LLVM_FALLTHROUGH; case LibFunc_strdup: + Changed |= setAllocFamily(F, "malloc"); Changed |= 
setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setDoesNotThrow(F); Changed |= setRetDoesNotAlias(F); @@ -437,7 +442,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { LLVM_FALLTHROUGH; case LibFunc_valloc: case LibFunc_malloc: + Changed |= setAllocFamily(F, "malloc"); + LLVM_FALLTHROUGH; case LibFunc_vec_malloc: + Changed |= setAllocFamily(F, "vec_malloc"); Changed |= setAllocSize(F, 0, None); Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetAndArgsNoUndef(F); @@ -501,6 +509,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_memalign: + Changed |= setAllocFamily(F, "malloc"); Changed |= setAllocSize(F, 1, None); Changed |= setAlignedAllocParam(F, 0); Changed |= setOnlyAccessesInaccessibleMemory(F); @@ -522,8 +531,12 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_realloc: - case LibFunc_vec_realloc: case LibFunc_reallocf: + Changed |= setAllocFamily(F, "malloc"); + LLVM_FALLTHROUGH; + case LibFunc_vec_realloc: + Changed |= setAllocFamily(F, "vec_malloc"); + Changed |= setAllocatedPointerParam(F, 0); Changed |= setAllocSize(F, 1, None); Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setRetNoUndef(F); @@ -597,7 +610,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyWritesMemory(F, 0); return Changed; case LibFunc_calloc: + Changed |= setAllocFamily(F, "malloc"); + LLVM_FALLTHROUGH; case LibFunc_vec_calloc: + Changed |= setAllocFamily(F, "vec_malloc"); Changed |= setAllocSize(F, 0, 1); Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetAndArgsNoUndef(F); @@ -656,7 +672,11 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_free: + Changed |= setAllocFamily(F, "malloc"); + LLVM_FALLTHROUGH; case LibFunc_vec_free: + Changed |= setAllocFamily(F, "vec_malloc"); + Changed |= setAllocatedPointerParam(F, 0); Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setArgsNoUndef(F); Changed |= setDoesNotThrow(F); @@ -845,7 +865,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_putchar: case LibFunc_putchar_unlocked: Changed |= setRetAndArgsNoUndef(F); - Changed |= setArgExtAttr(F, 0, TLI); Changed |= setDoesNotThrow(F); return Changed; case LibFunc_popen: @@ -1066,7 +1085,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_ldexp: case LibFunc_ldexpf: case LibFunc_ldexpl: - Changed |= setArgExtAttr(F, 1, TLI); Changed |= setWillReturn(F); return Changed; case LibFunc_abs: @@ -1203,34 +1221,141 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { } } -bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty, +static void setArgExtAttr(Function &F, unsigned ArgNo, + const TargetLibraryInfo &TLI, bool Signed = true) { + Attribute::AttrKind ExtAttr = TLI.getExtAttrForI32Param(Signed); + if (ExtAttr != Attribute::None && !F.hasParamAttribute(ArgNo, ExtAttr)) + F.addParamAttr(ArgNo, ExtAttr); +} + +FunctionCallee llvm::getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, + LibFunc TheLibFunc, FunctionType *T, + AttributeList AttributeList) { + assert(TLI.has(TheLibFunc) && + "Creating call to non-existing library function."); + StringRef 
Name = TLI.getName(TheLibFunc); + FunctionCallee C = M->getOrInsertFunction(Name, T, AttributeList); + + // Make sure any mandatory argument attributes are added. + + // Any outgoing i32 argument should be handled with setArgExtAttr() which + // will add an extension attribute if the target ABI requires it. Adding + // argument extensions is typically done by the front end but when an + // optimizer is building a library call on its own it has to take care of + // this. Each such generated function must be handled here with sign or + // zero extensions as needed. F is retreived with cast<> because we demand + // of the caller to have called isLibFuncEmittable() first. + Function *F = cast<Function>(C.getCallee()); + assert(F->getFunctionType() == T && "Function type does not match."); + switch (TheLibFunc) { + case LibFunc_fputc: + case LibFunc_putchar: + setArgExtAttr(*F, 0, TLI); + break; + case LibFunc_ldexp: + case LibFunc_ldexpf: + case LibFunc_ldexpl: + case LibFunc_memchr: + case LibFunc_strchr: + setArgExtAttr(*F, 1, TLI); + break; + case LibFunc_memccpy: + setArgExtAttr(*F, 2, TLI); + break; + + // These are functions that are known to not need any argument extension + // on any target: A size_t argument (which may be an i32 on some targets) + // should not trigger the assert below. + case LibFunc_bcmp: + case LibFunc_calloc: + case LibFunc_fwrite: + case LibFunc_malloc: + case LibFunc_memcmp: + case LibFunc_memcpy_chk: + case LibFunc_mempcpy: + case LibFunc_memset_pattern16: + case LibFunc_snprintf: + case LibFunc_stpncpy: + case LibFunc_strlcat: + case LibFunc_strlcpy: + case LibFunc_strncat: + case LibFunc_strncmp: + case LibFunc_strncpy: + case LibFunc_vsnprintf: + break; + + default: +#ifndef NDEBUG + for (unsigned i = 0; i < T->getNumParams(); i++) + assert(!isa<IntegerType>(T->getParamType(i)) && + "Unhandled integer argument."); +#endif + break; + } + + return C; +} + +FunctionCallee llvm::getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, + LibFunc TheLibFunc, FunctionType *T) { + return getOrInsertLibFunc(M, TLI, TheLibFunc, T, AttributeList()); +} + +bool llvm::isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, + LibFunc TheLibFunc) { + StringRef FuncName = TLI->getName(TheLibFunc); + if (!TLI->has(TheLibFunc)) + return false; + + // Check if the Module already has a GlobalValue with the same name, in + // which case it must be a Function with the expected type. 
+ if (GlobalValue *GV = M->getNamedValue(FuncName)) { + if (auto *F = dyn_cast<Function>(GV)) + return TLI->isValidProtoForLibFunc(*F->getFunctionType(), TheLibFunc, *M); + return false; + } + + return true; +} + +bool llvm::isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, + StringRef Name) { + LibFunc TheLibFunc; + return TLI->getLibFunc(Name, TheLibFunc) && + isLibFuncEmittable(M, TLI, TheLibFunc); +} + +bool llvm::hasFloatFn(const Module *M, const TargetLibraryInfo *TLI, Type *Ty, LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) { switch (Ty->getTypeID()) { case Type::HalfTyID: return false; case Type::FloatTyID: - return TLI->has(FloatFn); + return isLibFuncEmittable(M, TLI, FloatFn); case Type::DoubleTyID: - return TLI->has(DoubleFn); + return isLibFuncEmittable(M, TLI, DoubleFn); default: - return TLI->has(LongDoubleFn); + return isLibFuncEmittable(M, TLI, LongDoubleFn); } } -StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn) { - assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && +StringRef llvm::getFloatFn(const Module *M, const TargetLibraryInfo *TLI, + Type *Ty, LibFunc DoubleFn, LibFunc FloatFn, + LibFunc LongDoubleFn, LibFunc &TheLibFunc) { + assert(hasFloatFn(M, TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && "Cannot get name for unavailable function!"); switch (Ty->getTypeID()) { case Type::HalfTyID: llvm_unreachable("No name for HalfTy!"); case Type::FloatTyID: + TheLibFunc = FloatFn; return TLI->getName(FloatFn); case Type::DoubleTyID: + TheLibFunc = DoubleFn; return TLI->getName(DoubleFn); default: + TheLibFunc = LongDoubleFn; return TLI->getName(LongDoubleFn); } } @@ -1247,14 +1372,14 @@ static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType, ArrayRef<Value *> Operands, IRBuilderBase &B, const TargetLibraryInfo *TLI, bool IsVaArgs = false) { - if (!TLI->has(TheLibFunc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, TheLibFunc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef FuncName = TLI->getName(TheLibFunc); FunctionType *FuncType = FunctionType::get(ReturnType, ParamTypes, IsVaArgs); - FunctionCallee Callee = M->getOrInsertFunction(FuncName, FuncType); - inferLibFuncAttributes(M, FuncName, *TLI); + FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, FuncType); + inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); CallInst *CI = B.CreateCall(Callee, Operands, FuncName); if (const Function *F = dyn_cast<Function>(Callee.getCallee()->stripPointerCasts())) @@ -1323,16 +1448,16 @@ Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B, Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_memcpy_chk)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_memcpy_chk)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); AttributeList AS; AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - FunctionCallee MemCpy = M->getOrInsertFunction( - "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(), + FunctionCallee MemCpy = getOrInsertLibFunc(M, *TLI, LibFunc_memcpy_chk, + AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt8PtrTy(), 
DL.getIntPtrType(Context), DL.getIntPtrType(Context)); Dst = castToCStr(Dst, B); @@ -1466,14 +1591,15 @@ static void appendTypeSuffix(Value *Op, StringRef &Name, } } -static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name, - IRBuilderBase &B, - const AttributeList &Attrs) { +static Value *emitUnaryFloatFnCallHelper(Value *Op, LibFunc TheLibFunc, + StringRef Name, IRBuilderBase &B, + const AttributeList &Attrs, + const TargetLibraryInfo *TLI) { assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall"); Module *M = B.GetInsertBlock()->getModule(); - FunctionCallee Callee = - M->getOrInsertFunction(Name, Op->getType(), Op->getType()); + FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, Op->getType(), + Op->getType()); CallInst *CI = B.CreateCall(Callee, Op, Name); // The incoming attribute set may have come from a speculatable intrinsic, but @@ -1488,12 +1614,16 @@ static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name, return CI; } -Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B, +Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, + StringRef Name, IRBuilderBase &B, const AttributeList &Attrs) { SmallString<20> NameBuffer; appendTypeSuffix(Op, Name, NameBuffer); - return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs); + LibFunc TheLibFunc; + TLI->getLibFunc(Name, TheLibFunc); + + return emitUnaryFloatFnCallHelper(Op, TheLibFunc, Name, B, Attrs, TLI); } Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, @@ -1501,23 +1631,25 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, LibFunc LongDoubleFn, IRBuilderBase &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. - StringRef Name = getFloatFnName(TLI, Op->getType(), - DoubleFn, FloatFn, LongDoubleFn); + Module *M = B.GetInsertBlock()->getModule(); + LibFunc TheLibFunc; + StringRef Name = getFloatFn(M, TLI, Op->getType(), DoubleFn, FloatFn, + LongDoubleFn, TheLibFunc); - return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs); + return emitUnaryFloatFnCallHelper(Op, TheLibFunc, Name, B, Attrs, TLI); } static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, + LibFunc TheLibFunc, StringRef Name, IRBuilderBase &B, const AttributeList &Attrs, - const TargetLibraryInfo *TLI = nullptr) { + const TargetLibraryInfo *TLI) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); Module *M = B.GetInsertBlock()->getModule(); - FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(), - Op1->getType(), Op2->getType()); - if (TLI != nullptr) - inferLibFuncAttributes(M, Name, *TLI); + FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, Op1->getType(), + Op1->getType(), Op2->getType()); + inferNonMandatoryLibFuncAttrs(M, Name, *TLI); CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name); // The incoming attribute set may have come from a speculatable intrinsic, but @@ -1532,15 +1664,19 @@ static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, return CI; } -Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, - IRBuilderBase &B, +Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, + const TargetLibraryInfo *TLI, + StringRef Name, IRBuilderBase &B, const AttributeList &Attrs) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); SmallString<20> NameBuffer; appendTypeSuffix(Op1, Name, NameBuffer); - return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs); + LibFunc 
TheLibFunc; + TLI->getLibFunc(Name, TheLibFunc); + + return emitBinaryFloatFnCallHelper(Op1, Op2, TheLibFunc, Name, B, Attrs, TLI); } Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, @@ -1549,22 +1685,24 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, LibFunc LongDoubleFn, IRBuilderBase &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. - StringRef Name = getFloatFnName(TLI, Op1->getType(), - DoubleFn, FloatFn, LongDoubleFn); + Module *M = B.GetInsertBlock()->getModule(); + LibFunc TheLibFunc; + StringRef Name = getFloatFn(M, TLI, Op1->getType(), DoubleFn, FloatFn, + LongDoubleFn, TheLibFunc); - return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs, TLI); + return emitBinaryFloatFnCallHelper(Op1, Op2, TheLibFunc, Name, B, Attrs, TLI); } Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_putchar)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_putchar)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef PutCharName = TLI->getName(LibFunc_putchar); - FunctionCallee PutChar = - M->getOrInsertFunction(PutCharName, B.getInt32Ty(), B.getInt32Ty()); - inferLibFuncAttributes(M, PutCharName, *TLI); + FunctionCallee PutChar = getOrInsertLibFunc(M, *TLI, LibFunc_putchar, + B.getInt32Ty(), B.getInt32Ty()); + inferNonMandatoryLibFuncAttrs(M, PutCharName, *TLI); CallInst *CI = B.CreateCall(PutChar, B.CreateIntCast(Char, B.getInt32Ty(), @@ -1580,14 +1718,14 @@ Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B, Value *llvm::emitPutS(Value *Str, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_puts)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_puts)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef PutsName = TLI->getName(LibFunc_puts); - FunctionCallee PutS = - M->getOrInsertFunction(PutsName, B.getInt32Ty(), B.getInt8PtrTy()); - inferLibFuncAttributes(M, PutsName, *TLI); + FunctionCallee PutS = getOrInsertLibFunc(M, *TLI, LibFunc_puts, B.getInt32Ty(), + B.getInt8PtrTy()); + inferNonMandatoryLibFuncAttrs(M, PutsName, *TLI); CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), PutsName); if (const Function *F = dyn_cast<Function>(PutS.getCallee()->stripPointerCasts())) @@ -1597,15 +1735,15 @@ Value *llvm::emitPutS(Value *Str, IRBuilderBase &B, Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fputc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_fputc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef FPutcName = TLI->getName(LibFunc_fputc); - FunctionCallee F = M->getOrInsertFunction(FPutcName, B.getInt32Ty(), - B.getInt32Ty(), File->getType()); + FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fputc, B.getInt32Ty(), + B.getInt32Ty(), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FPutcName, *TLI); + inferNonMandatoryLibFuncAttrs(M, FPutcName, *TLI); Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true, "chari"); CallInst *CI = B.CreateCall(F, {Char, File}, FPutcName); @@ -1618,15 +1756,15 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B, Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fputs)) + Module *M = B.GetInsertBlock()->getModule(); + 
if (!isLibFuncEmittable(M, TLI, LibFunc_fputs)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef FPutsName = TLI->getName(LibFunc_fputs); - FunctionCallee F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(), - B.getInt8PtrTy(), File->getType()); + FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fputs, B.getInt32Ty(), + B.getInt8PtrTy(), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FPutsName, *TLI); + inferNonMandatoryLibFuncAttrs(M, FPutsName, *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsName); if (const Function *Fn = @@ -1637,18 +1775,18 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B, Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fwrite)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_fwrite)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); LLVMContext &Context = B.GetInsertBlock()->getContext(); StringRef FWriteName = TLI->getName(LibFunc_fwrite); - FunctionCallee F = M->getOrInsertFunction( - FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(), - DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType()); + FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fwrite, + DL.getIntPtrType(Context), B.getInt8PtrTy(), DL.getIntPtrType(Context), + DL.getIntPtrType(Context), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FWriteName, *TLI); + inferNonMandatoryLibFuncAttrs(M, FWriteName, *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, ConstantInt::get(DL.getIntPtrType(Context), 1), File}); @@ -1661,15 +1799,15 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B, Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_malloc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_malloc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef MallocName = TLI->getName(LibFunc_malloc); LLVMContext &Context = B.GetInsertBlock()->getContext(); - FunctionCallee Malloc = M->getOrInsertFunction(MallocName, B.getInt8PtrTy(), - DL.getIntPtrType(Context)); - inferLibFuncAttributes(M, MallocName, *TLI); + FunctionCallee Malloc = getOrInsertLibFunc(M, *TLI, LibFunc_malloc, + B.getInt8PtrTy(), DL.getIntPtrType(Context)); + inferNonMandatoryLibFuncAttrs(M, MallocName, *TLI); CallInst *CI = B.CreateCall(Malloc, Num, MallocName); if (const Function *F = @@ -1681,16 +1819,16 @@ Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL, Value *llvm::emitCalloc(Value *Num, Value *Size, IRBuilderBase &B, const TargetLibraryInfo &TLI) { - if (!TLI.has(LibFunc_calloc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, &TLI, LibFunc_calloc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef CallocName = TLI.getName(LibFunc_calloc); const DataLayout &DL = M->getDataLayout(); IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext())); - FunctionCallee Calloc = - M->getOrInsertFunction(CallocName, B.getInt8PtrTy(), PtrType, PtrType); - inferLibFuncAttributes(M, CallocName, TLI); + FunctionCallee Calloc = getOrInsertLibFunc(M, TLI, LibFunc_calloc, + B.getInt8PtrTy(), PtrType, PtrType); + inferNonMandatoryLibFuncAttrs(M, CallocName, 
TLI); CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName); if (const auto *F = diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 7a9a272..e72e3ce 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -500,6 +500,13 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, if (isMathLibCallNoop(Call, TLI)) return true; + // Non-volatile atomic loads from constants can be removed. + if (auto *LI = dyn_cast<LoadInst>(I)) + if (auto *GV = dyn_cast<GlobalVariable>( + LI->getPointerOperand()->stripPointerCasts())) + if (!LI->isVolatile() && GV->isConstant()) + return true; + return false; } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 38dca39..0710511 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1190,13 +1190,15 @@ Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI, } Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); if (Value *V = optimizeMemCmpBCmpCommon(CI, B)) return V; // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0 // bcmp can be more efficient than memcmp because it only has to know that // there is a difference, not how different one is to the other. - if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) { + if (isLibFuncEmittable(M, TLI, LibFunc_bcmp) && + isOnlyUsedInZeroEqualityComparison(CI)) { Value *LHS = CI->getArgOperand(0); Value *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); @@ -1360,7 +1362,8 @@ static Value *valueHasFloatPrecision(Value *Val) { /// Shrink double -> float functions. static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, - bool isBinary, bool isPrecise = false) { + bool isBinary, const TargetLibraryInfo *TLI, + bool isPrecise = false) { Function *CalleeFn = CI->getCalledFunction(); if (!CI->getType()->isDoubleTy() || !CalleeFn) return nullptr; @@ -1410,22 +1413,25 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); } else { AttributeList CalleeAttrs = CalleeFn->getAttributes(); - R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs) - : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs); + R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], TLI, CalleeName, B, + CalleeAttrs) + : emitUnaryFloatFnCall(V[0], TLI, CalleeName, B, CalleeAttrs); } return B.CreateFPExt(R, B.getDoubleTy()); } /// Shrink double -> float for unary functions. static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilderBase &B, + const TargetLibraryInfo *TLI, bool isPrecise = false) { - return optimizeDoubleFP(CI, B, false, isPrecise); + return optimizeDoubleFP(CI, B, false, TLI, isPrecise); } /// Shrink double -> float for binary functions. static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilderBase &B, + const TargetLibraryInfo *TLI, bool isPrecise = false) { - return optimizeDoubleFP(CI, B, true, isPrecise); + return optimizeDoubleFP(CI, B, true, TLI, isPrecise); } // cabs(z) -> sqrt((creal(z)*creal(z)) + (cimag(z)*cimag(z))) @@ -1541,6 +1547,7 @@ static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B, unsigned DstWidth) { /// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x); /// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x). 
Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { + Module *M = Pow->getModule(); Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1); AttributeList Attrs; // Attributes are only meaningful on the original call Module *Mod = Pow->getModule(); @@ -1568,7 +1575,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { Function *CalleeFn = BaseFn->getCalledFunction(); if (CalleeFn && - TLI->getLibFunc(CalleeFn->getName(), LibFn) && TLI->has(LibFn)) { + TLI->getLibFunc(CalleeFn->getName(), LibFn) && + isLibFuncEmittable(M, TLI, LibFn)) { StringRef ExpName; Intrinsic::ID ID; Value *ExpFn; @@ -1620,7 +1628,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { // pow(2.0, itofp(x)) -> ldexp(1.0, x) if (match(Base, m_SpecificFP(2.0)) && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) && - hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) return copyFlags(*Pow, emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, @@ -1629,7 +1637,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { } // pow(2.0 ** n, x) -> exp2(n * x) - if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { + if (hasFloatFn(M, TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { APFloat BaseR = APFloat(1.0); BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored); BaseR = BaseR / *BaseF; @@ -1656,7 +1664,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { // pow(10.0, x) -> exp10(x) // TODO: There is no exp10() intrinsic yet, but some day there shall be one. if (match(Base, m_SpecificFP(10.0)) && - hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) + hasFloatFn(M, TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) return copyFlags(*Pow, emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l, B, Attrs)); @@ -1681,7 +1689,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration( Mod, Intrinsic::exp2, Ty), FMul, "exp2")); - else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) + else if (hasFloatFn(M, TLI, Ty, LibFunc_exp2, LibFunc_exp2f, + LibFunc_exp2l)) return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l, B, Attrs)); @@ -1702,7 +1711,8 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno, } // Otherwise, use the libcall for sqrt(). - if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl)) + if (hasFloatFn(M, TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, + LibFunc_sqrtl)) // TODO: We also should check that the target can in fact lower the sqrt() // libcall. We currently have no way to ask this question, so we ask if // the target has a sqrt() libcall, which is not exactly the same. @@ -1892,8 +1902,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { // Shrink pow() to powf() if the arguments are single precision, // unless the result is expected to be double precision. 
if (UnsafeFPShrink && Name == TLI->getName(LibFunc_pow) && - hasFloatVersion(Name)) { - if (Value *Shrunk = optimizeBinaryDoubleFP(Pow, B, true)) + hasFloatVersion(M, Name)) { + if (Value *Shrunk = optimizeBinaryDoubleFP(Pow, B, TLI, true)) return Shrunk; } @@ -1901,13 +1911,14 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { } Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); AttributeList Attrs; // Attributes are only meaningful on the original call StringRef Name = Callee->getName(); Value *Ret = nullptr; if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) && - hasFloatVersion(Name)) - Ret = optimizeUnaryDoubleFP(CI, B, true); + hasFloatVersion(M, Name)) + Ret = optimizeUnaryDoubleFP(CI, B, TLI, true); Type *Ty = CI->getType(); Value *Op = CI->getArgOperand(0); @@ -1915,7 +1926,7 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) { // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= IntSize // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < IntSize if ((isa<SIToFPInst>(Op) || isa<UIToFPInst>(Op)) && - hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { if (Value *Exp = getIntToFPVal(Op, B, TLI->getIntSize())) return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, @@ -1926,12 +1937,14 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) { } Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); + // If we can shrink the call to a float function rather than a double // function, do that first. Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); - if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name)) - if (Value *Ret = optimizeBinaryDoubleFP(CI, B)) + if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(M, Name)) + if (Value *Ret = optimizeBinaryDoubleFP(CI, B, TLI)) return Ret; // The LLVM intrinsics minnum/maxnum correspond to fmin/fmax. Canonicalize to @@ -1962,8 +1975,8 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { Type *Ty = Log->getType(); Value *Ret = nullptr; - if (UnsafeFPShrink && hasFloatVersion(LogNm)) - Ret = optimizeUnaryDoubleFP(Log, B, true); + if (UnsafeFPShrink && hasFloatVersion(Mod, LogNm)) + Ret = optimizeUnaryDoubleFP(Log, B, TLI, true); // The earlier call must also be 'fast' in order to do these transforms. CallInst *Arg = dyn_cast<CallInst>(Log->getArgOperand(0)); @@ -2071,7 +2084,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { Log->doesNotAccessMemory() ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), Arg->getOperand(0), "log") - : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs); + : emitUnaryFloatFnCall(Arg->getOperand(0), TLI, LogNm, B, Attrs); Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul"); // Since pow() may have side effects, e.g. errno, // dead code elimination may not be trusted to remove it. @@ -2094,7 +2107,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { Value *LogE = Log->doesNotAccessMemory() ? 
B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), Eul, "log") - : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs); + : emitUnaryFloatFnCall(Eul, TLI, LogNm, B, Attrs); Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul"); // Since exp() may have side effects, e.g. errno, // dead code elimination may not be trusted to remove it. @@ -2106,14 +2119,16 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { } Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; // TODO: Once we have a way (other than checking for the existince of the // libcall) to tell whether our target can lower @llvm.sqrt, relax the // condition below. - if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" || - Callee->getIntrinsicID() == Intrinsic::sqrt)) - Ret = optimizeUnaryDoubleFP(CI, B, true); + if (isLibFuncEmittable(M, TLI, LibFunc_sqrtf) && + (Callee->getName() == "sqrt" || + Callee->getIntrinsicID() == Intrinsic::sqrt)) + Ret = optimizeUnaryDoubleFP(CI, B, TLI, true); if (!CI->isFast()) return Ret; @@ -2158,7 +2173,6 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { // If we found a repeated factor, hoist it out of the square root and // replace it with the fabs of that factor. - Module *M = Callee->getParent(); Type *ArgType = I->getType(); Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType); Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs"); @@ -2175,11 +2189,12 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { // TODO: Generalize to handle any trig function and its inverse. Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; StringRef Name = Callee->getName(); - if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name)) - Ret = optimizeUnaryDoubleFP(CI, B, true); + if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(M, Name)) + Ret = optimizeUnaryDoubleFP(CI, B, TLI, true); Value *Op1 = CI->getArgOperand(0); auto *OpC = dyn_cast<CallInst>(Op1); @@ -2195,7 +2210,8 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) { // tanl(atanl(x)) -> x LibFunc Func; Function *F = OpC->getCalledFunction(); - if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && + if (F && TLI->getLibFunc(F->getName(), Func) && + isLibFuncEmittable(M, TLI, Func) && ((Func == LibFunc_atan && Callee->getName() == "tan") || (Func == LibFunc_atanf && Callee->getName() == "tanf") || (Func == LibFunc_atanl && Callee->getName() == "tanl"))) @@ -2211,9 +2227,10 @@ static bool isTrigLibCall(CallInst *CI) { CI->hasFnAttr(Attribute::ReadNone); } -static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, +static bool insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, bool UseFloat, Value *&Sin, Value *&Cos, - Value *&SinCos) { + Value *&SinCos, const TargetLibraryInfo *TLI) { + Module *M = OrigCallee->getParent(); Type *ArgTy = Arg->getType(); Type *ResTy; StringRef Name; @@ -2233,9 +2250,12 @@ static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, ResTy = StructType::get(ArgTy, ArgTy); } - Module *M = OrigCallee->getParent(); - FunctionCallee Callee = - M->getOrInsertFunction(Name, OrigCallee->getAttributes(), ResTy, ArgTy); + if (!isLibFuncEmittable(M, TLI, Name)) + return false; + LibFunc TheLibFunc; + 
TLI->getLibFunc(Name, TheLibFunc); + FunctionCallee Callee = getOrInsertLibFunc( + M, *TLI, TheLibFunc, OrigCallee->getAttributes(), ResTy, ArgTy); if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) { // If the argument is an instruction, it must dominate all uses so put our @@ -2259,6 +2279,8 @@ static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1), "cospi"); } + + return true; } Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) { @@ -2286,7 +2308,9 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) { return nullptr; Value *Sin, *Cos, *SinCos; - insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos); + if (!insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, + SinCos, TLI)) + return nullptr; auto replaceTrigInsts = [this](SmallVectorImpl<CallInst *> &Calls, Value *Res) { @@ -2307,6 +2331,7 @@ void LibCallSimplifier::classifyArgUse( SmallVectorImpl<CallInst *> &CosCalls, SmallVectorImpl<CallInst *> &SinCosCalls) { CallInst *CI = dyn_cast<CallInst>(Val); + Module *M = CI->getModule(); if (!CI || CI->use_empty()) return; @@ -2317,7 +2342,8 @@ void LibCallSimplifier::classifyArgUse( Function *Callee = CI->getCalledFunction(); LibFunc Func; - if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) || + if (!Callee || !TLI->getLibFunc(*Callee, Func) || + !isLibFuncEmittable(M, TLI, Func) || !isTrigLibCall(CI)) return; @@ -2532,6 +2558,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizePrintFString(CI, B)) { @@ -2540,10 +2567,10 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) { // printf(format, ...) -> iprintf(format, ...) if no floating point // arguments. - if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - FunctionCallee IPrintFFn = - M->getOrInsertFunction("iprintf", FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_iprintf) && + !callHasFloatingPointArgument(CI)) { + FunctionCallee IPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_iprintf, FT, + Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(IPrintFFn); B.Insert(New); @@ -2552,11 +2579,10 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) { // printf(format, ...) -> __small_printf(format, ...) if no 128-bit floating point // arguments. 
- if (TLI->has(LibFunc_small_printf) && !callHasFP128Argument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - auto SmallPrintFFn = - M->getOrInsertFunction(TLI->getName(LibFunc_small_printf), - FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_small_printf) && + !callHasFP128Argument(CI)) { + auto SmallPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_small_printf, FT, + Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(SmallPrintFFn); B.Insert(New); @@ -2655,6 +2681,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, } Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizeSPrintFString(CI, B)) { @@ -2663,10 +2690,10 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) { // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating // point arguments. - if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - FunctionCallee SIPrintFFn = - M->getOrInsertFunction("siprintf", FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_siprintf) && + !callHasFloatingPointArgument(CI)) { + FunctionCallee SIPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_siprintf, + FT, Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(SIPrintFFn); B.Insert(New); @@ -2675,11 +2702,10 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) { // sprintf(str, format, ...) -> __small_sprintf(str, format, ...) if no 128-bit // floating point arguments. - if (TLI->has(LibFunc_small_sprintf) && !callHasFP128Argument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - auto SmallSPrintFFn = - M->getOrInsertFunction(TLI->getName(LibFunc_small_sprintf), - FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_small_sprintf) && + !callHasFP128Argument(CI)) { + auto SmallSPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_small_sprintf, FT, + Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(SmallSPrintFFn); B.Insert(New); @@ -2835,6 +2861,7 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, } Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizeFPrintFString(CI, B)) { @@ -2843,10 +2870,10 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) { // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no // floating point arguments. - if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - FunctionCallee FIPrintFFn = - M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_fiprintf) && + !callHasFloatingPointArgument(CI)) { + FunctionCallee FIPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_fiprintf, + FT, Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(FIPrintFFn); B.Insert(New); @@ -2855,11 +2882,11 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) { // fprintf(stream, format, ...) 
-> __small_fprintf(stream, format, ...) if no // 128-bit floating point arguments. - if (TLI->has(LibFunc_small_fprintf) && !callHasFP128Argument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); + if (isLibFuncEmittable(M, TLI, LibFunc_small_fprintf) && + !callHasFP128Argument(CI)) { auto SmallFPrintFFn = - M->getOrInsertFunction(TLI->getName(LibFunc_small_fprintf), - FT, Callee->getAttributes()); + getOrInsertLibFunc(M, *TLI, LibFunc_small_fprintf, FT, + Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(SmallFPrintFFn); B.Insert(New); @@ -2944,21 +2971,19 @@ Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) { CI->getArgOperand(2))); } -bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { - LibFunc Func; +bool LibCallSimplifier::hasFloatVersion(const Module *M, StringRef FuncName) { SmallString<20> FloatFuncName = FuncName; FloatFuncName += 'f'; - if (TLI->getLibFunc(FloatFuncName, Func)) - return TLI->has(Func); - return false; + return isLibFuncEmittable(M, TLI, FloatFuncName); } Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, IRBuilderBase &Builder) { + Module *M = CI->getModule(); LibFunc Func; Function *Callee = CI->getCalledFunction(); // Check for string/memory library functions. - if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) { + if (TLI->getLibFunc(*Callee, Func) && isLibFuncEmittable(M, TLI, Func)) { // Make sure we never change the calling convention. assert( (ignoreCallingConv(Func) || @@ -3039,6 +3064,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, LibFunc Func, IRBuilderBase &Builder) { + const Module *M = CI->getModule(); + // Don't optimize calls that require strict floating point semantics. if (CI->isStrictFP()) return nullptr; @@ -3117,12 +3144,12 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, case LibFunc_sin: case LibFunc_sinh: case LibFunc_tanh: - if (UnsafeFPShrink && hasFloatVersion(CI->getCalledFunction()->getName())) - return optimizeUnaryDoubleFP(CI, Builder, true); + if (UnsafeFPShrink && hasFloatVersion(M, CI->getCalledFunction()->getName())) + return optimizeUnaryDoubleFP(CI, Builder, TLI, true); return nullptr; case LibFunc_copysign: - if (hasFloatVersion(CI->getCalledFunction()->getName())) - return optimizeBinaryDoubleFP(CI, Builder); + if (hasFloatVersion(M, CI->getCalledFunction()->getName())) + return optimizeBinaryDoubleFP(CI, Builder, TLI); return nullptr; case LibFunc_fminf: case LibFunc_fmin: @@ -3141,6 +3168,7 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, } Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { + Module *M = CI->getModule(); assert(!CI->isMustTailCall() && "These transforms aren't musttail safe."); // TODO: Split out the code below that operates on FP calls so that @@ -3219,7 +3247,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { } // Then check for known library functions. - if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) { + if (TLI->getLibFunc(*Callee, Func) && isLibFuncEmittable(M, TLI, Func)) { // We never change the calling convention. 
if (!ignoreCallingConv(Func) && !IsCallingConvC) return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 5ecee44..d3a944c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -441,6 +441,26 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, return false; } +/// Returns true if A and B have same pointer operands or same SCEVs addresses +static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A, + StoreInst *B) { + // Compare store + if (A == B) + return true; + + // Otherwise Compare pointers + Value *APtr = A->getPointerOperand(); + Value *BPtr = B->getPointerOperand(); + if (APtr == BPtr) + return true; + + // Otherwise compare address SCEVs + if (SE->getSCEV(APtr) == SE->getSCEV(BPtr)) + return true; + + return false; +} + int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, Value *Ptr) const { const ValueToValueMap &Strides = @@ -678,7 +698,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { RecurrenceDescriptor RedDes; if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, - DT)) { + DT, PSE.getSE())) { Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); AllowedExit.insert(RedDes.getLoopExitInstr()); Reductions[Phi] = RedDes; @@ -772,7 +792,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { auto *SE = PSE.getSE(); Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) - if (hasVectorIntrinsicScalarOpd(IntrinID, i)) { + if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) { if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) { reportVectorizationFailure("Found unvectorizable intrinsic", "intrinsic instruction cannot be vectorized", @@ -913,11 +933,66 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (!LAI->canVectorizeMemory()) return false; - if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { - reportVectorizationFailure("Stores to a uniform address", - "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); - return false; + // We can vectorize stores to invariant address when final reduction value is + // guaranteed to be stored at the end of the loop. Also, if decision to + // vectorize loop is made, runtime checks are added so as to make sure that + // invariant address won't alias with any other objects. + if (!LAI->getStoresToInvariantAddresses().empty()) { + // For each invariant address, check its last stored value is unconditional. + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isInvariantStoreOfReduction(SI) && + blockNeedsPredication(SI->getParent())) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write of conditional recurring variant value to a loop " + "invariant address could not be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } + + if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { + // For each invariant address, check its last stored value is the result + // of one of our reductions. + // + // We do not check if dependence with loads exists because they are + // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this + // behaviour changes we have to modify this code. 
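The storeToSameAddress() helper added above compares two stores in three steps: identity, raw pointer operands, and finally SCEV equality of the addresses. Below is a minimal stand-alone model of that check; ToyStore and the string-based CanonicalAddress field are assumptions standing in for StoreInst and ScalarEvolution::getSCEV(), used for illustration only:

#include <string>

// Toy stand-ins for StoreInst and the SCEV of its pointer operand.
struct ToyStore {
  const void *PointerOperand;   // raw pointer operand of the store
  std::string CanonicalAddress; // stand-in for SE->getSCEV(PointerOperand)
};

// Mirrors storeToSameAddress(): same store, same pointer operand, or same
// canonicalized address.
static bool storesToSameAddress(const ToyStore &A, const ToyStore &B) {
  if (&A == &B)
    return true;
  if (A.PointerOperand == B.PointerOperand)
    return true;
  return A.CanonicalAddress == B.CanonicalAddress;
}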
+ ScalarEvolution *SE = PSE.getSE(); + SmallVector<StoreInst *, 4> UnhandledStores; + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isInvariantStoreOfReduction(SI)) { + // Earlier stores to this address are effectively deadcode. + // With opaque pointers it is possible for one pointer to be used with + // different sizes of stored values: + // store i32 0, ptr %x + // store i8 0, ptr %x + // The latest store doesn't complitely overwrite the first one in the + // example. That is why we have to make sure that types of stored + // values are same. + // TODO: Check that bitwidth of unhandled store is smaller then the + // one that overwrites it and add a test. + erase_if(UnhandledStores, [SE, SI](StoreInst *I) { + return storeToSameAddress(SE, SI, I) && + I->getValueOperand()->getType() == + SI->getValueOperand()->getType(); + }); + continue; + } + UnhandledStores.push_back(SI); + } + + bool IsOK = UnhandledStores.empty(); + // TODO: we should also validate against InvariantMemSets. + if (!IsOK) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write to a loop invariant address could not " + "be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); @@ -944,13 +1019,34 @@ bool LoopVectorizationLegality::canVectorizeFPMath( // We can now only vectorize if all reductions with Exact FP math also // have the isOrdered flag set, which indicates that we can move the - // reduction operations in-loop. + // reduction operations in-loop, and do not have intermediate store. return (all_of(getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; - return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered(); + return !RdxDesc.hasExactFPMath() || + (RdxDesc.isOrdered() && !RdxDesc.IntermediateStore); })); } +bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) { + return any_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return RdxDesc.IntermediateStore == SI; + }); +} + +bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) { + return any_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + if (!RdxDesc.IntermediateStore) + return false; + + ScalarEvolution *SE = PSE.getSE(); + Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand(); + return V == InvariantAddress || + SE->getSCEV(V) == SE->getSCEV(InvariantAddress); + }); +} + bool LoopVectorizationLegality::isInductionPhi(const Value *V) const { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3bedf4b..d59abd2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3998,6 +3998,17 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // Set the resume value for this reduction ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); + // If there were stores of the reduction value to a uniform memory address + // inside the loop, create the final store here. 
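The comment above marks the point where the vectorizer now materializes a single store of the final reduction value. A source-level sketch of the overall effect, with illustrative names (sum_before/sum_after are not from the patch) and assuming the loop is known to execute at least once so the store happens in both versions:

// Before: the running sum is stored to the loop-invariant address *Out on
// every iteration.
void sum_before(const int *A, int N, int *Out) {
  int S = 0;
  for (int I = 0; I < N; ++I) {
    S += A[I];
    *Out = S; // intermediate store to an invariant address
  }
}

// After (conceptually): the body only accumulates; fixReduction() emits one
// store of the reduced value in the exit block.
void sum_after(const int *A, int N, int *Out) {
  int S = 0;
  for (int I = 0; I < N; ++I)
    S += A[I];
  *Out = S; // single final store created outside the loop
}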
+ if (StoreInst *SI = RdxDesc.IntermediateStore) { + StoreInst *NewSI = + Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); + propagateMetadata(NewSI, SI); + + // If the reduction value is used in other places, + // then let the code below create PHI's for that. + } + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -4244,13 +4255,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, // Some intrinsics have a scalar argument - don't replace it with a // vector. Value *Arg; - if (!UseVectorIntrinsic || !hasVectorIntrinsicScalarOpd(ID, I.index())) + if (!UseVectorIntrinsic || + !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) Arg = State.get(I.value(), Part); - else { + else Arg = State.get(I.value(), VPIteration(0, 0)); - if (hasVectorIntrinsicOverloadedScalarOpd(ID, I.index())) - TysForDecl.push_back(Arg->getType()); - } + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) + TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); } @@ -7340,6 +7351,16 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + // Find all stores to invariant variables. Since they are going to sink + // outside the loop we do not need calculate cost for them. + for (BasicBlock *BB : TheLoop->blocks()) + for (Instruction &I : *BB) { + StoreInst *SI; + if ((SI = dyn_cast<StoreInst>(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + ValuesToIgnore.insert(&I); + } + // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { @@ -8329,6 +8350,8 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, return nullptr; auto willWiden = [&](ElementCount VF) -> bool { + if (VF.isScalar()) + return false; Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // The following case may be scalarized depending on the VF. // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -8843,6 +8866,13 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( continue; } + // Invariant stores inside loop will be deleted and a single store + // with the final reduction value will be added to the exit block + StoreInst *SI; + if ((SI = dyn_cast<StoreInst>(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + continue; + // Otherwise, if all widening options failed, Instruction is to be // replicated. This may create a successor for VPBB. VPBasicBlock *NextVPBB = diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a6b1bb8..4583308 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -641,7 +641,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { - if (hasVectorIntrinsicScalarOpd(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) return (CI->getArgOperand(i) == Scalar); } LLVM_FALLTHROUGH; @@ -2042,6 +2042,36 @@ public: DeletedInstructions.insert(I); } + /// Checks if the instruction was already analyzed for being possible + /// reduction root. 
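The new BoUpSLP members cache which instructions were already examined as reduction roots and which reduced-value lists were already checked, so repeated matching in the same block can be skipped cheaply. A stand-alone model of that memoization follows; std::unordered_set and the toy hash combiner stand in for the SmallPtrSet/DenseSet and llvm::hash_value used by the real code:

#include <cstddef>
#include <functional>
#include <unordered_set>
#include <vector>

struct ToyValue {}; // stand-in for llvm::Value

class ReductionAnalysisCache {
  std::unordered_set<const ToyValue *> AnalyzedRoots; // analyzed reduction roots
  std::unordered_set<std::size_t> AnalyzedValLists;   // hashes of checked lists

  // Toy replacement for hash_value(ArrayRef<Value *>).
  static std::size_t hashList(const std::vector<const ToyValue *> &VL) {
    std::size_t H = VL.size();
    for (const ToyValue *V : VL)
      H ^= std::hash<const ToyValue *>()(V) + 0x9e3779b9 + (H << 6) + (H >> 2);
    return H;
  }

public:
  bool isAnalyzedRoot(const ToyValue *I) const { return AnalyzedRoots.count(I); }
  void noteRoot(const ToyValue *I) { AnalyzedRoots.insert(I); }
  bool areAnalyzedVals(const std::vector<const ToyValue *> &VL) const {
    return AnalyzedValLists.count(hashList(VL));
  }
  void noteVals(const std::vector<const ToyValue *> &VL) {
    AnalyzedValLists.insert(hashList(VL));
  }
  // Reset per basic block, as clearReductionData() does.
  void clear() {
    AnalyzedRoots.clear();
    AnalyzedValLists.clear();
  }
};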
+ bool isAnalizedReductionRoot(Instruction *I) const { + return AnalizedReductionsRoots.count(I); + } + /// Register given instruction as already analyzed for being possible + /// reduction root. + void analyzedReductionRoot(Instruction *I) { + AnalizedReductionsRoots.insert(I); + } + /// Checks if the provided list of reduced values was checked already for + /// vectorization. + bool areAnalyzedReductionVals(ArrayRef<Value *> VL) { + return AnalyzedReductionVals.contains(hash_value(VL)); + } + /// Adds the list of reduced values to list of already checked values for the + /// vectorization. + void analyzedReductionVals(ArrayRef<Value *> VL) { + AnalyzedReductionVals.insert(hash_value(VL)); + } + /// Clear the list of the analyzed reduction root instructions. + void clearReductionData() { + AnalizedReductionsRoots.clear(); + AnalyzedReductionVals.clear(); + } + /// Checks if the given value is gathered in one of the nodes. + bool isGathered(Value *V) const { + return MustGather.contains(V); + } + ~BoUpSLP(); private: @@ -2603,6 +2633,12 @@ private: /// previously deleted instruction. DenseSet<Instruction *> DeletedInstructions; + /// Set of the instruction, being analyzed already for reductions. + SmallPtrSet<Instruction *, 16> AnalizedReductionsRoots; + + /// Set of hashes for the list of reduction values already being analyzed. + DenseSet<size_t> AnalyzedReductionVals; + /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User /// can be nullptr, it means that this Internal Scalar will be used later, @@ -4041,6 +4077,83 @@ static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) { } #endif +/// Generates key/subkey pair for the given value to provide effective sorting +/// of the values and better detection of the vectorizable values sequences. The +/// keys/subkeys can be used for better sorting of the values themselves (keys) +/// and in values subgroups (subkeys). +static std::pair<size_t, size_t> generateKeySubkey( + Value *V, const TargetLibraryInfo *TLI, + function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, + bool AllowAlternate) { + hash_code Key = hash_value(V->getValueID() + 2); + hash_code SubKey = hash_value(0); + // Sort the loads by the distance between the pointers. + if (auto *LI = dyn_cast<LoadInst>(V)) { + Key = hash_combine(hash_value(Instruction::Load), Key); + if (LI->isSimple()) + SubKey = hash_value(LoadsSubkeyGenerator(Key, LI)); + else + SubKey = hash_value(LI); + } else if (isVectorLikeInstWithConstOps(V)) { + // Sort extracts by the vector operands. + if (isa<ExtractElementInst, UndefValue>(V)) + Key = hash_value(Value::UndefValueVal + 1); + if (auto *EI = dyn_cast<ExtractElementInst>(V)) { + if (!isUndefVector(EI->getVectorOperand()) && + !isa<UndefValue>(EI->getIndexOperand())) + SubKey = hash_value(EI->getVectorOperand()); + } + } else if (auto *I = dyn_cast<Instruction>(V)) { + // Sort other instructions just by the opcodes except for CMPInst. + // For CMP also sort by the predicate kind. + if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) && + isValidForAlternation(I->getOpcode())) { + if (AllowAlternate) + Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0); + else + Key = hash_combine(hash_value(I->getOpcode()), Key); + SubKey = hash_combine( + hash_value(I->getOpcode()), hash_value(I->getType()), + hash_value(isa<BinaryOperator>(I) + ? 
I->getType() + : cast<CastInst>(I)->getOperand(0)->getType())); + } else if (auto *CI = dyn_cast<CmpInst>(I)) { + CmpInst::Predicate Pred = CI->getPredicate(); + if (CI->isCommutative()) + Pred = std::min(Pred, CmpInst::getInversePredicate(Pred)); + CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred); + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred), + hash_value(SwapPred), + hash_value(CI->getOperand(0)->getType())); + } else if (auto *Call = dyn_cast<CallInst>(I)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI); + if (isTriviallyVectorizable(ID)) + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID)); + else if (!VFDatabase(*Call).getMappings(*Call).empty()) + SubKey = hash_combine(hash_value(I->getOpcode()), + hash_value(Call->getCalledFunction())); + else + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call)); + for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos()) + SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End), + hash_value(Op.Tag), SubKey); + } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) { + if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1))) + SubKey = hash_value(Gep->getPointerOperand()); + else + SubKey = hash_value(Gep); + } else if (BinaryOperator::isIntDivRem(I->getOpcode()) && + !isa<ConstantInt>(I->getOperand(1))) { + // Do not try to vectorize instructions with potentially high cost. + SubKey = hash_value(I); + } else { + SubKey = hash_value(I->getOpcode()); + } + Key = hash_combine(hash_value(I->getParent()), Key); + } + return std::make_pair(Key, SubKey); +} + void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -4742,7 +4855,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, unsigned NumArgs = CI->arg_size(); SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); for (unsigned j = 0; j != NumArgs; ++j) - if (hasVectorIntrinsicScalarOpd(ID, j)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); for (Value *V : VL) { CallInst *CI2 = dyn_cast<CallInst>(V); @@ -4761,7 +4874,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Some intrinsics have scalar arguments and should be same in order for // them to be vectorized. for (unsigned j = 0; j != NumArgs; ++j) { - if (hasVectorIntrinsicScalarOpd(ID, j)) { + if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) { Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); @@ -4794,7 +4907,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { // For scalar operands no need to to create an entry since no need to // vectorize it. - if (hasVectorIntrinsicScalarOpd(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) continue; ValueList Operands; // Prepare the operand vector. @@ -6238,10 +6351,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { ShuffleMask.emplace_back(VF.back(), UndefMaskElem); // Find the insertvector, vectorized in tree, if any. Value *Base = VU; - while (isa<InsertElementInst>(Base)) { + while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { // Build the mask for the vectorized insertelement instructions. 
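The generateKeySubkey() helper above assigns every candidate value a (key, subkey) pair so that later code can bucket possible reduced values by kind (opcode, compare predicate, load pointer group, and so on) and try the largest buckets first. A simplified stand-alone model of that bucket-and-sort step; ToyVal and keySubkey() are illustrative stand-ins, not the real hashing:

#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct ToyVal {
  std::string Opcode;   // e.g. "load", "add", "icmp eq"
  int PointerGroup = 0; // stand-in for the load-distance subkey
};

// Stand-in for generateKeySubkey(): key = opcode, subkey = pointer group.
static std::pair<std::string, int> keySubkey(const ToyVal &V) {
  return {V.Opcode, V.PointerGroup};
}

// Bucket the candidates by (key, subkey) and return the groups ordered by
// decreasing size, mirroring how reduction candidates are regrouped before
// vectorization is attempted.
static std::vector<std::vector<ToyVal>>
groupReducedVals(const std::vector<ToyVal> &Candidates) {
  std::map<std::pair<std::string, int>, std::vector<ToyVal>> Buckets;
  for (const ToyVal &V : Candidates)
    Buckets[keySubkey(V)].push_back(V);

  std::vector<std::vector<ToyVal>> Groups;
  for (auto &KV : Buckets)
    Groups.push_back(std::move(KV.second));
  std::stable_sort(Groups.begin(), Groups.end(),
                   [](const std::vector<ToyVal> &A, const std::vector<ToyVal> &B) {
                     return A.size() > B.size();
                   });
  return Groups;
}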
- if (const TreeEntry *E = getTreeEntry(Base)) { - VU = cast<InsertElementInst>(Base); + if (const TreeEntry *E = getTreeEntry(IEBase)) { + VU = IEBase; do { int Idx = E->findLaneForValue(Base); ShuffleMask.back()[Idx] = Idx; @@ -6257,8 +6370,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { } else { VecId = std::distance(FirstUsers.begin(), It); } - ShuffleMask[VecId][*InsertIdx] = EU.Lane; - DemandedElts[VecId].setBit(*InsertIdx); + int InIdx = *InsertIdx; + ShuffleMask[VecId][InIdx] = EU.Lane; + DemandedElts[VecId].setBit(InIdx); continue; } } @@ -6459,6 +6573,12 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, } } + if (UsedTEs.empty()) { + assert(all_of(TE->Scalars, UndefValue::classof) && + "Expected vector of undefs only."); + return None; + } + unsigned VF = 0; if (UsedTEs.size() == 1) { // Try to find the perfect match in another gather node at first. @@ -6612,11 +6732,15 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // should not be scheduled. if (E->State != TreeEntry::NeedToGather && doesNotNeedToSchedule(E->Scalars)) { - BasicBlock::iterator InsertPt; + Instruction *InsertInst; if (all_of(E->Scalars, isUsedOutsideBlock)) - InsertPt = FindLastInst()->getIterator(); + InsertInst = FindLastInst(); else - InsertPt = FindFirstInst()->getIterator(); + InsertInst = FindFirstInst(); + // If the instruction is PHI, set the insert point after all the PHIs. + if (isa<PHINode>(InsertInst)) + InsertInst = BB->getFirstNonPHI(); + BasicBlock::iterator InsertPt = InsertInst->getIterator(); Builder.SetInsertPoint(BB, InsertPt); Builder.SetCurrentDebugLocation(Front->getDebugLoc()); return; @@ -6658,13 +6782,17 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // not ideal. However, this should be exceedingly rare since it requires that // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). - if (!LastInst) + if (!LastInst) { LastInst = FindLastInst(); + // If the instruction is PHI, set the insert point after all the PHIs. + if (isa<PHINode>(LastInst)) + LastInst = BB->getFirstNonPHI()->getPrevNode(); + } assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. - Builder.SetInsertPoint(BB, ++LastInst->getIterator()); + Builder.SetInsertPoint(BB, std::next(LastInst->getIterator())); Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -7358,11 +7486,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. 
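isVectorIntrinsicWithScalarOpAtArg() (the renamed hasVectorIntrinsicScalarOpd) tells the vectorizers which operands of an intrinsic call must be passed through unchanged when the call is widened, for example the scalar exponent of llvm.powi. The toy model below only shows the shape of that decision; all types and names are illustrative, not VPlan or SLP data structures:

#include <cstddef>
#include <functional>
#include <vector>

using ToyScalar = double;
using ToyVector = std::vector<double>; // stand-in for a widened operand

struct WidenedArg {
  bool IsScalar = false; // true: keep the original scalar operand
  ToyScalar Scalar = 0;  // valid when IsScalar
  ToyVector Vector;      // valid otherwise
};

// Widen the argument list of a call: operands flagged as "scalar op at arg"
// are forwarded as-is, everything else gets its widened per-lane value.
static std::vector<WidenedArg>
widenCallArgs(const std::vector<ToyScalar> &ScalarArgs,
              const std::vector<ToyVector> &WidenedOperands,
              const std::function<bool(std::size_t)> &IsScalarOpAtArg) {
  std::vector<WidenedArg> Args;
  for (std::size_t I = 0, E = ScalarArgs.size(); I != E; ++I) {
    WidenedArg A;
    A.IsScalar = IsScalarOpAtArg(I);
    if (A.IsScalar)
      A.Scalar = ScalarArgs[I];      // stays scalar in the vectorized call
    else
      A.Vector = WidenedOperands[I]; // per-lane values from the vector loop
    Args.push_back(A);
  }
  return Args;
}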
- if (UseIntrinsic && hasVectorIntrinsicScalarOpd(IID, j)) { + if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) { CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); - if (hasVectorIntrinsicOverloadedScalarOpd(IID, j)) + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) TysForDecl.push_back(ScalarArg->getType()); continue; } @@ -7370,6 +7498,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *OpVec = vectorizeTree(E->getOperand(j)); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + TysForDecl.push_back(OpVec->getType()); } Function *CF; @@ -8804,6 +8934,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { + // Start new block - clear the list of reduction roots. + R.clearReductionData(); collectSeedInstructions(BB); // Vectorize trees that end at stores. @@ -9273,15 +9405,16 @@ class HorizontalReduction { using ReductionOpsType = SmallVector<Value *, 16>; using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; ReductionOpsListType ReductionOps; - SmallVector<Value *, 32> ReducedVals; + /// List of possibly reduced values. + SmallVector<SmallVector<Value *>> ReducedVals; + /// Maps reduced value to the corresponding reduction operation. + DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps; // Use map vector to make stable output. MapVector<Instruction *, Value *> ExtraArgs; WeakTrackingVH ReductionRoot; /// The type of reduction operation. RecurKind RdxKind; - const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max(); - static bool isCmpSelMinMax(Instruction *I) { return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); @@ -9325,26 +9458,6 @@ class HorizontalReduction { return I->getOperand(Index); } - /// Checks if the ParentStackElem.first should be marked as a reduction - /// operation with an extra argument or as extra argument itself. - void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, - Value *ExtraArg) { - if (ExtraArgs.count(ParentStackElem.first)) { - ExtraArgs[ParentStackElem.first] = nullptr; - // We ran into something like: - // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. - // The whole ParentStackElem.first should be considered as an extra value - // in this case. - // Do not perform analysis of remaining operands of ParentStackElem.first - // instruction, this whole instruction is an extra argument. - ParentStackElem.second = INVALID_OPERAND_INDEX; - } else { - // We ran into something like: - // ParentStackElem.first += ... + ExtraArg + ... - ExtraArgs[ParentStackElem.first] = ExtraArg; - } - } - /// Creates reduction operation with the current opcode. static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, Value *RHS, const Twine &Name, bool UseSelect) { @@ -9429,7 +9542,7 @@ class HorizontalReduction { /// Creates reduction operation with the current opcode with the IR flags /// from \p I. 
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, - Value *RHS, const Twine &Name, Instruction *I) { + Value *RHS, const Twine &Name, Value *I) { auto *SelI = dyn_cast<SelectInst>(I); Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr); if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { @@ -9440,8 +9553,10 @@ class HorizontalReduction { return Op; } - static RecurKind getRdxKind(Instruction *I) { - assert(I && "Expected instruction for reduction matching"); + static RecurKind getRdxKind(Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + return RecurKind::None; if (match(I, m_Add(m_Value(), m_Value()))) return RecurKind::Add; if (match(I, m_Mul(m_Value(), m_Value()))) @@ -9603,7 +9718,9 @@ public: HorizontalReduction() = default; /// Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) { + bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst, + ScalarEvolution &SE, const DataLayout &DL, + const TargetLibraryInfo &TLI) { assert((!Phi || is_contained(Phi->operands(), Inst)) && "Phi needs to use the binary operator"); assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) || @@ -9647,124 +9764,168 @@ public: ReductionRoot = Inst; - // The opcode for leaf values that we perform a reduction on. - // For example: load(x) + load(y) + load(z) + fptoui(w) - // The leaf opcode for 'w' does not match, so we don't include it as a - // potential candidate for the reduction. - unsigned LeafOpcode = 0; - - // Post-order traverse the reduction tree starting at Inst. We only handle - // true trees containing binary operators or selects. - SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; - Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst))); - initReductionOps(Inst); - while (!Stack.empty()) { - Instruction *TreeN = Stack.back().first; - unsigned EdgeToVisit = Stack.back().second++; - const RecurKind TreeRdxKind = getRdxKind(TreeN); - bool IsReducedValue = TreeRdxKind != RdxKind; - - // Postorder visit. - if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) { - if (IsReducedValue) - ReducedVals.push_back(TreeN); - else { - auto ExtraArgsIter = ExtraArgs.find(TreeN); - if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) { - // Check if TreeN is an extra argument of its parent operation. - if (Stack.size() <= 1) { - // TreeN can't be an extra argument as it is a root reduction - // operation. - return false; - } - // Yes, TreeN is an extra argument, do not add it to a list of - // reduction operations. - // Stack[Stack.size() - 2] always points to the parent operation. - markExtraArg(Stack[Stack.size() - 2], TreeN); - ExtraArgs.erase(TreeN); - } else - addReductionOps(TreeN); + // Iterate through all the operands of the possible reduction tree and + // gather all the reduced values, sorting them by their value id. + BasicBlock *BB = Inst->getParent(); + bool IsCmpSelMinMax = isCmpSelMinMax(Inst); + SmallVector<Instruction *> Worklist(1, Inst); + // Checks if the operands of the \p TreeN instruction are also reduction + // operations or should be treated as reduced values or an extra argument, + // which is not part of the reduction. 
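matchAssociativeReduction() now walks the candidate tree with an explicit worklist, and the CheckOperands lambda documented above classifies each operand as another reduction operation, a possible reduced value, or an extra argument from outside the block. A compact model of that classification; the real code additionally checks use counts, vectorizability, and vector-like constant operands, which are omitted here:

#include <vector>

struct ToyInst {
  int Opcode = 0;
  int BlockId = 0;                 // toy basic-block id
  std::vector<ToyInst *> Operands;
};

struct ClassifiedOperands {
  std::vector<ToyInst *> ReductionOps;        // same-kind ops, keep walking
  std::vector<ToyInst *> PossibleReducedVals; // leaves of the reduction
  std::vector<ToyInst *> ExtraArgs;           // defined in another block
};

// Toy version of CheckOperands(): operands from another block become extra
// arguments, operands with a different opcode become possible reduced
// values, and matching operands are further reduction operations.
static ClassifiedOperands classifyOperands(const ToyInst *TreeN, int RdxOpcode,
                                           int BlockId) {
  ClassifiedOperands C;
  for (ToyInst *Op : TreeN->Operands) {
    if (Op->BlockId != BlockId) {
      C.ExtraArgs.push_back(Op);
      continue;
    }
    if (Op->Opcode != RdxOpcode) {
      C.PossibleReducedVals.push_back(Op);
      continue;
    }
    C.ReductionOps.push_back(Op);
  }
  return C;
}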
+ auto &&CheckOperands = [this, IsCmpSelMinMax, + BB](Instruction *TreeN, + SmallVectorImpl<Value *> &ExtraArgs, + SmallVectorImpl<Value *> &PossibleReducedVals, + SmallVectorImpl<Instruction *> &ReductionOps) { + for (int I = getFirstOperandIndex(TreeN), + End = getNumberOfOperands(TreeN); + I < End; ++I) { + Value *EdgeVal = getRdxOperand(TreeN, I); + ReducedValsToOps[EdgeVal].push_back(TreeN); + auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); + // Edge has wrong parent - mark as an extra argument. + if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) && + !hasSameParent(EdgeInst, BB)) { + ExtraArgs.push_back(EdgeVal); + continue; } - // Retract. - Stack.pop_back(); - continue; - } - - // Visit operands. - Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit); - auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); - if (!EdgeInst) { - // Edge value is not a reduction instruction or a leaf instruction. - // (It may be a constant, function argument, or something else.) - markExtraArg(Stack.back(), EdgeVal); - continue; + // If the edge is not an instruction, or it is different from the main + // reduction opcode or has too many uses - possible reduced value. + if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind || + !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || + !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) { + PossibleReducedVals.push_back(EdgeVal); + continue; + } + ReductionOps.push_back(EdgeInst); } - RecurKind EdgeRdxKind = getRdxKind(EdgeInst); - // Continue analysis if the next operand is a reduction operation or - // (possibly) a leaf value. If the leaf value opcode is not set, - // the first met operation != reduction operation is considered as the - // leaf opcode. - // Only handle trees in the current basic block. - // Each tree node needs to have minimal number of users except for the - // ultimate reduction. - const bool IsRdxInst = EdgeRdxKind == RdxKind; - if (EdgeInst != Phi && EdgeInst != Inst && - hasSameParent(EdgeInst, Inst->getParent()) && - hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) && - (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) { - if (IsRdxInst) { - // We need to be able to reassociate the reduction operations. - if (!isVectorizable(EdgeRdxKind, EdgeInst)) { - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), EdgeInst); - continue; - } - } else if (!LeafOpcode) { - LeafOpcode = EdgeInst->getOpcode(); + }; + // Try to regroup reduced values so that it gets more profitable to try to + // reduce them. Values are grouped by their value ids, instructions - by + // instruction op id and/or alternate op id, plus do extra analysis for + // loads (grouping them by the distabce between pointers) and cmp + // instructions (grouping them by the predicate). + MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>> + PossibleReducedVals; + initReductionOps(Inst); + while (!Worklist.empty()) { + Instruction *TreeN = Worklist.pop_back_val(); + SmallVector<Value *> Args; + SmallVector<Value *> PossibleRedVals; + SmallVector<Instruction *> PossibleReductionOps; + CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps); + // If too many extra args - mark the instruction itself as a reduction + // value, not a reduction operation. + if (Args.size() < 2) { + addReductionOps(TreeN); + // Add extra args. 
+ if (!Args.empty()) { + assert(Args.size() == 1 && "Expected only single argument."); + ExtraArgs[TreeN] = Args.front(); } - Stack.push_back( - std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst))); - continue; + // Add reduction values. The values are sorted for better vectorization + // results. + for (Value *V : PossibleRedVals) { + size_t Key, Idx; + std::tie(Key, Idx) = generateKeySubkey( + V, &TLI, + [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { + for (const auto &LoadData : PossibleReducedVals[Key]) { + auto *RLI = cast<LoadInst>(LoadData.second.front().first); + if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), + LI->getType(), LI->getPointerOperand(), + DL, SE, /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + return hash_value(LI->getPointerOperand()); + }, + /*AllowAlternate=*/false); + ++PossibleReducedVals[Key][Idx] + .insert(std::make_pair(V, 0)) + .first->second; + } + Worklist.append(PossibleReductionOps.rbegin(), + PossibleReductionOps.rend()); + } else { + size_t Key, Idx; + std::tie(Key, Idx) = generateKeySubkey( + TreeN, &TLI, + [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { + for (const auto &LoadData : PossibleReducedVals[Key]) { + auto *RLI = cast<LoadInst>(LoadData.second.front().first); + if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), + LI->getType(), LI->getPointerOperand(), DL, + SE, /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + return hash_value(LI->getPointerOperand()); + }, + /*AllowAlternate=*/false); + ++PossibleReducedVals[Key][Idx] + .insert(std::make_pair(TreeN, 0)) + .first->second; } - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), EdgeInst); } + auto PossibleReducedValsVect = PossibleReducedVals.takeVector(); + // Sort values by the total number of values kinds to start the reduction + // from the longest possible reduced values sequences. + for (auto &PossibleReducedVals : PossibleReducedValsVect) { + auto PossibleRedVals = PossibleReducedVals.second.takeVector(); + SmallVector<SmallVector<Value *>> PossibleRedValsVect; + for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end(); + It != E; ++It) { + PossibleRedValsVect.emplace_back(); + auto RedValsVect = It->second.takeVector(); + stable_sort(RedValsVect, [](const auto &P1, const auto &P2) { + return P1.second < P2.second; + }); + for (const std::pair<Value *, unsigned> &Data : RedValsVect) + PossibleRedValsVect.back().append(Data.second, Data.first); + } + stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { + return P1.size() > P2.size(); + }); + ReducedVals.emplace_back(); + for (ArrayRef<Value *> Data : PossibleRedValsVect) + ReducedVals.back().append(Data.rbegin(), Data.rend()); + } + // Sort the reduced values by number of same/alternate opcode and/or pointer + // operand. + stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) { + return P1.size() > P2.size(); + }); return true; } /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + constexpr int ReductionLimit = 4; // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. 
- unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < 4) + unsigned NumReducedVals = std::accumulate( + ReducedVals.begin(), ReducedVals.end(), 0, + [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); }); + if (NumReducedVals < ReductionLimit) return nullptr; - // Intersect the fast-math-flags from all reduction operations. - FastMathFlags RdxFMF; - RdxFMF.set(); - for (ReductionOpsType &RdxOp : ReductionOps) { - for (Value *RdxVal : RdxOp) { - if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) - RdxFMF &= FPMO->getFastMathFlags(); - } - } - IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); - Builder.setFastMathFlags(RdxFMF); + // Track the reduced values in case if they are replaced by extractelement + // because of the vectorization. + DenseMap<Value *, WeakTrackingVH> TrackedVals; BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; // The same extra argument may be used several times, so log each attempt // to use it. for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); + TrackedVals.try_emplace(Pair.second, Pair.second); } // The compare instruction of a min/max is the insertion point for new // instructions and may be replaced with a new compare instruction. - auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) { assert(isa<SelectInst>(RdxRootInst) && "Expected min/max reduction to have select root instruction"); Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); @@ -9776,141 +9937,289 @@ public: // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; - SmallVector<Value *, 16> IgnoreList; - for (ReductionOpsType &RdxOp : ReductionOps) - IgnoreList.append(RdxOp.begin(), RdxOp.end()); - - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - if (NumReducedVals > ReduxWidth) { - // In the loop below, we are building a tree based on a window of - // 'ReduxWidth' values. - // If the operands of those values have common traits (compare predicate, - // constant operand, etc), then we want to group those together to - // minimize the cost of the reduction. - - // TODO: This should be extended to count common operands for - // compares and binops. - - // Step 1: Count the number of times each compare predicate occurs. - SmallDenseMap<unsigned, unsigned> PredCountMap; - for (Value *RdxVal : ReducedVals) { - CmpInst::Predicate Pred; - if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) - ++PredCountMap[Pred]; - } - // Step 2: Sort the values so the most common predicates come first. - stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { - CmpInst::Predicate PredA, PredB; - if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && - match(B, m_Cmp(PredB, m_Value(), m_Value()))) { - return PredCountMap[PredA] > PredCountMap[PredB]; - } - return false; - }); - } + SmallVector<Value *> IgnoreList; + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) { + if (!RdxOp) + continue; + IgnoreList.push_back(RdxOp); + } + bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot)); + + // Need to track reduced vals, they may be changed during vectorization of + // subvectors. 
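tryToReduce() now counts candidates across all the regrouped value lists and bails out below the ReductionLimit of four. A one-function sketch of that gate using std::accumulate, with plain ints standing in for the reduced values:

#include <cstddef>
#include <numeric>
#include <vector>

constexpr std::size_t ReductionLimit = 4; // mirrors the constant above

static bool worthReducing(const std::vector<std::vector<int>> &Groups) {
  // Sum the sizes of all candidate groups, as the std::accumulate call in
  // tryToReduce() does.
  std::size_t NumReducedVals = std::accumulate(
      Groups.begin(), Groups.end(), std::size_t(0),
      [](std::size_t N, const std::vector<int> &G) { return N + G.size(); });
  return NumReducedVals >= ReductionLimit;
}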
+ for (ArrayRef<Value *> Candidates : ReducedVals) + for (Value *V : Candidates) + TrackedVals.try_emplace(V, V); + DenseMap<Value *, unsigned> VectorizedVals; Value *VectorizedTree = nullptr; - unsigned i = 0; - while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { - ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, IgnoreList); - if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) - break; - if (V.isLoadCombineReductionCandidate(RdxKind)) - break; - V.reorderTopToBottom(); - V.reorderBottomToTop(/*IgnoreReorder=*/true); - V.buildExternalUses(ExternallyUsedValues); - - // For a poison-safe boolean logic reduction, do not replace select - // instructions with logic ops. All reduced values will be frozen (see - // below) to prevent leaking poison. - if (isa<SelectInst>(ReductionRoot) && - isBoolLogicOp(cast<Instruction>(ReductionRoot)) && - NumReducedVals != ReduxWidth) - break; + bool CheckForReusedReductionOps = false; + // Try to vectorize elements based on their type. + for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { + ArrayRef<Value *> OrigReducedVals = ReducedVals[I]; + InstructionsState S = getSameOpcode(OrigReducedVals); + SmallVector<Value *> Candidates; + DenseMap<Value *, Value *> TrackedToOrig; + for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) { + Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second; + // Check if the reduction value was not overriden by the extractelement + // instruction because of the vectorization and exclude it, if it is not + // compatible with other values. + if (auto *Inst = dyn_cast<Instruction>(RdxVal)) + if (isVectorLikeInstWithConstOps(Inst) && + (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) + continue; + Candidates.push_back(RdxVal); + TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); + } + bool ShuffledExtracts = false; + // Try to handle shuffled extractelements. + if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && + I + 1 < E) { + InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]); + if (NextS.getOpcode() == Instruction::ExtractElement && + !NextS.isAltShuffle()) { + SmallVector<Value *> CommonCandidates(Candidates); + for (Value *RV : ReducedVals[I + 1]) { + Value *RdxVal = TrackedVals.find(RV)->second; + // Check if the reduction value was not overriden by the + // extractelement instruction because of the vectorization and + // exclude it, if it is not compatible with other values. + if (auto *Inst = dyn_cast<Instruction>(RdxVal)) + if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst)) + continue; + CommonCandidates.push_back(RdxVal); + TrackedToOrig.try_emplace(RdxVal, RV); + } + SmallVector<int> Mask; + if (isFixedVectorShuffle(CommonCandidates, Mask)) { + ++I; + Candidates.swap(CommonCandidates); + ShuffledExtracts = true; + } + } + } + unsigned NumReducedVals = Candidates.size(); + if (NumReducedVals < ReductionLimit) + continue; - V.computeMinimumValueSizes(); + unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + unsigned Start = 0; + unsigned Pos = Start; + // Restarts vectorization attempt with lower vector factor. 
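The loop that follows tries a window of ReduxWidth candidates starting at Pos; on a failed attempt AdjustReducedVals() slides the window forward, and once it no longer fits it restarts from the front with half the width. The sketch below models only that failure path (every attempt rejected), with a local powerOf2Floor() standing in for llvm::PowerOf2Floor:

// Largest power of two not exceeding N (stand-in for llvm::PowerOf2Floor).
static unsigned powerOf2Floor(unsigned N) {
  unsigned P = 1;
  while (P * 2 <= N)
    P *= 2;
  return P;
}

// Count how many vectorization attempts are made when every window is
// rejected: slide the window by one, and halve the width when it no longer
// fits, until the width drops below the reduction limit.
static unsigned countFailedAttempts(unsigned NumReducedVals,
                                    unsigned ReductionLimit) {
  unsigned Attempts = 0;
  unsigned ReduxWidth = powerOf2Floor(NumReducedVals);
  unsigned Pos = 0;
  while (ReduxWidth >= ReductionLimit && Pos + ReduxWidth <= NumReducedVals) {
    ++Attempts; // one buildTree/cost evaluation per window
    ++Pos;      // AdjustReducedVals: slide the window forward
    if (Pos + ReduxWidth > NumReducedVals) {
      Pos = 0;         // window exhausted: restart from the beginning
      ReduxWidth /= 2; // with a smaller vector factor
    }
  }
  return Attempts;
}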
+ unsigned PrevReduxWidth = ReduxWidth; + bool CheckForReusedReductionOpsLocal = false; + auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals, + &CheckForReusedReductionOpsLocal, + &PrevReduxWidth, &V, + &IgnoreList](bool IgnoreVL = false) { + bool IsAnyRedOpGathered = + !IgnoreVL && any_of(IgnoreList, [&V](Value *RedOp) { + return V.isGathered(RedOp); + }); + if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) { + // Check if any of the reduction ops are gathered. If so, worth + // trying again with less number of reduction ops. + CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered; + } + ++Pos; + if (Pos < NumReducedVals - ReduxWidth + 1) + return IsAnyRedOpGathered; + Pos = Start; + ReduxWidth /= 2; + return IsAnyRedOpGathered; + }; + while (Pos < NumReducedVals - ReduxWidth + 1 && + ReduxWidth >= ReductionLimit) { + // Dependency in tree of the reduction ops - drop this attempt, try + // later. + if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth && + Start == 0) { + CheckForReusedReductionOps = true; + break; + } + PrevReduxWidth = ReduxWidth; + ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth); + // Beeing analyzed already - skip. + if (V.areAnalyzedReductionVals(VL)) { + (void)AdjustReducedVals(/*IgnoreVL=*/true); + continue; + } + // Early exit if any of the reduction values were deleted during + // previous vectorization attempts. + if (any_of(VL, [&V](Value *RedVal) { + auto *RedValI = dyn_cast<Instruction>(RedVal); + if (!RedValI) + return false; + return V.isDeleted(RedValI); + })) + break; + V.buildTree(VL, IgnoreList); + if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) { + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } + if (V.isLoadCombineReductionCandidate(RdxKind)) { + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } + V.reorderTopToBottom(); + // No need to reorder the root node at all. + V.reorderBottomToTop(/*IgnoreReorder=*/true); + // Keep extracted other reduction values, if they are used in the + // vectorization trees. + BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( + ExternallyUsedValues); + for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { + if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) + continue; + for_each(ReducedVals[Cnt], + [&LocalExternallyUsedValues, &TrackedVals](Value *V) { + if (isa<Instruction>(V)) + LocalExternallyUsedValues[TrackedVals[V]]; + }); + } + for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { + if (Cnt >= Pos && Cnt < Pos + ReduxWidth) + continue; + if (VectorizedVals.count(Candidates[Cnt])) + continue; + LocalExternallyUsedValues[Candidates[Cnt]]; + } + V.buildExternalUses(LocalExternallyUsedValues); + + V.computeMinimumValueSizes(); + + // Intersect the fast-math-flags from all reduction operations. + FastMathFlags RdxFMF; + RdxFMF.set(); + for (Value *U : IgnoreList) + if (auto *FPMO = dyn_cast<FPMathOperator>(U)) + RdxFMF &= FPMO->getFastMathFlags(); + // Estimate cost. 
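Right above, the fast-math flags for the emitted reduction are computed by starting from a fully-set FastMathFlags and intersecting in the flags of every reduction operation, so only flags common to the whole tree survive. A toy bitmask model of that intersection; ToyFMF is illustrative and not LLVM's FastMathFlags:

#include <cstdint>
#include <vector>

// Toy flag set; FMFAll plays the role of RdxFMF.set().
enum ToyFMF : std::uint8_t { FMFNone = 0x00, FMFAll = 0xFF };

static std::uint8_t intersectFMF(const std::vector<std::uint8_t> &OpFlags) {
  std::uint8_t Rdx = FMFAll; // start with every flag set
  for (std::uint8_t F : OpFlags)
    Rdx &= F;                // RdxFMF &= FPMO->getFastMathFlags()
  return Rdx;
}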
+ InstructionCost TreeCost = V.getTreeCost(VL); + InstructionCost ReductionCost = + getReductionCost(TTI, VL[0], ReduxWidth, RdxFMF); + InstructionCost Cost = TreeCost + ReductionCost; + if (!Cost.isValid()) { + LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); + return nullptr; + } + if (Cost >= -SLPCostThreshold) { + V.getORE()->emit([&]() { + return OptimizationRemarkMissed( + SV_NAME, "HorSLPNotBeneficial", + ReducedValsToOps.find(VL[0])->second.front()) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } - // Estimate cost. - InstructionCost TreeCost = - V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); - InstructionCost ReductionCost = - getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF); - InstructionCost Cost = TreeCost + ReductionCost; - if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); - return nullptr; - } - if (Cost >= -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" + << Cost << ". (HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", - cast<Instruction>(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " << ore::NV("Cost", Cost) - << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); + return OptimizationRemark( + SV_NAME, "VectorizedHorizontalReduction", + ReducedValsToOps.find(VL[0])->second.front()) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize()); }); - break; - } - LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" - << Cost << ". (HorRdx)\n"); - V.getORE()->emit([&]() { - return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", - cast<Instruction>(VL[0])) - << "Vectorized horizontal reduction with cost " - << ore::NV("Cost", Cost) << " and with tree size " - << ore::NV("TreeSize", V.getTreeSize()); - }); + Builder.setFastMathFlags(RdxFMF); - // Vectorize a tree. - DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); - Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); + // Vectorize a tree. + Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues); - // Emit a reduction. If the root is a select (min/max idiom), the insert - // point is the compare condition of that select. - Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); - if (isCmpSelMinMax(RdxRootInst)) - Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); - else - Builder.SetInsertPoint(RdxRootInst); + // Emit a reduction. If the root is a select (min/max idiom), the insert + // point is the compare condition of that select. + Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + if (IsCmpSelMinMax) + Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); - // To prevent poison from leaking across what used to be sequential, safe, - // scalar boolean logic operations, the reduction operand must be frozen. 
- if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst)) - VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); + // To prevent poison from leaking across what used to be sequential, + // safe, scalar boolean logic operations, the reduction operand must be + // frozen. + if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst)) + VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); - Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); + Value *ReducedSubTree = + emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); - if (!VectorizedTree) { - // Initialize the final value in the reduction. - VectorizedTree = ReducedSubTree; - } else { - // Update the final value in the reduction. - Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - ReducedSubTree, "op.rdx", ReductionOps); + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = ReducedSubTree; + } else { + // Update the final value in the reduction. + Builder.SetCurrentDebugLocation( + cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + ReducedSubTree, "op.rdx", ReductionOps); + } + // Count vectorized reduced values to exclude them from final reduction. + for (Value *V : VL) + ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0) + .first->getSecond(); + Pos += ReduxWidth; + Start = Pos; + ReduxWidth = PowerOf2Floor(NumReducedVals - Pos); } - i += ReduxWidth; - ReduxWidth = PowerOf2Floor(NumReducedVals - i); } - if (VectorizedTree) { // Finish the reduction. - for (; i < NumReducedVals; ++i) { - auto *I = cast<Instruction>(ReducedVals[i]); - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = - createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); + // Need to add extra arguments and not vectorized possible reduction + // values. + SmallPtrSet<Value *, 8> Visited; + for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { + ArrayRef<Value *> Candidates = ReducedVals[I]; + for (Value *RdxVal : Candidates) { + if (!Visited.insert(RdxVal).second) + continue; + Value *StableRdxVal = RdxVal; + auto TVIt = TrackedVals.find(RdxVal); + if (TVIt != TrackedVals.end()) + StableRdxVal = TVIt->second; + unsigned NumOps = 0; + auto It = VectorizedVals.find(RdxVal); + if (It != VectorizedVals.end()) + NumOps = It->second; + for (Instruction *RedOp : + makeArrayRef(ReducedValsToOps.find(RdxVal)->second) + .drop_back(NumOps)) { + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + ReductionOpsListType Ops; + if (auto *Sel = dyn_cast<SelectInst>(RedOp)) + Ops.emplace_back().push_back(Sel->getCondition()); + Ops.emplace_back().push_back(RedOp); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + StableRdxVal, "op.rdx", Ops); + } + } } for (auto &Pair : ExternallyUsedValues) { // Add each externally used value to the final reduction. for (auto *I : Pair.second) { Builder.SetCurrentDebugLocation(I->getDebugLoc()); + ReductionOpsListType Ops; + if (auto *Sel = dyn_cast<SelectInst>(I)) + Ops.emplace_back().push_back(Sel->getCondition()); + Ops.emplace_back().push_back(I); + Value *StableRdxVal = Pair.first; + auto TVIt = TrackedVals.find(Pair.first); + if (TVIt != TrackedVals.end()) + StableRdxVal = TVIt->second; VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - Pair.first, "op.extra", I); + StableRdxVal, "op.rdx", Ops); } } @@ -9922,20 +10231,30 @@ public: // deletion. 
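After each successful window the reduced subtree is folded into the running VectorizedTree with an "op.rdx" operation, and once all windows are done the leftover scalars (extra arguments and values that were never vectorized) are chained in the same way. A toy model of that final folding, with integer addition standing in for the actual reduction opcode:

#include <vector>

static int finishReduction(const std::vector<int> &ReducedSubTrees,
                           const std::vector<int> &LeftoverScalars) {
  bool HaveTree = false;
  int VectorizedTree = 0;
  auto Chain = [&](int V) {
    if (!HaveTree) {       // initialize the final value in the reduction
      VectorizedTree = V;
      HaveTree = true;
    } else {               // VectorizedTree = createOp(..., "op.rdx", ...)
      VectorizedTree += V; // '+' stands in for the reduction opcode
    }
  };
  for (int Part : ReducedSubTrees)   // results of the vectorized windows
    Chain(Part);
  for (int Scalar : LeftoverScalars) // extra args / not vectorized values
    Chain(Scalar);
  return VectorizedTree;
}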
#ifndef NDEBUG SmallSet<Value *, 4> IgnoreSet; - IgnoreSet.insert(IgnoreList.begin(), IgnoreList.end()); + for (ArrayRef<Value *> RdxOps : ReductionOps) + IgnoreSet.insert(RdxOps.begin(), RdxOps.end()); #endif - for (auto *Ignore : IgnoreList) { + for (ArrayRef<Value *> RdxOps : ReductionOps) { + for (Value *Ignore : RdxOps) { + if (!Ignore) + continue; #ifndef NDEBUG - for (auto *U : Ignore->users()) { - assert(IgnoreSet.count(U)); - } + for (auto *U : Ignore->users()) { + assert(IgnoreSet.count(U) && + "All users must be either in the reduction ops list."); + } #endif - if (!Ignore->use_empty()) { - Value *Undef = UndefValue::get(Ignore->getType()); - Ignore->replaceAllUsesWith(Undef); + if (!Ignore->use_empty()) { + Value *Undef = UndefValue::get(Ignore->getType()); + Ignore->replaceAllUsesWith(Undef); + } + V.eraseInstruction(cast<Instruction>(Ignore)); } - V.eraseInstruction(cast<Instruction>(Ignore)); } + } else if (!CheckForReusedReductionOps) { + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) + V.analyzedReductionRoot(cast<Instruction>(RdxOp)); } return VectorizedTree; } @@ -10201,7 +10520,8 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { /// performed. static bool tryToVectorizeHorReductionOrInstOperands( PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, + TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL, + const TargetLibraryInfo &TLI, const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) { if (!ShouldVectorizeHor) return false; @@ -10220,7 +10540,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. - // Skip the analysis of CmpInsts.Compiler implements postanalysis of the + // Skip the analysis of CmpInsts. Compiler implements postanalysis of the // CmpInsts so we can skip extra attempts in // tryToVectorizeHorReductionOrInstOperands and save compile time. std::queue<std::pair<Instruction *, unsigned>> Stack; @@ -10228,13 +10548,16 @@ static bool tryToVectorizeHorReductionOrInstOperands( SmallPtrSet<Value *, 8> VisitedInstrs; SmallVector<WeakTrackingVH> PostponedInsts; bool Res = false; - auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0, - Value *&B1) -> Value * { + auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst, + Value *&B0, + Value *&B1) -> Value * { + if (R.isAnalizedReductionRoot(Inst)) + return nullptr; bool IsBinop = matchRdxBop(Inst, B0, B1); bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); if (IsBinop || IsSelect) { HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, Inst)) + if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI)) return HorRdx.tryToReduce(R, TTI); } return nullptr; @@ -10279,7 +10602,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( // Do not try to vectorize CmpInst operands, this is done separately. // Final attempt for binop args vectorization should happen after the loop // to try to find reductions. - if (!isa<CmpInst>(Inst)) + if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst)) PostponedInsts.push_back(Inst); } @@ -10292,8 +10615,8 @@ static bool tryToVectorizeHorReductionOrInstOperands( if (auto *I = dyn_cast<Instruction>(Op)) // Do not try to vectorize CmpInst operands, this is done // separately. 
- if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) && - I->getParent() == BB) + if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) && + !R.isDeleted(I) && I->getParent() == BB) Stack.emplace(I, Level); } // Try to vectorized binops where reductions were not found. @@ -10317,8 +10640,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { return tryToVectorize(I, R); }; - return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, - ExtraVectorization); + return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL, + *TLI, ExtraVectorization); } bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, @@ -10486,12 +10809,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( for (auto *I : reverse(Instructions)) { if (R.isDeleted(I)) continue; - if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) + if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) { OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) + } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) { OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); - else if (isa<CmpInst>(I)) + } else if (isa<CmpInst>(I)) { PostponedCmps.push_back(I); + continue; + } + // Try to find reductions in buildvector sequnces. + OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI); } if (AtTerminator) { // Try to find reductions first. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 02550dad..21bd231 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1420,6 +1420,9 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, getCondOp()->printAsOperand(O, SlotTracker); } O << ")"; + if (RdxDesc->IntermediateStore) + O << " (with final reduction value stored in invariant address sank " + "outside of loop)"; } void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 05fc8c6..f26babe 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -257,12 +257,12 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { ExtractElementInst *VectorCombine::getShuffleExtract( ExtractElementInst *Ext0, ExtractElementInst *Ext1, unsigned PreferredExtractIndex = InvalidIndex) const { - assert(isa<ConstantInt>(Ext0->getIndexOperand()) && - isa<ConstantInt>(Ext1->getIndexOperand()) && - "Expected constant extract indexes"); + auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand()); + auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand()); + assert(Index0C && Index1C && "Expected constant extract indexes"); - unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue(); - unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue(); + unsigned Index0 = Index0C->getZExtValue(); + unsigned Index1 = Index1C->getZExtValue(); // If the extract indexes are identical, no shuffle is needed. 
if (Index0 == Index1) @@ -308,9 +308,10 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, const Instruction &I, ExtractElementInst *&ConvertToShuffle, unsigned PreferredExtractIndex) { - assert(isa<ConstantInt>(Ext0->getOperand(1)) && - isa<ConstantInt>(Ext1->getOperand(1)) && - "Expected constant extract indexes"); + auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1)); + auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1)); + assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes"); + unsigned Opcode = I.getOpcode(); Type *ScalarTy = Ext0->getType(); auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType()); @@ -333,8 +334,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // Get cost estimates for the extract elements. These costs will factor into // both sequences. - unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue(); - unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue(); + unsigned Ext0Index = Ext0IndexC->getZExtValue(); + unsigned Ext1Index = Ext1IndexC->getZExtValue(); InstructionCost Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index); |
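The two VectorCombine hunks above replace the pattern "assert(isa<ConstantInt>(...)) followed by a later cast<ConstantInt>" with a single dyn_cast whose result is asserted and then reused, so each index operand is inspected once and the value is still available in release builds where the assert compiles away. A small standalone sketch of that idiom, using a hypothetical getConstantExtractIndex helper, is shown below.

    #include "llvm/IR/Instructions.h"
    #include <cassert>

    using namespace llvm;

    // Hypothetical helper showing the dyn_cast-then-assert idiom: the cast
    // result is computed once, checked when asserts are enabled, and reused.
    static unsigned getConstantExtractIndex(ExtractElementInst *Ext) {
      auto *IdxC = dyn_cast<ConstantInt>(Ext->getIndexOperand());
      // Callers are expected to pass extracts with constant indexes, as in
      // the code above; the assert documents and checks that contract.
      assert(IdxC && "Expected a constant extract index");
      return IdxC->getZExtValue();
    }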