Diffstat (limited to 'llvm')
-rw-r--r--  llvm/include/llvm/CodeGen/BasicTTIImpl.h | 11
-rw-r--r--  llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h | 4
-rw-r--r--  llvm/include/llvm/CodeGen/TargetLowering.h | 29
-rw-r--r--  llvm/include/llvm/IR/AbstractCallSite.h | 10
-rw-r--r--  llvm/include/llvm/IR/IRBuilder.h | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 13
-rw-r--r--  llvm/lib/CodeGen/SwitchLoweringUtils.cpp | 22
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp | 21
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 25
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 29
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrFuture.td | 20
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 57
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 19
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 2
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 63
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 18
-rw-r--r--  llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll | 94
-rw-r--r--  llvm/test/CodeGen/PowerPC/bittest.ll | 193
-rw-r--r--  llvm/test/CodeGen/X86/bfloat-calling-conv.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/trunc-srl-load.ll | 1652
-rw-r--r--  llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll | 50
-rw-r--r--  llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll | 53
-rw-r--r--  llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt | 3
-rw-r--r--  llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt | 3
-rw-r--r--  llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s | 4
-rw-r--r--  llvm/test/Transforms/InstCombine/assume.ll | 28
-rw-r--r--  llvm/test/Transforms/InstCombine/ptrtoaddr.ll | 72
-rw-r--r--  llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll | 418
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction-inloop.ll | 82
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll | 542
-rw-r--r--  llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll | 54
-rw-r--r--  llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected | 106
-rw-r--r--  llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test | 3
-rw-r--r--  llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s | 6
-rw-r--r--  llvm/unittests/IR/AbstractCallSiteTest.cpp | 94
-rw-r--r--  llvm/utils/UpdateTestChecks/common.py | 2
-rw-r--r--  llvm/utils/lit/lit/TestRunner.py | 2
-rw-r--r--  llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg | 8
-rw-r--r--  llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt | 8
-rwxr-xr-x  llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh | 4
-rw-r--r--  llvm/utils/lit/tests/shtest-env-path.py | 13
-rw-r--r--  llvm/utils/profcheck-xfail.txt | 1
-rwxr-xr-x  llvm/utils/update_test_checks.py | 12
46 files changed, 2116 insertions(+), 1755 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 76b6c8e..e8dbc96 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -594,12 +594,13 @@ public:
// Check if suitable for a bit test
if (N <= DL.getIndexSizeInBits(0u)) {
- SmallPtrSet<const BasicBlock *, 4> Dests;
- for (auto I : SI.cases())
- Dests.insert(I.getCaseSuccessor());
+ DenseMap<const BasicBlock *, unsigned int> DestMap;
+ for (auto I : SI.cases()) {
+ const BasicBlock *BB = I.getCaseSuccessor();
+ ++DestMap[BB];
+ }
- if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
- DL))
+ if (TLI->isSuitableForBitTests(DestMap, MinCaseVal, MaxCaseVal, DL))
return 1;
}
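The DenseMap built above carries how many case values branch to each successor; a minimal sketch of the idea, using illustrative case values that are not from the patch:

  // Sketch: per-destination case counts feed the bit-test heuristic.
  // With cases {4->A, 7->B, 10->B, 8->C, 11->C}, DestMap ends up as
  // {A:1, B:2, C:2}. DestMap.size() recovers the old Dests.size(), and
  // the largest entry (here 2) is what the new minimum-comparison check
  // in isSuitableForBitTests gates on.
  DenseMap<const BasicBlock *, unsigned> DestMap;
  for (const auto &Case : SI.cases())
    ++DestMap[Case.getCaseSuccessor()];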
diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index a9e53ba..f980d3d 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -84,6 +84,10 @@ LLVM_ABI Libcall getSINCOS(EVT RetVT);
/// UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getSINCOSPI(EVT RetVT);
+/// Return the SINCOS_STRET_* value for the given types, or UNKNOWN_LIBCALL if
+/// there is none.
+LLVM_ABI Libcall getSINCOS_STRET(EVT RetVT);
+
/// getMODF - Return the MODF_* value for the given types, or
/// UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getMODF(EVT RetVT);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d6ed3a8..4058dd7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1433,9 +1433,9 @@ public:
/// \p High as its lowest and highest case values, and expects \p NumCmps
/// case value comparisons. Check if the number of destinations, comparison
/// metric, and range are all suitable.
- bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
- const APInt &Low, const APInt &High,
- const DataLayout &DL) const {
+ bool isSuitableForBitTests(
+ const DenseMap<const BasicBlock *, unsigned int> &DestCmps,
+ const APInt &Low, const APInt &High, const DataLayout &DL) const {
// FIXME: I don't think NumCmps is the correct metric: a single case and a
// range of cases both require only one branch to lower. Just looking at the
// number of clusters and destinations should be enough to decide whether to
@@ -1446,6 +1446,20 @@ public:
if (!rangeFitsInWord(Low, High, DL))
return false;
+ unsigned NumDests = DestCmps.size();
+ unsigned NumCmps = 0;
+ unsigned int MaxBitTestEntry = 0;
+ for (auto &DestCmp : DestCmps) {
+ NumCmps += DestCmp.second;
+ if (DestCmp.second > MaxBitTestEntry)
+ MaxBitTestEntry = DestCmp.second;
+ }
+
+ // Plain compare-and-branch might be cheaper when even the densest
+ // destination has only a few comparisons; the threshold is target specific.
+ if (MaxBitTestEntry < getMinimumBitTestCmps())
+ return false;
+
// Decide whether it's profitable to lower this range with bit tests. Each
// destination requires a bit test and branch, and there is an overall range
// check branch. For a small number of clusters, separate comparisons might
@@ -2055,6 +2069,9 @@ public:
virtual bool isJumpTableRelative() const;
+ /// Return the minimum that the largest per-destination comparison count must reach to use bit tests.
+ unsigned getMinimumBitTestCmps() const;
+
/// If a physical register, this specifies the register that
/// llvm.savestack/llvm.restorestack should save and restore.
Register getStackPointerRegisterToSaveRestore() const {
@@ -2577,6 +2594,9 @@ protected:
/// Set to zero to generate unlimited jump tables.
void setMaximumJumpTableSize(unsigned);
+ /// Set the minimum of the largest per-destination comparison count required to generate bit tests.
+ void setMinimumBitTestCmps(unsigned Val);
+
/// If set to a physical register, this specifies the register that
/// llvm.savestack/llvm.restorestack should save and restore.
void setStackPointerRegisterToSaveRestore(Register R) {
@@ -3719,6 +3739,9 @@ private:
/// backend supports.
unsigned MinCmpXchgSizeInBits;
+ /// The minimum of the largest per-destination comparison count required to lower a switch with bit tests.
+ unsigned MinimumBitTestCmps;
+
/// This indicates if the target supports unaligned atomic operations.
bool SupportsUnalignedAtomics;
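A worked example of the new gate, taken from the PowerPC bittest.ll test added below (PPC sets the minimum to 3 via ppc-min-bit-test-cmps):

  // @foo: cases {7,10}->sw.bb, {8,11}->sw.bb1, {9,12}->sw.bb2, {4}->sw.bb3
  //   DestCmps = {sw.bb:2, sw.bb1:2, sw.bb2:2, sw.bb3:1}; MaxBitTestEntry = 2
  //   2 < 3, so the cluster stays compare-and-branch.
  // @goo: cases {7}->sw.bb, {8,11}->sw.bb1, {9,10,12}->sw.bb2, {4}->sw.bb3
  //   DestCmps = {sw.bb:1, sw.bb1:2, sw.bb2:3, sw.bb3:1}; MaxBitTestEntry = 3
  //   3 >= 3, so bit tests are generated (the slw/andi. sequence in @goo).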
diff --git a/llvm/include/llvm/IR/AbstractCallSite.h b/llvm/include/llvm/IR/AbstractCallSite.h
index 9e24ae7..f431e1d 100644
--- a/llvm/include/llvm/IR/AbstractCallSite.h
+++ b/llvm/include/llvm/IR/AbstractCallSite.h
@@ -137,7 +137,7 @@ public:
/// Return true if @p U is the use that defines the callee of this ACS.
bool isCallee(const Use *U) const {
- if (isDirectCall())
+ if (!isCallbackCall())
return CB->isCallee(U);
assert(!CI.ParameterEncoding.empty() &&
@@ -154,7 +154,7 @@ public:
/// Return the number of parameters of the callee.
unsigned getNumArgOperands() const {
- if (isDirectCall())
+ if (!isCallbackCall())
return CB->arg_size();
// Subtract 1 for the callee encoding.
return CI.ParameterEncoding.size() - 1;
@@ -169,7 +169,7 @@ public:
/// Return the operand index of the underlying instruction associated with
/// the function parameter number @p ArgNo or -1 if there is none.
int getCallArgOperandNo(unsigned ArgNo) const {
- if (isDirectCall())
+ if (!isCallbackCall())
return ArgNo;
// Add 1 for the callee encoding.
return CI.ParameterEncoding[ArgNo + 1];
@@ -183,7 +183,7 @@ public:
/// Return the operand of the underlying instruction associated with the
/// function parameter number @p ArgNo or nullptr if there is none.
Value *getCallArgOperand(unsigned ArgNo) const {
- if (isDirectCall())
+ if (!isCallbackCall())
return CB->getArgOperand(ArgNo);
// Add 1 for the callee encoding.
return CI.ParameterEncoding[ArgNo + 1] >= 0
@@ -210,7 +210,7 @@ public:
/// Return the pointer to function that is being called.
Value *getCalledOperand() const {
- if (isDirectCall())
+ if (!isCallbackCall())
return CB->getCalledOperand();
return CB->getArgOperand(getCallArgOperandNoForCallee());
}
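The guard flip matters for indirect call sites, which are neither direct nor callback calls; a sketch of the three shapes an AbstractCallSite can wrap (the new AbstractCallSiteTest.cpp in the diffstat exercises this):

  // direct:   call void @f()   -> isDirectCall(),  !isCallbackCall()
  // indirect: call void %fp()  -> !isDirectCall(), !isCallbackCall()
  // callback: broker call with !callback metadata -> isCallbackCall()
  //
  // With the old isDirectCall() guard, an indirect call fell through to the
  // callback path and tripped the ParameterEncoding assertion above; guarding
  // on !isCallbackCall() routes direct and indirect calls alike to the plain
  // CallBase accessors.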
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index dacda0a..972a253 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2191,7 +2191,7 @@ public:
FMFSource);
}
Value *CreatePtrToAddr(Value *V, const Twine &Name = "") {
- return CreateCast(Instruction::PtrToInt, V,
+ return CreateCast(Instruction::PtrToAddr, V,
BB->getDataLayout().getAddressType(V->getType()), Name);
}
Value *CreatePtrToInt(Value *V, Type *DestTy,
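A before/after sketch of the fix, assuming a pointer %p whose address type is i64:

  ; CreatePtrToAddr previously emitted a ptrtoint:
  ;   %a = ptrtoint ptr %p to i64
  ; with the fix it emits the opcode its name promises:
  %a = ptrtoaddr ptr %p to i64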
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5fb7e63..431a810 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2400,10 +2400,11 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
Results.push_back(Rem);
}
-/// Return true if sincos libcall is available.
+/// Return true if the sincos or __sincos_stret libcall is available.
static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
- RTLIB::Libcall LC = RTLIB::getSINCOS(Node->getSimpleValueType(0).SimpleTy);
- return TLI.getLibcallName(LC) != nullptr;
+ MVT::SimpleValueType VT = Node->getSimpleValueType(0).SimpleTy;
+ return TLI.getLibcallImpl(RTLIB::getSINCOS(VT)) != RTLIB::Unsupported ||
+ TLI.getLibcallImpl(RTLIB::getSINCOS_STRET(VT)) != RTLIB::Unsupported;
}
/// Only issue sincos libcall if both sin and cos are needed.
@@ -3752,9 +3753,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
EVT VT = Node->getValueType(0);
// Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin /
// fcos which share the same operand and both are used.
- if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) ||
- isSinCosLibcallAvailable(Node, TLI))
- && useSinCos(Node)) {
+ if ((TLI.isOperationLegal(ISD::FSINCOS, VT) ||
+ isSinCosLibcallAvailable(Node, TLI)) &&
+ useSinCos(Node)) {
SDVTList VTs = DAG.getVTList(VT, VT);
Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0));
if (Node->getOpcode() == ISD::FCOS)
diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
index 038c499..3fa8243 100644
--- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
+++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
@@ -198,7 +198,6 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
assert(First <= Last);
auto Prob = BranchProbability::getZero();
- unsigned NumCmps = 0;
std::vector<MachineBasicBlock*> Table;
DenseMap<MachineBasicBlock*, BranchProbability> JTProbs;
@@ -206,12 +205,16 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
for (unsigned I = First; I <= Last; ++I)
JTProbs[Clusters[I].MBB] = BranchProbability::getZero();
+ DenseMap<const BasicBlock *, unsigned int> DestMap;
for (unsigned I = First; I <= Last; ++I) {
assert(Clusters[I].Kind == CC_Range);
Prob += Clusters[I].Prob;
const APInt &Low = Clusters[I].Low->getValue();
const APInt &High = Clusters[I].High->getValue();
- NumCmps += (Low == High) ? 1 : 2;
+ unsigned int NumCmp = (Low == High) ? 1 : 2;
+ const BasicBlock *BB = Clusters[I].MBB->getBasicBlock();
+ DestMap[BB] += NumCmp;
+
if (I != First) {
// Fill the gap between this and the previous cluster.
const APInt &PreviousHigh = Clusters[I - 1].High->getValue();
@@ -226,9 +229,7 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
JTProbs[Clusters[I].MBB] += Clusters[I].Prob;
}
- unsigned NumDests = JTProbs.size();
- if (TLI->isSuitableForBitTests(NumDests, NumCmps,
- Clusters[First].Low->getValue(),
+ if (TLI->isSuitableForBitTests(DestMap, Clusters[First].Low->getValue(),
Clusters[Last].High->getValue(), *DL)) {
// Clusters[First..Last] should be lowered as bit tests instead.
return false;
@@ -372,20 +373,19 @@ bool SwitchCG::SwitchLowering::buildBitTests(CaseClusterVector &Clusters,
if (First == Last)
return false;
- BitVector Dests(FuncInfo.MF->getNumBlockIDs());
- unsigned NumCmps = 0;
+ DenseMap<const BasicBlock *, unsigned int> DestMap;
for (int64_t I = First; I <= Last; ++I) {
assert(Clusters[I].Kind == CC_Range);
- Dests.set(Clusters[I].MBB->getNumber());
- NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
+ unsigned NumCmp = (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
+ const BasicBlock *BB = Clusters[I].MBB->getBasicBlock();
+ DestMap[BB] += NumCmp;
}
- unsigned NumDests = Dests.count();
APInt Low = Clusters[First].Low->getValue();
APInt High = Clusters[Last].High->getValue();
assert(Low.slt(High));
- if (!TLI->isSuitableForBitTests(NumDests, NumCmps, Low, High, *DL))
+ if (!TLI->isSuitableForBitTests(DestMap, Low, High, *DL))
return false;
APInt LowBound;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 59798b3..b3535eac 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -90,6 +91,11 @@ static cl::opt<unsigned> OptsizeJumpTableDensity(
cl::desc("Minimum density for building a jump table in "
"an optsize function"));
+static cl::opt<unsigned> MinimumBitTestCmpsOverride(
+ "min-bit-test-cmps", cl::init(2), cl::Hidden,
+ cl::desc("Set minimum of largest number of comparisons "
+ "to use bit test for switch."));
+
// FIXME: This option is only to test if the strict fp operation processed
// correctly by preventing mutating strict fp operation to normal fp operation
// during development. When the backend supports strict float operation, this
@@ -428,6 +434,11 @@ RTLIB::Libcall RTLIB::getSINCOSPI(EVT RetVT) {
SINCOSPI_F128, SINCOSPI_PPCF128);
}
+RTLIB::Libcall RTLIB::getSINCOS_STRET(EVT RetVT) {
+ return getFPLibCall(RetVT, SINCOS_STRET_F32, SINCOS_STRET_F64,
+ UNKNOWN_LIBCALL, UNKNOWN_LIBCALL, UNKNOWN_LIBCALL);
+}
+
RTLIB::Libcall RTLIB::getMODF(EVT RetVT) {
return getFPLibCall(RetVT, MODF_F32, MODF_F64, MODF_F80, MODF_F128,
MODF_PPCF128);
@@ -719,6 +730,8 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
MinCmpXchgSizeInBits = 0;
SupportsUnalignedAtomics = false;
+
+ MinimumBitTestCmps = MinimumBitTestCmpsOverride;
}
// Define the virtual destructor out-of-line to act as a key method to anchor
@@ -2129,6 +2142,14 @@ bool TargetLoweringBase::isJumpTableRelative() const {
return getTargetMachine().isPositionIndependent();
}
+unsigned TargetLoweringBase::getMinimumBitTestCmps() const {
+ return MinimumBitTestCmps;
+}
+
+void TargetLoweringBase::setMinimumBitTestCmps(unsigned Val) {
+ MinimumBitTestCmps = Val;
+}
+
Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const {
if (TM.Options.LoopAlignment)
return Align(TM.Options.LoopAlignment);
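The new RTLIB helper maps only the two types that have __sincos_stret implementations; a brief usage sketch:

  // f32 and f64 map to the __sincos_stret libcalls; the remaining slots
  // (f80, f128, ppcf128) deliberately yield UNKNOWN_LIBCALL.
  RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(MVT::f64);   // SINCOS_STRET_F64
  RTLIB::Libcall NoLC = RTLIB::getSINCOS_STRET(MVT::f80); // UNKNOWN_LIBCALL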
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d16b116..60aa61e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9028,11 +9028,12 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
CallingConv::ID CallerCC = CallerF.getCallingConv();
// SME Streaming functions are not eligible for TCO as they may require
- // the streaming mode or ZA to be restored after returning from the call.
+ // the streaming mode or ZA/ZT0 to be restored after returning from the call.
SMECallAttrs CallAttrs =
getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
CallAttrs.requiresPreservingAllZAState() ||
+ CallAttrs.requiresPreservingZT0() ||
CallAttrs.caller().hasStreamingBody())
return false;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 313ae3d..fdba454 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1298,12 +1298,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
}
- // Use __sincos_stret if available.
- if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
- getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
- }
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
// FP-ARMv8 implements a lot of rounding-like FP operations.
if (Subtarget->hasFPARMv8Base()) {
@@ -9835,13 +9831,18 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
}
SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
- assert(Subtarget->isTargetDarwin());
-
// For iOS, we want to call an alternative entry point: __sincos_stret,
// return values are passed via sret.
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
+ RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT);
+ RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC);
+ if (SincosStret == RTLIB::Unsupported)
+ return SDValue();
+
+ assert(Subtarget->isTargetDarwin());
+
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
auto PtrVT = getPointerTy(DAG.getDataLayout());
@@ -9871,11 +9872,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
Args.emplace_back(Arg, ArgTy);
- RTLIB::Libcall LC =
- (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
- const char *LibcallName = getLibcallName(LC);
- CallingConv::ID CC = getLibcallCallingConv(LC);
- SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
+ StringRef LibcallName = getLibcallImplName(SincosStret);
+ CallingConv::ID CC = getLibcallImplCallingConv(SincosStret);
+ SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL));
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 54c8972..0573f64 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1061,8 +1061,11 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
SDValue W0 = isUndef(PredV)
? DAG.getUNDEF(MVT::i64)
: DAG.getNode(HexagonISD::P2D, dl, MVT::i64, PredV);
- Words[IdxW].push_back(HiHalf(W0, DAG));
- Words[IdxW].push_back(LoHalf(W0, DAG));
+ if (Bytes < BitBytes) {
+ Words[IdxW].push_back(HiHalf(W0, DAG));
+ Words[IdxW].push_back(LoHalf(W0, DAG));
+ } else {
+ Words[IdxW].push_back(W0);
+ }
while (Bytes < BitBytes) {
IdxW ^= 1;
@@ -1083,7 +1086,26 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
Bytes *= 2;
}
+ while (Bytes > BitBytes) {
+ IdxW ^= 1;
+ Words[IdxW].clear();
+
+ if (Bytes <= 4) {
+ for (const SDValue &W : Words[IdxW ^ 1]) {
+ SDValue T = contractPredicate(W, dl, DAG);
+ Words[IdxW].push_back(T);
+ }
+ } else {
+ for (const SDValue &W : Words[IdxW ^ 1]) {
+ Words[IdxW].push_back(W);
+ }
+ }
+ Bytes /= 2;
+ }
+
assert(Bytes == BitBytes);
+ if (BitBytes == 1 && PredTy == MVT::v2i1)
+ ByteTy = MVT::getVectorVT(MVT::i16, HwLen);
SDValue Vec = ZeroFill ? getZero(dl, ByteTy, DAG) : DAG.getUNDEF(ByteTy);
SDValue S4 = DAG.getConstant(HwLen-4, dl, MVT::i32);
@@ -3157,6 +3179,9 @@ SDValue
HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const {
auto *MemN = cast<MemSDNode>(Op.getNode());
+ if (!MemN->getMemoryVT().isSimple())
+ return Op;
+
MVT MemTy = MemN->getMemoryVT().getSimpleVT();
if (!isHvxPairTy(MemTy))
return Op;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 17f04d0..20fc849 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -138,6 +138,11 @@ static cl::opt<unsigned> PPCMinimumJumpTableEntries(
"ppc-min-jump-table-entries", cl::init(64), cl::Hidden,
cl::desc("Set minimum number of entries to use a jump table on PPC"));
+static cl::opt<unsigned> PPCMinimumBitTestCmps(
+ "ppc-min-bit-test-cmps", cl::init(3), cl::Hidden,
+ cl::desc("Set minimum of largest number of comparisons to use bit test for "
+ "switch on PPC."));
+
static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(
"ppc-gather-alias-max-depth", cl::init(18), cl::Hidden,
cl::desc("max depth when checking alias info in GatherAllAliases()"));
@@ -1436,6 +1441,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// Re-evaluate this value on future HWs that can do better with mtctr.
setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
+ // The default minimum of the largest per-destination comparison count for a bit-test cluster on PPC is 3.
+ setMinimumBitTestCmps(PPCMinimumBitTestCmps);
+
setMinFunctionAlignment(Align(4));
setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index b0bed71c..da3efdc 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -194,6 +194,22 @@ class XX3Form_XTAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = XT{5};
}
+class XForm_RBS5<bits<6> opCode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opCode, OOL, IOL, asmstr, itin> {
+
+ bits<5> RB;
+ bits<5> RS;
+
+ let Pattern = pattern;
+
+ let Inst{6...10} = RS;
+ let Inst{11...15} = 0;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
+ let Inst{31} = 0;
+}
+
class XX3Form_XTAB6_S<bits<5> xo, dag OOL, dag IOL, string asmstr,
list<dag> pattern>
: I<59, OOL, IOL, asmstr, NoItinerary> {
@@ -317,12 +333,16 @@ let Predicates = [IsISAFuture] in {
def TLBIEIO
: XForm_RSB5_UIMM2<31, 18, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC),
"tlbieio $RB, $RS, $RIC", []>;
+ def MTLPL : XForm_RBS5<31, 275, (outs), (ins gprc:$RB, gprc:$RS),
+ "mtlpl $RB, $RS", IIC_SprMTSPR, []>;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
def TLBIEP8
: XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs),
(ins g8rc:$RB, g8rc:$RS, u2imm:$RIC,
u1imm:$PRS, u1imm:$R),
"tlbiep $RB, $RS, $RIC, $PRS, $R", []>;
+ def MTLPL8 : XForm_RBS5<31, 275, (outs), (ins g8rc:$RB, g8rc:$RS),
+ "mtlpl $RB, $RS", IIC_SprMTSPR, []>, isPPC64;
}
}
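A hand-derived encoding example following the XForm_RBS5 layout above (the authoritative byte patterns live in the ISAFuture MC tests updated by this change):

  // mtlpl r3, r4          ; RB = 3, RS = 4
  //   bits 0-5   opcode = 31  -> 011111
  //   bits 6-10  RS     = 4   -> 00100
  //   bits 11-15        = 0   -> 00000
  //   bits 16-20 RB     = 3   -> 00011
  //   bits 21-30 xo     = 275 -> 0100010011
  //   bit  31           = 0
  // giving the 32-bit word 0x7C801A26.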
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 410f20e..b86020a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2572,11 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Combine sin / cos into _sincos_stret if it is available.
- if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
- getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
- }
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
@@ -33067,26 +33064,30 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Arg = Op.getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ bool isF64 = ArgVT == MVT::f64;
+
+ RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
+ const char *LibcallName = TLI.getLibcallName(LC);
+ if (!LibcallName)
+ return SDValue();
+
assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
// which returns the values as { float, float } (in XMM0) or
// { double, double } (which is returned in XMM0, XMM1).
SDLoc dl(Op);
- SDValue Arg = Op.getOperand(0);
- EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
Args.emplace_back(Arg, ArgTy);
- bool isF64 = ArgVT == MVT::f64;
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
- const char *LibcallName = TLI.getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
@@ -54634,6 +54635,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
SDLoc DL(N);
// Attempt to pre-truncate inputs to arithmetic ops instead.
@@ -54652,6 +54654,39 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
return V;
+ // Fold trunc(srl(load(p),amt)) -> load(p+amt/8)
+ // If we're shifting down byte aligned bit chunks from a larger load for
+ // truncation, see if we can convert the shift into a pointer offset instead.
+ // Limit this to normal (non-ext) scalar integer loads.
+ if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
+ ISD::isNormalLoad(Src.getOperand(0).getNode())) {
+ auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
+ if (Ld->isSimple() && VT.isByteSized() &&
+ isPowerOf2_64(VT.getSizeInBits())) {
+ SDValue ShAmt = Src.getOperand(1);
+ KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+ // Check that the shift amount is byte aligned and that the truncation
+ // doesn't use any shifted-in (zero) top bits.
+ if (KnownAmt.countMinTrailingZeros() >= 3 &&
+ KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() -
+ VT.getSizeInBits())) {
+ EVT PtrVT = Ld->getBasePtr().getValueType();
+ SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT);
+ SDValue PtrByteOfs =
+ DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
+ DAG.getShiftAmountConstant(3, PtrVT, DL));
+ SDValue NewPtr = DAG.getMemBasePlusOffset(
+ Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
+ SDValue NewLoad =
+ DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1),
+ NewLoad.getValue(1));
+ return NewLoad;
+ }
+ }
+ }
+
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
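The combine itself runs on SelectionDAG nodes; in IR terms it performs rewrites of the following shape (a hand-written sketch, with exhaustive coverage in the regenerated trunc-srl-load.ll below). The shift amount only needs to be provably byte aligned, not constant:

  ; before: wide load, byte-aligned shift, truncate
  %ld = load i64, ptr %p
  %sh = lshr i64 %ld, 16
  %tr = trunc i64 %sh to i16
  ; after (conceptually): narrow load at %p + 16/8 bytes
  %gep = getelementptr i8, ptr %p, i64 2
  %tr2 = load i16, ptr %gep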
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8d9933b..92fca90 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3496,7 +3496,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (isPowerOf2_64(AlignMask + 1)) {
uint64_t Offset = 0;
match(A, m_Add(m_Value(A), m_ConstantInt(Offset)));
- if (match(A, m_PtrToInt(m_Value(A)))) {
+ if (match(A, m_PtrToIntOrAddr(m_Value(A)))) {
/// Note: this doesn't preserve the offset information but merges
/// offset and alignment.
/// TODO: we can generate a GEP instead of merging the alignment with
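The widened matcher lets the alignment-assume fold also fire when the address was taken with ptrtoaddr; a sketch of the shape now recognized (the updated assume.ll in the diffstat carries the committed coverage):

  ; an alignment check computed on the address, now matched through
  ; ptrtoaddr as well as ptrtoint:
  %a   = ptrtoaddr ptr %p to i64
  %low = and i64 %a, 31
  %ok  = icmp eq i64 %low, 0
  call void @llvm.assume(i1 %ok)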
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index f939e7a..614c6eb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2148,7 +2148,7 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
return nullptr;
}
-Value *InstCombinerImpl::foldPtrToIntOfGEP(Type *IntTy, Value *Ptr) {
+Value *InstCombinerImpl::foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr) {
// Look through chain of one-use GEPs.
Type *PtrTy = Ptr->getType();
SmallVector<GEPOperator *> GEPs;
@@ -2210,7 +2210,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
Mask->getType() == Ty)
return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask);
- if (Value *V = foldPtrToIntOfGEP(Ty, SrcOp))
+ if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
return replaceInstUsesWith(CI, V);
Value *Vec, *Scalar, *Index;
@@ -2228,6 +2228,21 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
}
Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) {
+ Value *SrcOp = CI.getPointerOperand();
+ Type *Ty = CI.getType();
+
+ // (ptrtoaddr (ptrmask P, M))
+ // -> (and (ptrtoaddr P), M)
+ // This is generally beneficial as `and` is better supported than `ptrmask`.
+ Value *Ptr, *Mask;
+ if (match(SrcOp, m_OneUse(m_Intrinsic<Intrinsic::ptrmask>(m_Value(Ptr),
+ m_Value(Mask)))) &&
+ Mask->getType() == Ty)
+ return BinaryOperator::CreateAnd(Builder.CreatePtrToAddr(Ptr), Mask);
+
+ if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp))
+ return replaceInstUsesWith(CI, V);
+
// FIXME: Implement variants of ptrtoint folds.
return commonCastTransforms(CI);
}
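The new ptrmask fold in IR form (a sketch; committed coverage is in ptrtoaddr.ll per the diffstat), again assuming a 64-bit address type:

  ; (ptrtoaddr (ptrmask P, M)) -> (and (ptrtoaddr P), M)
  %m  = call ptr @llvm.ptrmask.p0.i64(ptr %p, i64 -16)
  %a  = ptrtoaddr ptr %m to i64
  ; becomes:
  %pa = ptrtoaddr ptr %p to i64
  %a2 = and i64 %pa, -16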
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 9c75d9a..d85e4f7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -700,7 +700,7 @@ public:
/// folded operation.
void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);
- Value *foldPtrToIntOfGEP(Type *IntTy, Value *Ptr);
+ Value *foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr);
Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, CmpPredicate Cond,
Instruction &I);
Instruction *foldSelectICmp(CmpPredicate Pred, SelectInst *SI, Value *RHS,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index acad795..d9ac26bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3648,6 +3648,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Sub = VecOp->getDefiningRecipe();
VecOp = Tmp;
}
+
+ // If ValB is a constant and can be safely extended, truncate it to the same
+ // type as ExtA's operand, then extend it to the same type as ExtA. This
+ // creates two uniform extends that can more easily be matched by the rest of
+ // the bundling code. The ExtB reference, ValB, and operand 1 of Mul are all
+ // replaced with the new extend of the constant.
+ auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
+ VPWidenCastRecipe *&ExtB,
+ VPValue *&ValB, VPWidenRecipe *Mul) {
+ if (!ExtA || ExtB || !ValB->isLiveIn())
+ return;
+ Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
+ Instruction::CastOps ExtOpc = ExtA->getOpcode();
+ const APInt *Const;
+ if (!match(ValB, m_APInt(Const)) ||
+ !llvm::canConstantBeExtended(
+ Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
+ return;
+ // The truncate ensures that the type of each extended operand is the
+ // same, and it's been proven that the constant can be extended from
+ // NarrowTy safely. Necessary since ExtA's extended operand would be
+ // e.g. an i8, while the const will likely be an i32. This will be
+ // elided by later optimisations.
+ VPBuilder Builder(Mul);
+ auto *Trunc =
+ Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
+ Type *WideTy = Ctx.Types.inferScalarType(ExtA);
+ ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
+ Mul->setOperand(1, ExtB);
+ };
+
// Try to match reduce.add(mul(...)).
if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
auto *RecipeA =
@@ -3656,6 +3687,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
+ // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
+ ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
+
// Match reduce.add/sub(mul(ext, ext)).
if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
@@ -3665,7 +3699,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
cast<VPWidenRecipe>(Sub), Red);
return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
}
- // Match reduce.add(mul).
// TODO: Add an expression type for this variant with a negated mul
if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
return new VPExpressionRecipe(Mul, Red);
@@ -3674,18 +3707,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
// variants.
if (Sub)
return nullptr;
- // Match reduce.add(ext(mul(ext(A), ext(B)))).
- // All extend recipes must have same opcode or A == B
- // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
- if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
- m_ZExtOrSExt(m_VPValue()))))) {
+
+ // Match reduce.add(ext(mul(A, B))).
+ if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
- auto *Ext0 =
- cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
- auto *Ext1 =
- cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
- if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
+ auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
+ auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
+
+ // reduce.add(ext(mul(ext, const)))
+ // -> reduce.add(ext(mul(ext, ext(const))))
+ ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
+
+ // reduce.add(ext(mul(ext(A), ext(B))))
+ // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
+ // The inner extends must either have the same opcode as the outer extend,
+ // or be the same recipe, in which case the multiply can never produce a
+ // negative value. Either way the outer extend can be folded away by doing
+ // wider extends for the operands of the mul.
+ if (Ext0 && Ext1 &&
+ (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
Ext0->getOpcode() == Ext1->getOpcode() &&
IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
auto *NewExt0 = new VPWidenCastRecipe(
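Schematically, ExtendAndReplaceConstantOp rewrites the constant operand so that both mul operands look like uniform extends (a pseudo-IR sketch assuming an i8-to-i32 widening; the real recipe output is in the updated vplan-printing-reductions.ll):

  ; before: reduce.add(mul(zext i8 %a to i32, i32 42))
  ; after:  reduce.add(mul(zext i8 %a to i32,
  ;                        zext (trunc i32 42 to i8) to i32))
  ; canConstantBeExtended proves 42 survives the i8 round-trip, so the
  ; trunc/zext pair is lossless and folds away later.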
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 2583a93..5b81f5d 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -426,3 +426,21 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin
call void %callee()
ret void
}
+
+define void @disable_tailcallopt(ptr %callee) "aarch64_inout_zt0" nounwind {
+; CHECK-COMMON-LABEL: disable_tailcallopt:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: sub sp, sp, #80
+; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: mov x19, sp
+; CHECK-COMMON-NEXT: str zt0, [x19]
+; CHECK-COMMON-NEXT: smstop za
+; CHECK-COMMON-NEXT: blr x0
+; CHECK-COMMON-NEXT: smstart za
+; CHECK-COMMON-NEXT: ldr zt0, [x19]
+; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: add sp, sp, #80
+; CHECK-COMMON-NEXT: ret
+ tail call void %callee()
+ ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll b/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll
new file mode 100644
index 0000000..fcf1246
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll
@@ -0,0 +1,94 @@
+;; REQUIRES: asserts
+;; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b %s -o - | FileCheck %s
+;; Sanity check for lowering masked scatter without assertion errors.
+
+define void @outer_product(ptr %aptr, ptr %bptr, ptr %cptr, i32 %T, i32 %W) {
+entry:
+ %W.ripple.bcast.splatinsert = insertelement <8 x i32> poison, i32 %W, i64 0
+ %W.ripple.bcast.splat = shufflevector <8 x i32> %W.ripple.bcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
+ %div1194 = lshr i32 %T, 3
+ %cmp84.not = icmp ult i32 %T, 8
+ br i1 %cmp84.not, label %for.end49, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %div10195 = lshr i32 %W, 3
+ %cmp1782.not = icmp ult i32 %W, 8
+ %arrayidx27.ripple.LS.dim.slope = mul <8 x i32> %W.ripple.bcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %arrayidx27.ripple.LS.dim.slope.ripple.bcast = shufflevector <8 x i32> %arrayidx27.ripple.LS.dim.slope, <8 x i32> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %arrayidx27.ripple.LS.slope = add <64 x i32> %arrayidx27.ripple.LS.dim.slope.ripple.bcast, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %invariant.gep196 = getelementptr i8, ptr %cptr, <64 x i32> %arrayidx27.ripple.LS.slope
+ br label %for.body
+
+for.body: ; preds = %for.end, %for.body.preheader
+ %ripple.par.iv.085 = phi i32 [ %add48, %for.end ], [ 0, %for.body.preheader ]
+ %mul2 = shl i32 %ripple.par.iv.085, 3
+ br i1 %cmp1782.not, label %for.end, label %for.body18.lr.ph
+
+for.body18.lr.ph: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw i8, ptr %aptr, i32 %mul2
+ %mul25 = mul i32 %mul2, %W
+ %gep197 = getelementptr i8, <64 x ptr> %invariant.gep196, i32 %mul25
+ br label %for.body18
+
+for.body18: ; preds = %for.body18, %for.body18.lr.ph
+ %ripple.par.iv15.083 = phi i32 [ 0, %for.body18.lr.ph ], [ %add28, %for.body18 ]
+ %mul19 = shl i32 %ripple.par.iv15.083, 3
+ %.ripple.LS.instance184 = load <8 x i8>, ptr %arrayidx, align 1
+ %.ripple.LS.instance184.ripple.bcast = shufflevector <8 x i8> %.ripple.LS.instance184, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %arrayidx21 = getelementptr inbounds nuw i8, ptr %bptr, i32 %mul19
+ %.ripple.LS.instance = load <8 x i8>, ptr %arrayidx21, align 1
+ %.ripple.LS.instance.ripple.bcast = shufflevector <8 x i8> %.ripple.LS.instance, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %mul23.ripple.LS.instance = mul <64 x i8> %.ripple.LS.instance.ripple.bcast, %.ripple.LS.instance184.ripple.bcast
+ %gep = getelementptr i8, <64 x ptr> %gep197, i32 %mul19
+ tail call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %mul23.ripple.LS.instance, <64 x ptr> %gep, i32 1, <64 x i1> splat (i1 true))
+ %add28 = add nuw i32 %ripple.par.iv15.083, 1
+ %cmp17 = icmp ult i32 %add28, %div10195
+ br i1 %cmp17, label %for.body18, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body18
+ %0 = shl i32 %add28, 3
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %for.body
+ %ripple.par.iv15.0.lcssa = phi i32 [ 0, %for.body ], [ %0, %for.end.loopexit ]
+ %add30.ripple.bcast.splatinsert = insertelement <8 x i32> poison, i32 %ripple.par.iv15.0.lcssa, i64 0
+ %add30.ripple.bcast.splat = shufflevector <8 x i32> %add30.ripple.bcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
+ %add30.ripple.LS.instance = or disjoint <8 x i32> %add30.ripple.bcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp32.ripple.LS.instance = icmp ne i32 %ripple.par.iv15.0.lcssa, %W
+ %cmp32.ripple.LS.instance.ripple.bcast.splatinsert = insertelement <8 x i1> poison, i1 %cmp32.ripple.LS.instance, i64 0
+ %cmp32.ripple.LS.instance.ripple.bcast.splat = shufflevector <8 x i1> %cmp32.ripple.LS.instance.ripple.bcast.splatinsert, <8 x i1> poison, <8 x i32> zeroinitializer
+ %cmp33.ripple.vectorized = icmp ult <8 x i32> %add30.ripple.LS.instance, %W.ripple.bcast.splat
+ %or.cond.ripple.LS.instance = select <8 x i1> %cmp32.ripple.LS.instance.ripple.bcast.splat, <8 x i1> %cmp33.ripple.vectorized, <8 x i1> zeroinitializer
+ %or.cond.ripple.LS.instance.ripple.bcast = shufflevector <8 x i1> %or.cond.ripple.LS.instance, <8 x i1> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle = shufflevector <8 x i1> %or.cond.ripple.LS.instance, <8 x i1> <i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 false>, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 15>
+ %or.cond.ripple.LS.instance.ripple.reducelog2.operator = or <8 x i1> %or.cond.ripple.LS.instance, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle
+ %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle189 = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator, <8 x i1> <i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 false, i1 false>, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 14, i32 15>
+ %or.cond.ripple.LS.instance.ripple.reducelog2.operator190 = or <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle189
+ %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle191 = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator190, <8 x i1> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %or.cond.ripple.LS.instance.ripple.reducelog2.operator192 = or <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator190, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle191
+ %ripple.red.extract.ripple.bcast.splat = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator192, <8 x i1> poison, <8 x i32> zeroinitializer
+ %arrayidx34.ripple.branch.clone = getelementptr inbounds nuw i8, ptr %aptr, i32 %mul2
+ %.ripple.LS.instance188.ripple.branch.clone.ripple.masked.load = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %arrayidx34.ripple.branch.clone, i32 1, <8 x i1> %ripple.red.extract.ripple.bcast.splat, <8 x i8> poison)
+ %.ripple.LS.instance188.ripple.bcast.ripple.branch.clone = shufflevector <8 x i8> %.ripple.LS.instance188.ripple.branch.clone.ripple.masked.load, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %arrayidx36.ripple.branch.clone = getelementptr inbounds nuw i8, ptr %bptr, i32 %ripple.par.iv15.0.lcssa
+ %.ripple.LS.instance187.ripple.branch.clone.ripple.masked.load = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %arrayidx36.ripple.branch.clone, i32 1, <8 x i1> %or.cond.ripple.LS.instance, <8 x i8> poison)
+ %.ripple.LS.instance187.ripple.bcast.ripple.branch.clone = shufflevector <8 x i8> %.ripple.LS.instance187.ripple.branch.clone.ripple.masked.load, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %mul38.ripple.LS.instance.ripple.branch.clone = mul <64 x i8> %.ripple.LS.instance187.ripple.bcast.ripple.branch.clone, %.ripple.LS.instance188.ripple.bcast.ripple.branch.clone
+ %mul40.ripple.branch.clone = mul i32 %mul2, %W
+ %1 = getelementptr i8, ptr %cptr, i32 %mul40.ripple.branch.clone
+ %arrayidx42.ripple.branch.clone = getelementptr i8, ptr %1, i32 %ripple.par.iv15.0.lcssa
+ %arrayidx42.ripple.LS.instance.ripple.branch.clone = getelementptr i8, ptr %arrayidx42.ripple.branch.clone, <64 x i32> %arrayidx27.ripple.LS.slope
+ tail call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %mul38.ripple.LS.instance.ripple.branch.clone, <64 x ptr> %arrayidx42.ripple.LS.instance.ripple.branch.clone, i32 1, <64 x i1> %or.cond.ripple.LS.instance.ripple.bcast)
+ %add48 = add nuw i32 %ripple.par.iv.085, 1
+ %cmp = icmp ult i32 %add48, %div1194
+ br i1 %cmp, label %for.body, label %for.end49
+
+for.end49: ; preds = %for.end, %entry
+ ret void
+}
+
+;; CHECK: outer_product
+;; CHECK: {{r[0-9]+}} = lsr({{r[0-9]+}},#3)
+;; CHECK: {{q[0-9]+}} = vand({{v[0-9]+}},{{r[0-9]+}})
+;; CHECK: {{v[0-9]+}} = vmux(q0,{{v[0-9]+}},{{v[0-9]+}})
+;; CHECK: vmem{{.*}} = {{v[0-9]+}}
diff --git a/llvm/test/CodeGen/PowerPC/bittest.ll b/llvm/test/CodeGen/PowerPC/bittest.ll
new file mode 100644
index 0000000..cba56e3
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/bittest.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs < %s -O3 -mcpu=ppc -mtriple powerpc-ibm-aix \
+; RUN: -ppc-asm-full-reg-names | FileCheck %s
+
+define i32 @foo(i32 noundef signext %x) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stwu r1, -64(r1)
+; CHECK-NEXT: stw r0, 72(r1)
+; CHECK-NEXT: cmpwi r3, 8
+; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill
+; CHECK-NEXT: mr r31, r3
+; CHECK-NEXT: li r3, 0
+; CHECK-NEXT: ble cr0, L..BB0_4
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: cmpwi r31, 11
+; CHECK-NEXT: bge cr0, L..BB0_7
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: cmplwi r31, 9
+; CHECK-NEXT: beq cr0, L..BB0_9
+; CHECK-NEXT: # %bb.3: # %entry
+; CHECK-NEXT: cmplwi r31, 10
+; CHECK-NEXT: beq cr0, L..BB0_11
+; CHECK-NEXT: b L..BB0_13
+; CHECK-NEXT: L..BB0_4: # %entry
+; CHECK-NEXT: cmplwi r31, 4
+; CHECK-NEXT: beq cr0, L..BB0_12
+; CHECK-NEXT: # %bb.5: # %entry
+; CHECK-NEXT: cmplwi r31, 7
+; CHECK-NEXT: beq cr0, L..BB0_11
+; CHECK-NEXT: # %bb.6: # %entry
+; CHECK-NEXT: cmplwi r31, 8
+; CHECK-NEXT: beq cr0, L..BB0_10
+; CHECK-NEXT: b L..BB0_13
+; CHECK-NEXT: L..BB0_7: # %entry
+; CHECK-NEXT: beq cr0, L..BB0_10
+; CHECK-NEXT: # %bb.8: # %entry
+; CHECK-NEXT: cmplwi r31, 12
+; CHECK-NEXT: bne cr0, L..BB0_13
+; CHECK-NEXT: L..BB0_9: # %sw.bb2
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: bl .foo3[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: b L..BB0_13
+; CHECK-NEXT: L..BB0_10: # %sw.bb1
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: bl .foo2[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: b L..BB0_13
+; CHECK-NEXT: L..BB0_11: # %sw.bb
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: bl .foo1[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: b L..BB0_13
+; CHECK-NEXT: L..BB0_12: # %sw.bb3
+; CHECK-NEXT: li r3, 4
+; CHECK-NEXT: bl .foo4[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: li r3, 4
+; CHECK-NEXT: L..BB0_13: # %return
+; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload
+; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: lwz r0, 8(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+entry:
+ switch i32 %x, label %return [
+ i32 7, label %sw.bb
+ i32 10, label %sw.bb
+ i32 8, label %sw.bb1
+ i32 11, label %sw.bb1
+ i32 9, label %sw.bb2
+ i32 12, label %sw.bb2
+ i32 4, label %sw.bb3
+ ]
+
+sw.bb: ; preds = %entry, %entry
+ tail call void @foo1(i32 noundef signext %x)
+ br label %return
+
+sw.bb1: ; preds = %entry, %entry
+ tail call void @foo2(i32 noundef signext %x)
+ br label %return
+
+sw.bb2: ; preds = %entry, %entry
+ tail call void @foo3(i32 noundef signext %x)
+ br label %return
+
+sw.bb3: ; preds = %entry
+ tail call void @foo4(i32 noundef signext 4)
+ br label %return
+
+return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry
+ %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ %x, %sw.bb ]
+ ret i32 %retval.0
+}
+
+define i32 @goo(i32 noundef signext %x) {
+; CHECK-LABEL: goo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stwu r1, -64(r1)
+; CHECK-NEXT: stw r0, 72(r1)
+; CHECK-NEXT: cmplwi r3, 12
+; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill
+; CHECK-NEXT: mr r31, r3
+; CHECK-NEXT: bgt cr0, L..BB1_7
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: li r3, 1
+; CHECK-NEXT: slw r3, r3, r31
+; CHECK-NEXT: andi. r4, r3, 5632
+; CHECK-NEXT: bne cr0, L..BB1_4
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: andi. r3, r3, 2304
+; CHECK-NEXT: beq cr0, L..BB1_5
+; CHECK-NEXT: # %bb.3: # %sw.bb1
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: bl .foo2[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: b L..BB1_9
+; CHECK-NEXT: L..BB1_4: # %sw.bb2
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: bl .foo3[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: b L..BB1_9
+; CHECK-NEXT: L..BB1_5: # %entry
+; CHECK-NEXT: cmplwi r31, 7
+; CHECK-NEXT: bne cr0, L..BB1_7
+; CHECK-NEXT: # %bb.6: # %sw.bb
+; CHECK-NEXT: li r3, 7
+; CHECK-NEXT: li r31, 7
+; CHECK-NEXT: bl .foo1[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: b L..BB1_9
+; CHECK-NEXT: L..BB1_7: # %entry
+; CHECK-NEXT: cmplwi r31, 4
+; CHECK-NEXT: li r31, 0
+; CHECK-NEXT: bne cr0, L..BB1_9
+; CHECK-NEXT: # %bb.8: # %sw.bb3
+; CHECK-NEXT: li r3, 4
+; CHECK-NEXT: li r31, 4
+; CHECK-NEXT: bl .foo4[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: L..BB1_9: # %return
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload
+; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: lwz r0, 8(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+entry:
+ switch i32 %x, label %return [
+ i32 7, label %sw.bb
+ i32 8, label %sw.bb1
+ i32 11, label %sw.bb1
+ i32 9, label %sw.bb2
+ i32 10, label %sw.bb2
+ i32 12, label %sw.bb2
+ i32 4, label %sw.bb3
+ ]
+
+sw.bb: ; preds = %entry
+ tail call void @foo1(i32 noundef signext 7)
+ br label %return
+
+sw.bb1: ; preds = %entry, %entry
+ tail call void @foo2(i32 noundef signext %x)
+ br label %return
+
+sw.bb2: ; preds = %entry, %entry, %entry
+ tail call void @foo3(i32 noundef signext %x)
+ br label %return
+
+sw.bb3: ; preds = %entry
+ tail call void @foo4(i32 noundef signext 4)
+ br label %return
+
+return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry
+ %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ 7, %sw.bb ]
+ ret i32 %retval.0
+}
+
+declare void @foo1(i32 noundef signext)
+
+declare void @foo2(i32 noundef signext)
+
+declare void @foo3(i32 noundef signext)
+
+declare void @foo4(i32 noundef signext)
diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
index ea4d32b..d087491 100644
--- a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
@@ -660,8 +660,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
; SSE2-LABEL: call_ret_v3bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: movl 4(%rdi), %eax
-; SSE2-NEXT: pinsrw $0, %eax, %xmm1
+; SSE2-NEXT: pinsrw $0, 4(%rdi), %xmm1
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: callq returns_v3bf16@PLT
@@ -725,8 +724,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
; AVXNECONVERT-LABEL: call_ret_v3bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: pushq %rax
-; AVXNECONVERT-NEXT: movl 4(%rdi), %eax
-; AVXNECONVERT-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVXNECONVERT-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVXNECONVERT-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVXNECONVERT-NEXT: callq returns_v3bf16@PLT
diff --git a/llvm/test/CodeGen/X86/trunc-srl-load.ll b/llvm/test/CodeGen/X86/trunc-srl-load.ll
index 4dae143..d9c21d3 100644
--- a/llvm/test/CodeGen/X86/trunc-srl-load.ll
+++ b/llvm/test/CodeGen/X86/trunc-srl-load.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64
; Tests showing for the analysis of non-constant shift amounts to improve load address math
@@ -12,42 +12,20 @@
define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub64_16:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl 4(%eax), %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: andb $16, %cl
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: shrdl %cl, %esi, %edx
-; X86-NEXT: testb $32, %ch
-; X86-NEXT: jne .LBB0_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: .LBB0_2:
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $48, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movzwl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub64_16:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: andb $48, %cl
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shrq %cl, %rax
-; SSE-NEXT: # kill: def $ax killed $ax killed $rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub64_16:
-; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX-NEXT: andb $48, %sil
-; AVX-NEXT: shrxq %rsi, (%rdi), %rax
-; AVX-NEXT: # kill: def $ax killed $ax killed $rax
-; AVX-NEXT: retq
+; X64-LABEL: extractSub64_16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $48, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movzwl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 63
%idx_align = and i32 %idx_bounds, -16
%sh = zext nneg i32 %idx_align to i64
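Worked example for the X64 sequence above: the two masks fold into the single andl $48, since 63 & -16 == 48. For idx = 37, 37 & 48 = 32 and 32 >> 3 = 4, so movzwl reads the 16-bit subword at byte offset 4, that is, bits [32,48) of the i64: exactly what the old shrq/shrdl sequence extracted.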
@@ -60,67 +38,20 @@ define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub128_16:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movzbl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: movl 8(%ecx), %edi
-; X86-NEXT: movl 12(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: andb $16, %cl
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: movzbl %al, %edx
-; X86-NEXT: movl (%esp,%edx), %eax
-; X86-NEXT: movl 4(%esp,%edx), %edx
-; X86-NEXT: shrdl %cl, %edx, %eax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $112, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movzwl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub128_16:
-; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andb $48, %cl
-; SSE-NEXT: movq %rdx, %rdi
-; SSE-NEXT: shrq %cl, %rdi
-; SSE-NEXT: shrdq %cl, %rdx, %rax
-; SSE-NEXT: testb $64, %sil
-; SSE-NEXT: cmovneq %rdi, %rax
-; SSE-NEXT: # kill: def $ax killed $ax killed $rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub128_16:
-; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rdx
-; AVX-NEXT: movq 8(%rdi), %rax
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: andb $48, %cl
-; AVX-NEXT: shrdq %cl, %rax, %rdx
-; AVX-NEXT: shrxq %rcx, %rax, %rax
-; AVX-NEXT: testb $64, %sil
-; AVX-NEXT: cmoveq %rdx, %rax
-; AVX-NEXT: # kill: def $ax killed $ax killed $rax
-; AVX-NEXT: retq
+; X64-LABEL: extractSub128_16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $112, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movzwl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 127
%idx_align = and i32 %idx_bounds, -16
%sh = zext nneg i32 %idx_align to i128
@@ -133,62 +64,20 @@ define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub128_32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movzbl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: movl 8(%ecx), %edi
-; X86-NEXT: movl 12(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andb $96, %al
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl (%esp,%eax), %eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $96, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub128_32:
-; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andb $32, %cl
-; SSE-NEXT: movq %rdx, %rdi
-; SSE-NEXT: shrq %cl, %rdi
-; SSE-NEXT: shrdq %cl, %rdx, %rax
-; SSE-NEXT: testb $64, %sil
-; SSE-NEXT: cmovneq %rdi, %rax
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub128_32:
-; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rdx
-; AVX-NEXT: movq 8(%rdi), %rax
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: andb $32, %cl
-; AVX-NEXT: shrdq %cl, %rax, %rdx
-; AVX-NEXT: shrxq %rcx, %rax, %rax
-; AVX-NEXT: testb $64, %sil
-; AVX-NEXT: cmoveq %rdx, %rax
-; AVX-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX-NEXT: retq
+; X64-LABEL: extractSub128_32:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $96, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 127
%idx_align = and i32 %idx_bounds, -32
%sh = zext nneg i32 %idx_align to i128
@@ -201,46 +90,20 @@ define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub128_64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movzbl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: movl 8(%ecx), %edi
-; X86-NEXT: movl 12(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andb $64, %al
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: movzbl %al, %ecx
-; X86-NEXT: movl (%esp,%ecx), %eax
-; X86-NEXT: movl 4(%esp,%ecx), %edx
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $64, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%ecx,%edx), %eax
+; X86-NEXT: movl 4(%ecx,%edx), %edx
; X86-NEXT: retl
;
; X64-LABEL: extractSub128_64:
; X64: # %bb.0:
-; X64-NEXT: testb $64, %sil
-; X64-NEXT: je .LBB3_1
-; X64-NEXT: # %bb.2:
-; X64-NEXT: movq 8(%rdi), %rax
-; X64-NEXT: retq
-; X64-NEXT: .LBB3_1:
-; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $64, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movq (%rdi,%rsi), %rax
; X64-NEXT: retq
%idx_bounds = and i32 %idx, 127
%idx_align = and i32 %idx_bounds, -64
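For the 64-from-128 case the old X64 code selected between the two halves with a branch on bit 6 of the index (testb $64 and two return paths). The new code folds that bit into the address instead: (idx & 64) >> 3 is either 0 or 8, so a single movq (%rdi,%rsi) picks the correct half branchlessly.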
@@ -254,185 +117,20 @@ define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub512_8:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $192, %esp
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %ebx
-; X86-NEXT: movl 44(%eax), %edi
-; X86-NEXT: movl 48(%eax), %esi
-; X86-NEXT: movl 52(%eax), %edx
-; X86-NEXT: movl 56(%eax), %ecx
-; X86-NEXT: movl 60(%eax), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 12(%ebp), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: andl $24, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl 48(%esp,%edx), %eax
-; X86-NEXT: movl 52(%esp,%edx), %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %edx, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: andl $63, %ecx
+; X86-NEXT: movzbl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub512_8:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: # kill: def $esi killed $esi def $rsi
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movups 16(%rdi), %xmm1
-; SSE-NEXT: movups 32(%rdi), %xmm2
-; SSE-NEXT: movups 48(%rdi), %xmm3
-; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $56, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: movq -128(%rsp,%rsi), %rdx
-; SSE-NEXT: shrq %cl, %rdx
-; SSE-NEXT: movl -120(%rsp,%rsi), %eax
-; SSE-NEXT: addl %eax, %eax
-; SSE-NEXT: notl %ecx
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: orl %edx, %eax
-; SSE-NEXT: # kill: def $al killed $al killed $rax
-; SSE-NEXT: popq %rcx
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: extractSub512_8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT: vmovups (%rdi), %ymm0
-; AVX2-NEXT: vmovups 32(%rdi), %ymm1
-; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $56, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: notl %ecx
-; AVX2-NEXT: movl -120(%rsp,%rsi), %edx
-; AVX2-NEXT: addl %edx, %edx
-; AVX2-NEXT: shlxq %rcx, %rdx, %rcx
-; AVX2-NEXT: orl %ecx, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $rax
-; AVX2-NEXT: popq %rcx
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: extractSub512_8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: vmovups (%rdi), %ymm0
-; AVX512-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $56, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rax
-; AVX512-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX512-NEXT: notl %ecx
-; AVX512-NEXT: movl -120(%rsp,%rsi), %edx
-; AVX512-NEXT: addl %edx, %edx
-; AVX512-NEXT: shlxq %rcx, %rdx, %rcx
-; AVX512-NEXT: orl %ecx, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $rax
-; AVX512-NEXT: popq %rcx
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: extractSub512_8:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: andl $63, %esi
+; X64-NEXT: movzbl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 511
%idx_align = and i32 %idx_bounds, -8
%ld = load i512, ptr %word, align 8
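The i512 cases show the largest win. The old lowering copied the full 64-byte value into a zero-padded stack slot (the movups/movaps stores above) and performed a variable funnel shift out of it; the new lowering reads the one addressed byte directly, so the stack traffic, the shift sequence, and the vzeroupper all disappear. The index math is unchanged: (idx >> 3) & 63 is the byte offset of the 8-bit-aligned subword, e.g. idx = 100 gives byte 12, bits [96,104).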
@@ -445,152 +143,21 @@ define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub512_64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $192, %esp
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %ebx
-; X86-NEXT: movl 44(%eax), %edi
-; X86-NEXT: movl 48(%eax), %esi
-; X86-NEXT: movl 52(%eax), %edx
-; X86-NEXT: movl 56(%eax), %ecx
-; X86-NEXT: movl 60(%eax), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: andl $56, %ecx
-; X86-NEXT: movl 48(%esp,%ecx), %eax
-; X86-NEXT: movl 52(%esp,%ecx), %edx
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $56, %edx
+; X86-NEXT: movl (%ecx,%edx), %eax
+; X86-NEXT: movl 4(%ecx,%edx), %edx
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub512_64:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: # kill: def $esi killed $esi def $rsi
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movups 16(%rdi), %xmm1
-; SSE-NEXT: movups 32(%rdi), %xmm2
-; SSE-NEXT: movups 48(%rdi), %xmm3
-; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: movq -128(%rsp,%rsi), %rax
-; SSE-NEXT: popq %rcx
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: extractSub512_64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: vmovups (%rdi), %ymm0
-; AVX2-NEXT: vmovups 32(%rdi), %ymm1
-; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: movq -128(%rsp,%rsi), %rax
-; AVX2-NEXT: popq %rcx
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: extractSub512_64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: vmovups (%rdi), %ymm0
-; AVX512-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: movq -128(%rsp,%rsi), %rax
-; AVX512-NEXT: popq %rcx
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: extractSub512_64:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: andl $56, %esi
+; X64-NEXT: movq (%rdi,%rsi), %rax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 511
%idx_align = and i32 %idx_bounds, -64
%sh = zext nneg i32 %idx_align to i512
@@ -603,143 +170,35 @@ define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind {
define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub512_128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $192, %esp
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %ebx
-; X86-NEXT: movl 44(%eax), %edi
-; X86-NEXT: movl 48(%eax), %esi
-; X86-NEXT: movl 52(%eax), %edx
-; X86-NEXT: movl 56(%eax), %ecx
-; X86-NEXT: movl 60(%eax), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 16(%ebp), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrl $3, %edi
-; X86-NEXT: andl $48, %edi
-; X86-NEXT: movl 48(%esp,%edi), %ecx
-; X86-NEXT: movl 52(%esp,%edi), %edx
-; X86-NEXT: movl 56(%esp,%edi), %esi
-; X86-NEXT: movl 60(%esp,%edi), %edi
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $48, %edx
+; X86-NEXT: movl (%ecx,%edx), %esi
+; X86-NEXT: movl 4(%ecx,%edx), %edi
+; X86-NEXT: movl 8(%ecx,%edx), %ebx
+; X86-NEXT: movl 12(%ecx,%edx), %ecx
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %esi, (%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
;
-; SSE-LABEL: extractSub512_128:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: # kill: def $esi killed $esi def $rsi
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movups 16(%rdi), %xmm1
-; SSE-NEXT: movups 32(%rdi), %xmm2
-; SSE-NEXT: movups 48(%rdi), %xmm3
-; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $48, %esi
-; SSE-NEXT: movq -128(%rsp,%rsi), %rax
-; SSE-NEXT: movq -120(%rsp,%rsi), %rdx
-; SSE-NEXT: popq %rcx
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub512_128:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rax
-; AVX-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX-NEXT: vmovups (%rdi), %ymm0
-; AVX-NEXT: vmovups 32(%rdi), %ymm1
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: shrl $3, %esi
-; AVX-NEXT: andl $48, %esi
-; AVX-NEXT: movq -128(%rsp,%rsi), %rax
-; AVX-NEXT: movq -120(%rsp,%rsi), %rdx
-; AVX-NEXT: popq %rcx
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; X64-LABEL: extractSub512_128:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: andl $48, %esi
+; X64-NEXT: movq (%rdi,%rsi), %rax
+; X64-NEXT: movq 8(%rdi,%rsi), %rdx
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 511
%idx_align = and i32 %idx_bounds, -128
%sh = zext nneg i32 %idx_align to i512
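Extracting an i128 still needs two registers. On x86-64 the result comes back in rdx:rax via two movq loads at the computed offset; on i686 it is written through the hidden sret pointer (hence the four movl stores through %eax and the retl $4). In both cases the spill-and-shift sequence is gone and only the index math remains.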
@@ -752,916 +211,21 @@ define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {
define i64 @extractSub4096_64(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub4096_64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $1536, %esp # imm = 0x600
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 64(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 68(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 76(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 80(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 84(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 88(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 92(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 100(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 104(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 108(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 112(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 116(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 120(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 124(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 128(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 132(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 140(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 144(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 148(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 152(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 156(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 160(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 164(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 168(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 172(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 176(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 180(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 184(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 188(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 192(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 196(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 200(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 204(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 208(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 212(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 216(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 220(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 224(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 228(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 232(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 236(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 240(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 244(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 248(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 252(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 256(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 260(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 264(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 268(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 272(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 276(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 280(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 284(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 288(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 292(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 296(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 300(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 304(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 308(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 312(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 316(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 320(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 324(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 328(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 332(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 336(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 340(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 344(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 348(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 352(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 356(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 360(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 364(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 368(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 372(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 376(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 380(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 384(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 388(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 392(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 396(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 400(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 404(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 408(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 412(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 416(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 420(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 424(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 428(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 432(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 436(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 440(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 444(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 448(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 452(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 456(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 460(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 464(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 468(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 472(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 476(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 480(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 484(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 488(%eax), %ebx
-; X86-NEXT: movl 492(%eax), %edi
-; X86-NEXT: movl 496(%eax), %esi
-; X86-NEXT: movl 500(%eax), %edx
-; X86-NEXT: movl 504(%eax), %ecx
-; X86-NEXT: movl 508(%eax), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $4032, %ecx # imm = 0xFC0
-; X86-NEXT: andl 12(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: movl 496(%esp,%ecx), %eax
-; X86-NEXT: movl 500(%esp,%ecx), %edx
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $4032, %edx # imm = 0xFC0
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%ecx,%edx), %eax
+; X86-NEXT: movl 4(%ecx,%edx), %edx
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub4096_64:
-; SSE: # %bb.0:
-; SSE-NEXT: subq $1176, %rsp # imm = 0x498
-; SSE-NEXT: # kill: def $esi killed $esi def $rsi
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 16(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 32(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 48(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 64(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 80(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 96(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 112(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 128(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT: movups 144(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 160(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 176(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 192(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 208(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 224(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 240(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 256(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 272(%rdi), %xmm15
-; SSE-NEXT: movups 288(%rdi), %xmm14
-; SSE-NEXT: movups 304(%rdi), %xmm13
-; SSE-NEXT: movups 320(%rdi), %xmm12
-; SSE-NEXT: movups 336(%rdi), %xmm11
-; SSE-NEXT: movups 352(%rdi), %xmm10
-; SSE-NEXT: movups 368(%rdi), %xmm9
-; SSE-NEXT: movups 384(%rdi), %xmm8
-; SSE-NEXT: movups 400(%rdi), %xmm7
-; SSE-NEXT: movups 416(%rdi), %xmm6
-; SSE-NEXT: movups 432(%rdi), %xmm5
-; SSE-NEXT: movups 448(%rdi), %xmm4
-; SSE-NEXT: movups 464(%rdi), %xmm3
-; SSE-NEXT: movups 480(%rdi), %xmm2
-; SSE-NEXT: movups 496(%rdi), %xmm1
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $4032, %esi # imm = 0xFC0
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: movq 144(%rsp,%rsi), %rax
-; SSE-NEXT: addq $1176, %rsp # imm = 0x498
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: extractSub4096_64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: subq $936, %rsp # imm = 0x3A8
-; AVX2-NEXT: vmovups (%rdi), %ymm0
-; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovups 32(%rdi), %ymm1
-; AVX2-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-NEXT: vmovups 96(%rdi), %ymm3
-; AVX2-NEXT: vmovups 128(%rdi), %ymm4
-; AVX2-NEXT: vmovups 160(%rdi), %ymm5
-; AVX2-NEXT: vmovups 192(%rdi), %ymm6
-; AVX2-NEXT: vmovups 224(%rdi), %ymm7
-; AVX2-NEXT: vmovups 256(%rdi), %ymm8
-; AVX2-NEXT: vmovups 288(%rdi), %ymm9
-; AVX2-NEXT: vmovups 320(%rdi), %ymm10
-; AVX2-NEXT: vmovups 352(%rdi), %ymm11
-; AVX2-NEXT: vmovups 384(%rdi), %ymm12
-; AVX2-NEXT: vmovups 416(%rdi), %ymm13
-; AVX2-NEXT: vmovups 448(%rdi), %ymm14
-; AVX2-NEXT: vmovups 480(%rdi), %ymm15
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm3, (%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT: andl $4032, %esi # imm = 0xFC0
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: movq -96(%rsp,%rsi), %rax
-; AVX2-NEXT: addq $936, %rsp # imm = 0x3A8
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: extractSub4096_64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: subq $904, %rsp # imm = 0x388
-; AVX512-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT: vmovups (%rdi), %ymm0
-; AVX512-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512-NEXT: vmovups 64(%rdi), %ymm2
-; AVX512-NEXT: vmovups 96(%rdi), %ymm3
-; AVX512-NEXT: vmovups 128(%rdi), %ymm4
-; AVX512-NEXT: vmovups 160(%rdi), %ymm5
-; AVX512-NEXT: vmovups 192(%rdi), %ymm6
-; AVX512-NEXT: vmovups 224(%rdi), %ymm7
-; AVX512-NEXT: vmovups 256(%rdi), %ymm8
-; AVX512-NEXT: vmovups 288(%rdi), %ymm9
-; AVX512-NEXT: vmovups 320(%rdi), %ymm10
-; AVX512-NEXT: vmovups 352(%rdi), %ymm11
-; AVX512-NEXT: vmovups 384(%rdi), %ymm12
-; AVX512-NEXT: vmovups 416(%rdi), %ymm13
-; AVX512-NEXT: andl $4032, %esi # imm = 0xFC0
-; AVX512-NEXT: vmovups 448(%rdi), %ymm14
-; AVX512-NEXT: vmovups 480(%rdi), %ymm15
-; AVX512-NEXT: vxorps %xmm16, %xmm16, %xmm16
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm15, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm14, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm13, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm12, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm11, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm10, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm9, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm8, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm4, (%rsp)
-; AVX512-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: movq -128(%rsp,%rsi), %rax
-; AVX512-NEXT: addq $904, %rsp # imm = 0x388
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: extractSub4096_64:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $4032, %esi # imm = 0xFC0
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movq (%rdi,%rsi), %rax
+; X64-NEXT: retq
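+; Index math: (%idx & 4095) & -64 == %idx & 4032 (0xFC0): the bit index is
+; bounded to the 4096-bit object and aligned to a 64-bit element, and the
+; shift right by 3 converts that bit offset into a byte offset for the load.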
%idx_bounds = and i32 %idx, 4095
%idx_align = and i32 %idx_bounds, -64
%sh = zext nneg i32 %idx_align to i4096
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 81c4d5d..c3054a3 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -962,39 +962,22 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movl %ecx, %eax
-; X64-NO-BMI2-NEXT: shrb $6, %al
-; X64-NO-BMI2-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax
-; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NEXT: movb %al, (%rdx)
-; X64-NO-BMI2-NEXT: retq
-;
-; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-BMI2-NEXT: xorps %xmm1, %xmm1
-; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movl %esi, %eax
-; X64-BMI2-NEXT: shrb $6, %al
-; X64-BMI2-NEXT: movzbl %al, %eax
-; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax
-; X64-BMI2-NEXT: movb %al, (%rdx)
-; X64-BMI2-NEXT: retq
+; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64: # %bb.0:
+; X64-NEXT: movups (%rdi), %xmm0
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: leal (,%rsi,8), %eax
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: shrb $6, %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: leaq -72(%rsp,%rax,8), %rax
+; X64-NEXT: andl $7, %esi
+; X64-NEXT: movzbl (%rsi,%rax), %eax
+; X64-NEXT: movb %al, (%rdx)
+; X64-NEXT: retq
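+; Chunk math: byteOff*8 (the lea) shifted right by 6 selects the containing
+; qword of the stack copy, and byteOff & 7 indexes the byte within that qword.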
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
@@ -3417,7 +3400,6 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
; X86: {{.*}}
; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 8d36eef..84c2cc6 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -1220,41 +1220,23 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; no @load_16byte_chunk_of_16byte_alloca
define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movl %ecx, %eax
-; X64-NO-BMI2-NEXT: shrb $6, %al
-; X64-NO-BMI2-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax
-; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NEXT: movb %al, (%rdx)
-; X64-NO-BMI2-NEXT: retq
-;
-; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movl %esi, %eax
-; X64-BMI2-NEXT: shrb $6, %al
-; X64-BMI2-NEXT: movzbl %al, %eax
-; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax
-; X64-BMI2-NEXT: movb %al, (%rdx)
-; X64-BMI2-NEXT: retq
+; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64: # %bb.0:
+; X64-NEXT: movups (%rdi), %xmm0
+; X64-NEXT: movups 16(%rdi), %xmm1
+; X64-NEXT: leal (,%rsi,8), %eax
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: shrb $6, %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: leaq -72(%rsp,%rax,8), %rax
+; X64-NEXT: andl $7, %esi
+; X64-NEXT: movzbl (%rsi,%rax), %eax
+; X64-NEXT: movb %al, (%rdx)
+; X64-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
@@ -2156,7 +2138,6 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; no @load_32byte_chunk_of_32byte_alloca
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
; X86: {{.*}}
; X86-NO-SHLD: {{.*}}
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
index 054489c..f5cb4b7 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
@@ -286,6 +286,9 @@
#CHECK: xvmulhuh 4, 5, 7
0xf0,0x85,0x3b,0xd0
+#CHECK: mtlpl 3, 4
+0x7c,0x80,0x1a,0x26
+
#CHECK: xxmulmul 8, 3, 4, 2
0xed,0x03,0x22,0x08
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
index 17d1413..f0df8ce 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
@@ -280,6 +280,9 @@
#CHECK: xvmulhuh 4, 5, 7
0xd0,0x3b,0x85,0xf0
+#CHECK: mtlpl 3, 4
+0x26,0x1a,0x80,0x7c
+
#CHECK: xxmulmul 8, 3, 4, 2
0x08,0x22,0x03,0xed
diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
index e5bc1f4..bc0683e 100644
--- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
+++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
@@ -403,6 +403,10 @@
#CHECK-BE: xvmulhuh 4, 5, 7 # encoding: [0xf0,0x85,0x3b,0xd0]
#CHECK-LE: xvmulhuh 4, 5, 7 # encoding: [0xd0,0x3b,0x85,0xf0]
+ mtlpl 3, 4
+#CHECK-BE: mtlpl 3, 4 # encoding: [0x7c,0x80,0x1a,0x26]
+#CHECK-LE: mtlpl 3, 4 # encoding: [0x26,0x1a,0x80,0x7c]
+
xxmulmul 8, 3, 4, 2
#CHECK-BE: xxmulmul 8, 3, 4, 2 # encoding: [0xed,0x03,0x22,0x08]
#CHECK-LE: xxmulmul 8, 3, 4, 2 # encoding: [0x08,0x22,0x03,0xed]
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 7b0b871..cc87d65 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -10,8 +10,8 @@ declare void @llvm.assume(i1) #1
; Check that the assume has not been removed:
-define i32 @foo1(ptr %a) #0 {
-; DEFAULT-LABEL: @foo1(
+define i32 @align_to_bundle(ptr %a) #0 {
+; DEFAULT-LABEL: @align_to_bundle(
; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
; DEFAULT-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64
; DEFAULT-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
@@ -19,7 +19,7 @@ define i32 @foo1(ptr %a) #0 {
; DEFAULT-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
; DEFAULT-NEXT: ret i32 [[T0]]
;
-; BUNDLES-LABEL: @foo1(
+; BUNDLES-LABEL: @align_to_bundle(
; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
; BUNDLES-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 32) ]
; BUNDLES-NEXT: ret i32 [[T0]]
@@ -32,6 +32,28 @@ define i32 @foo1(ptr %a) #0 {
ret i32 %t0
}
+define i32 @align_to_bundle_ptrtoaddr(ptr %a) #0 {
+; DEFAULT-LABEL: @align_to_bundle_ptrtoaddr(
+; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
+; DEFAULT-NEXT: [[PTRINT:%.*]] = ptrtoaddr ptr [[A]] to i64
+; DEFAULT-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
+; DEFAULT-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
+; DEFAULT-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; DEFAULT-NEXT: ret i32 [[T0]]
+;
+; BUNDLES-LABEL: @align_to_bundle_ptrtoaddr(
+; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
+; BUNDLES-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 32) ]
+; BUNDLES-NEXT: ret i32 [[T0]]
+;
+ %t0 = load i32, ptr %a, align 4
+ %ptrint = ptrtoaddr ptr %a to i64
+ %maskedptr = and i64 %ptrint, 31
+ %maskcond = icmp eq i64 %maskedptr, 0
+ tail call void @llvm.assume(i1 %maskcond)
+ ret i32 %t0
+}
+
define i32 @align_assume_trunc_cond(ptr %a) #0 {
; DEFAULT-LABEL: @align_assume_trunc_cond(
; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index a7434a2..adf3aa1 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -237,3 +237,75 @@ define ptr addrspace(1) @gep_sub_ptrtoaddr_different_obj_addrsize(ptr addrspace(
call void @use.i32(i32 %addr)
ret ptr addrspace(1) %gep
}
+
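+; ptrtoaddr of a ptrmask folds to an 'and' of the pointer's address with the
+; mask, as the checks below show.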
+define i64 @ptrtoaddr_of_ptrmask(ptr %p, i64 %mask) {
+; CHECK-LABEL: define i64 @ptrtoaddr_of_ptrmask(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[MASK:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT: [[ADDR:%.*]] = and i64 [[MASK]], [[TMP1]]
+; CHECK-NEXT: ret i64 [[ADDR]]
+;
+ %masked = call ptr @llvm.ptrmask(ptr %p, i64 %mask)
+ %addr = ptrtoaddr ptr %masked to i64
+ ret i64 %addr
+}
+
+define i32 @ptrtoaddr_of_ptrmask_addrsize(ptr addrspace(1) %p, i32 %mask) {
+; CHECK-LABEL: define i32 @ptrtoaddr_of_ptrmask_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[MASK:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoaddr ptr addrspace(1) [[P]] to i32
+; CHECK-NEXT: [[ADDR:%.*]] = and i32 [[MASK]], [[TMP1]]
+; CHECK-NEXT: ret i32 [[ADDR]]
+;
+ %masked = call ptr addrspace(1) @llvm.ptrmask(ptr addrspace(1) %p, i32 %mask)
+ %addr = ptrtoaddr ptr addrspace(1) %masked to i32
+ ret i32 %addr
+}
+
+define i64 @ptrtoaddr_of_gep_of_inttoptr(i64 %int, i64 %offset) {
+; CHECK-LABEL: define i64 @ptrtoaddr_of_gep_of_inttoptr(
+; CHECK-SAME: i64 [[INT:%.*]], i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[ADDR:%.*]] = add i64 [[INT]], [[OFFSET]]
+; CHECK-NEXT: ret i64 [[ADDR]]
+;
+ %ptr = inttoptr i64 %int to ptr
+ %gep = getelementptr i8, ptr %ptr, i64 %offset
+ %addr = ptrtoaddr ptr %gep to i64
+ ret i64 %addr
+}
+
+; FIXME: This could be supported by truncating %int before performing the
+; arithmetic.
+define i32 @ptrtoaddr_of_gep_of_inttoptr_addrsize(i64 %int, i32 %offset) {
+; CHECK-LABEL: define i32 @ptrtoaddr_of_gep_of_inttoptr_addrsize(
+; CHECK-SAME: i64 [[INT:%.*]], i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: [[PTR:%.*]] = inttoptr i64 [[INT]] to ptr addrspace(1)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i32 [[OFFSET]]
+; CHECK-NEXT: [[ADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[GEP]] to i32
+; CHECK-NEXT: ret i32 [[ADDR]]
+;
+ %ptr = inttoptr i64 %int to ptr addrspace(1)
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ %addr = ptrtoaddr ptr addrspace(1) %gep to i32
+ ret i32 %addr
+}
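+
+; One possible folded form for the FIXME above (an illustrative sketch, not
+; current output): truncate the integer to the 32-bit address width first:
+;   %int.trunc = trunc i64 %int to i32
+;   %addr.folded = add i32 %int.trunc, %offset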
+
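+; The address of a gep from null is just the offset, as the checks below show.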
+define i64 @ptrtoaddr_of_gep_of_null(i64 %offset) {
+; CHECK-LABEL: define i64 @ptrtoaddr_of_gep_of_null(
+; CHECK-SAME: i64 [[OFFSET:%.*]]) {
+; CHECK-NEXT: ret i64 [[OFFSET]]
+;
+ %gep = getelementptr i8, ptr null, i64 %offset
+ %addr = ptrtoaddr ptr %gep to i64
+ ret i64 %addr
+}
+
+define i32 @ptrtoaddr_of_gep_of_null_addrsize(i32 %offset) {
+; CHECK-LABEL: define i32 @ptrtoaddr_of_gep_of_null_addrsize(
+; CHECK-SAME: i32 [[OFFSET:%.*]]) {
+; CHECK-NEXT: ret i32 [[OFFSET]]
+;
+ %gep = getelementptr i8, ptr addrspace(1) null, i32 %offset
+ %addr = ptrtoaddr ptr addrspace(1) %gep to i32
+ ret i32 %addr
+}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
index 9f9e3f9..77a7f0d 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
@@ -1,26 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
-; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -S | FileCheck %s
-
-declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a)
-declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a)
-declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a)
-
+; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s
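+
+; Note: as the checks below show, reductions of scalable-vector constants
+; (zeroinitializer, splats, undef, poison) are currently left unfolded, so
+; those tests expect the llvm.vector.reduce call to remain.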
define i32 @add_0() {
; CHECK-LABEL: @add_0(
@@ -30,6 +10,15 @@ define i32 @add_0() {
ret i32 %x
}
+define i32 @add_0_scalable_vector() {
+; CHECK-LABEL: @add_0_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+ ret i32 %x
+}
+
define i32 @add_1() {
; CHECK-LABEL: @add_1(
; CHECK-NEXT: ret i32 8
@@ -38,6 +27,15 @@ define i32 @add_1() {
ret i32 %x
}
+define i32 @add_1_scalable_vector() {
+; CHECK-LABEL: @add_1_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+ ret i32 %x
+}
+
define i32 @add_inc() {
; CHECK-LABEL: @add_inc(
; CHECK-NEXT: ret i32 18
@@ -63,8 +61,17 @@ define i32 @add_undef() {
ret i32 %x
}
-define i32 @add_undef1() {
-; CHECK-LABEL: @add_undef1(
+define i32 @add_undef_scalable_vector() {
+; CHECK-LABEL: @add_undef_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef)
+ ret i32 %x
+}
+
+define i32 @add_undef_elt() {
+; CHECK-LABEL: @add_undef_elt(
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
; CHECK-NEXT: ret i32 [[X]]
;
@@ -80,8 +87,17 @@ define i32 @add_poison() {
ret i32 %x
}
-define i32 @add_poison1() {
-; CHECK-LABEL: @add_poison1(
+define i32 @add_poison_scalable_vector() {
+; CHECK-LABEL: @add_poison_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> poison)
+ ret i32 %x
+}
+
+define i32 @add_poison_elt() {
+; CHECK-LABEL: @add_poison_elt(
; CHECK-NEXT: ret i32 poison
;
%x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 poison, i32 1, i32 1, i32 42, i32 1, i32 1>)
@@ -105,6 +121,15 @@ define i32 @mul_0() {
ret i32 %x
}
+define i32 @mul_0_scalable_vector() {
+; CHECK-LABEL: @mul_0_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+ ret i32 %x
+}
+
define i32 @mul_1() {
; CHECK-LABEL: @mul_1(
; CHECK-NEXT: ret i32 1
@@ -113,6 +138,15 @@ define i32 @mul_1() {
ret i32 %x
}
+define i32 @mul_1_scalable_vector() {
+; CHECK-LABEL: @mul_1_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+ ret i32 %x
+}
+
define i32 @mul_inc() {
; CHECK-LABEL: @mul_inc(
; CHECK-NEXT: ret i32 40320
@@ -138,8 +172,17 @@ define i32 @mul_undef() {
ret i32 %x
}
-define i32 @mul_undef1() {
-; CHECK-LABEL: @mul_undef1(
+define i32 @mul_undef_scalable_vector() {
+; CHECK-LABEL: @mul_undef_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> undef)
+ ret i32 %x
+}
+
+define i32 @mul_undef_elt() {
+; CHECK-LABEL: @mul_undef_elt(
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
; CHECK-NEXT: ret i32 [[X]]
;
@@ -155,8 +198,17 @@ define i32 @mul_poison() {
ret i32 %x
}
-define i32 @mul_poison1() {
-; CHECK-LABEL: @mul_poison1(
+define i32 @mul_poison_scalable_vector() {
+; CHECK-LABEL: @mul_poison_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> poison)
+ ret i32 %x
+}
+
+define i32 @mul_poison_elt() {
+; CHECK-LABEL: @mul_poison_elt(
; CHECK-NEXT: ret i32 poison
;
%x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 0, i32 1, i32 poison, i32 1, i32 1, i32 1, i32 1, i32 1>)
@@ -171,6 +223,15 @@ define i32 @and_0() {
ret i32 %x
}
+define i32 @and_0_scalable_vector() {
+; CHECK-LABEL: @and_0_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+ ret i32 %x
+}
+
define i32 @and_1() {
; CHECK-LABEL: @and_1(
; CHECK-NEXT: ret i32 1
@@ -179,6 +240,15 @@ define i32 @and_1() {
ret i32 %x
}
+define i32 @and_1_scalable_vector() {
+; CHECK-LABEL: @and_1_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+ ret i32 %x
+}
+
define i32 @and_inc() {
; CHECK-LABEL: @and_inc(
; CHECK-NEXT: ret i32 0
@@ -204,8 +274,17 @@ define i32 @and_undef() {
ret i32 %x
}
-define i32 @and_undef1() {
-; CHECK-LABEL: @and_undef1(
+define i32 @and_undef_scalable_vector() {
+; CHECK-LABEL: @and_undef_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> undef)
+ ret i32 %x
+}
+
+define i32 @and_undef_elt() {
+; CHECK-LABEL: @and_undef_elt(
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
; CHECK-NEXT: ret i32 [[X]]
;
@@ -221,8 +300,17 @@ define i32 @and_poison() {
ret i32 %x
}
-define i32 @and_poison1() {
-; CHECK-LABEL: @and_poison1(
+define i32 @and_poison_scalable_vector() {
+; CHECK-LABEL: @and_poison_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> poison)
+ ret i32 %x
+}
+
+define i32 @and_poison_elt() {
+; CHECK-LABEL: @and_poison_elt(
; CHECK-NEXT: ret i32 poison
;
%x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> <i32 -1, i32 1, i32 poison, i32 1, i32 1, i32 1, i32 1, i32 1>)
@@ -237,6 +325,15 @@ define i32 @or_0() {
ret i32 %x
}
+define i32 @or_0_scalable_vector() {
+; CHECK-LABEL: @or_0_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+ ret i32 %x
+}
+
define i32 @or_1() {
; CHECK-LABEL: @or_1(
; CHECK-NEXT: ret i32 1
@@ -245,6 +342,15 @@ define i32 @or_1() {
ret i32 %x
}
+define i32 @or_1_scalable_vector() {
+; CHECK-LABEL: @or_1_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+ ret i32 %x
+}
+
define i32 @or_inc() {
; CHECK-LABEL: @or_inc(
; CHECK-NEXT: ret i32 -1
@@ -270,8 +376,17 @@ define i32 @or_undef() {
ret i32 %x
}
-define i32 @or_undef1() {
-; CHECK-LABEL: @or_undef1(
+define i32 @or_undef_scalable_vector() {
+; CHECK-LABEL: @or_undef_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> undef)
+ ret i32 %x
+}
+
+define i32 @or_undef_elt() {
+; CHECK-LABEL: @or_undef_elt(
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
; CHECK-NEXT: ret i32 [[X]]
;
@@ -287,8 +402,17 @@ define i32 @or_poison() {
ret i32 %x
}
-define i32 @or_poison1() {
-; CHECK-LABEL: @or_poison1(
+define i32 @or_poison_scalable_vector() {
+; CHECK-LABEL: @or_poison_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> poison)
+ ret i32 %x
+}
+
+define i32 @or_poison_elt() {
+; CHECK-LABEL: @or_poison_elt(
; CHECK-NEXT: ret i32 poison
;
%x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 0, i32 poison, i32 1, i32 1, i32 1, i32 1, i32 1>)
@@ -303,6 +427,15 @@ define i32 @xor_0() {
ret i32 %x
}
+define i32 @xor_0_scalable_vector() {
+; CHECK-LABEL: @xor_0_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+ ret i32 %x
+}
+
define i32 @xor_1() {
; CHECK-LABEL: @xor_1(
; CHECK-NEXT: ret i32 0
@@ -311,6 +444,15 @@ define i32 @xor_1() {
ret i32 %x
}
+define i32 @xor_1_scalable_vector() {
+; CHECK-LABEL: @xor_1_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+ ret i32 %x
+}
+
define i32 @xor_inc() {
; CHECK-LABEL: @xor_inc(
; CHECK-NEXT: ret i32 10
@@ -336,8 +478,17 @@ define i32 @xor_undef() {
ret i32 %x
}
-define i32 @xor_undef1() {
-; CHECK-LABEL: @xor_undef1(
+define i32 @xor_undef_scalable_vector() {
+; CHECK-LABEL: @xor_undef_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> undef)
+ ret i32 %x
+}
+
+define i32 @xor_undef_elt() {
+; CHECK-LABEL: @xor_undef_elt(
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
; CHECK-NEXT: ret i32 [[X]]
;
@@ -353,8 +504,17 @@ define i32 @xor_poison() {
ret i32 %x
}
-define i32 @xor_poison1() {
-; CHECK-LABEL: @xor_poison1(
+define i32 @xor_poison_scalable_vector() {
+; CHECK-LABEL: @xor_poison_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> poison)
+ ret i32 %x
+}
+
+define i32 @xor_poison_elt() {
+; CHECK-LABEL: @xor_poison_elt(
; CHECK-NEXT: ret i32 poison
;
%x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> <i32 poison, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
@@ -369,6 +529,15 @@ define i32 @smin_0() {
ret i32 %x
}
+define i32 @smin_0_scalable_vector() {
+; CHECK-LABEL: @smin_0_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+ ret i32 %x
+}
+
define i32 @smin_1() {
; CHECK-LABEL: @smin_1(
; CHECK-NEXT: ret i32 1
@@ -377,6 +546,15 @@ define i32 @smin_1() {
ret i32 %x
}
+define i32 @smin_1_scalable_vector() {
+; CHECK-LABEL: @smin_1_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+ ret i32 %x
+}
+
define i32 @smin_inc() {
; CHECK-LABEL: @smin_inc(
; CHECK-NEXT: ret i32 -6
@@ -402,8 +580,17 @@ define i32 @smin_undef() {
ret i32 %x
}
-define i32 @smin_undef1() {
-; CHECK-LABEL: @smin_undef1(
+define i32 @smin_undef_scalable_vector() {
+; CHECK-LABEL: @smin_undef_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> undef)
+ ret i32 %x
+}
+
+define i32 @smin_undef_elt() {
+; CHECK-LABEL: @smin_undef_elt(
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
; CHECK-NEXT: ret i32 [[X]]
;
@@ -419,8 +606,17 @@ define i32 @smin_poison() {
ret i32 %x
}
-define i32 @smin_poison1() {
-; CHECK-LABEL: @smin_poison1(
+define i32 @smin_poison_scalable_vector() {
+; CHECK-LABEL: @smin_poison_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> poison)
+ ret i32 %x
+}
+
+define i32 @smin_poison_elt() {
+; CHECK-LABEL: @smin_poison_elt(
; CHECK-NEXT: ret i32 poison
;
%x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 poison, i32 1, i32 1, i32 1>)
@@ -435,6 +631,15 @@ define i32 @smax_0() {
ret i32 %x
}
+define i32 @smax_0_scalable_vector() {
+; CHECK-LABEL: @smax_0_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+ ret i32 %x
+}
+
define i32 @smax_1() {
; CHECK-LABEL: @smax_1(
; CHECK-NEXT: ret i32 1
@@ -443,6 +648,15 @@ define i32 @smax_1() {
ret i32 %x
}
+define i32 @smax_1_scalable_vector() {
+; CHECK-LABEL: @smax_1_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+ ret i32 %x
+}
+
define i32 @smax_inc() {
; CHECK-LABEL: @smax_inc(
; CHECK-NEXT: ret i32 8
@@ -468,8 +682,17 @@ define i32 @smax_undef() {
ret i32 %x
}
-define i32 @smax_undef1() {
-; CHECK-LABEL: @smax_undef1(
+define i32 @smax_undef_scalable_vector() {
+; CHECK-LABEL: @smax_undef_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef)
+ ret i32 %x
+}
+
+define i32 @smax_undef_elt() {
+; CHECK-LABEL: @smax_undef_elt(
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
; CHECK-NEXT: ret i32 [[X]]
;
@@ -485,8 +708,17 @@ define i32 @smax_poison() {
ret i32 %x
}
-define i32 @smax_poison1() {
-; CHECK-LABEL: @smax_poison1(
+define i32 @smax_poison_scalable_vector() {
+; CHECK-LABEL: @smax_poison_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> poison)
+ ret i32 %x
+}
+
+define i32 @smax_poison_elt() {
+; CHECK-LABEL: @smax_poison_elt(
; CHECK-NEXT: ret i32 poison
;
%x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 0, i32 1, i32 1, i32 1, i32 1, i32 poison>)
@@ -501,6 +733,15 @@ define i32 @umin_0() {
ret i32 %x
}
+define i32 @umin_0_scalable_vector() {
+; CHECK-LABEL: @umin_0_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+ ret i32 %x
+}
+
define i32 @umin_1() {
; CHECK-LABEL: @umin_1(
; CHECK-NEXT: ret i32 1
@@ -509,6 +750,15 @@ define i32 @umin_1() {
ret i32 %x
}
+define i32 @umin_1_scalable_vector() {
+; CHECK-LABEL: @umin_1_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+ ret i32 %x
+}
+
define i32 @umin_inc() {
; CHECK-LABEL: @umin_inc(
; CHECK-NEXT: ret i32 1
@@ -534,8 +784,17 @@ define i32 @umin_undef() {
ret i32 %x
}
-define i32 @umin_undef1() {
-; CHECK-LABEL: @umin_undef1(
+define i32 @umin_undef_scalable_vector() {
+; CHECK-LABEL: @umin_undef_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> undef)
+ ret i32 %x
+}
+
+define i32 @umin_undef_elt() {
+; CHECK-LABEL: @umin_undef_elt(
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
; CHECK-NEXT: ret i32 [[X]]
;
@@ -551,8 +810,17 @@ define i32 @umin_poison() {
ret i32 %x
}
-define i32 @umin_poison1() {
-; CHECK-LABEL: @umin_poison1(
+define i32 @umin_poison_scalable_vector() {
+; CHECK-LABEL: @umin_poison_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> poison)
+ ret i32 %x
+}
+
+define i32 @umin_poison_elt() {
+; CHECK-LABEL: @umin_poison_elt(
; CHECK-NEXT: ret i32 poison
;
%x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 -1, i32 poison, i32 1, i32 1, i32 1, i32 1>)
@@ -567,6 +835,15 @@ define i32 @umax_0() {
ret i32 %x
}
+define i32 @umax_0_scalable_vector() {
+; CHECK-LABEL: @umax_0_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> zeroinitializer)
+ ret i32 %x
+}
+
define i32 @umax_1() {
; CHECK-LABEL: @umax_1(
; CHECK-NEXT: ret i32 1
@@ -575,6 +852,15 @@ define i32 @umax_1() {
ret i32 %x
}
+define i32 @umax_1_scalable_vector() {
+; CHECK-LABEL: @umax_1_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> splat (i32 1))
+ ret i32 %x
+}
+
define i32 @umax_inc() {
; CHECK-LABEL: @umax_inc(
; CHECK-NEXT: ret i32 -3
@@ -600,8 +886,17 @@ define i32 @umax_undef() {
ret i32 %x
}
-define i32 @umax_undef1() {
-; CHECK-LABEL: @umax_undef1(
+define i32 @umax_undef_scalable_vector() {
+; CHECK-LABEL: @umax_undef_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> undef)
+ ret i32 %x
+}
+
+define i32 @umax_undef_elt() {
+; CHECK-LABEL: @umax_undef_elt(
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
; CHECK-NEXT: ret i32 [[X]]
;
@@ -617,8 +912,17 @@ define i32 @umax_poison() {
ret i32 %x
}
-define i32 @umax_poison1() {
-; CHECK-LABEL: @umax_poison1(
+define i32 @umax_poison_scalable_vector() {
+; CHECK-LABEL: @umax_poison_scalable_vector(
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> poison)
+ ret i32 %x
+}
+
+define i32 @umax_poison_elt() {
+; CHECK-LABEL: @umax_poison_elt(
; CHECK-NEXT: ret i32 poison
;
%x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 poison, i32 1, i32 1, i32 poison, i32 1, i32 1>)
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index 964a257..fafa82c 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -2800,6 +2800,88 @@ exit:
ret i64 %r.0.lcssa
}
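+
+; Extended multiply-accumulate reduction where one multiply operand (%c) is a
+; loop-invariant live-in value rather than a loaded one.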
+define i32 @reduction_expression_ext_mulacc_livein(ptr %a, i16 %c) {
+; CHECK-LABEL: define i32 @reduction_expression_ext_mulacc_livein(
+; CHECK-SAME: ptr [[A:%.*]], i16 [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: [[TMP5]] = add i32 [[VEC_PHI]], [[TMP4]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[FOR_EXIT:.*]]
+; CHECK: [[FOR_EXIT]]:
+; CHECK-NEXT: ret i32 [[TMP5]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @reduction_expression_ext_mulacc_livein(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[C:%.*]]) {
+; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]]
+; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK-INTERLEAVED: [[VECTOR_PH]]:
+; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-INTERLEAVED: [[VECTOR_BODY]]:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i16>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add i32 [[VEC_PHI]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
+; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP11]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: br label %[[FOR_EXIT:.*]]
+; CHECK-INTERLEAVED: [[FOR_EXIT]]:
+; CHECK-INTERLEAVED-NEXT: ret i32 [[BIN_RDX]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i16
+ %mul = mul i16 %c, %ext.a
+ %mul.ext = zext i16 %mul to i32
+ %add = add i32 %mul.ext, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
declare float @llvm.fmuladd.f32(float, float, float)
!6 = distinct !{!6, !7, !8}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 06b0448..291ada8 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -800,3 +800,545 @@ exit:
%r.0.lcssa = phi i64 [ %rdx.next, %loop ]
ret i64 %r.0.lcssa
}
+
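+; The constant 63 fits in the narrow source type (i8), so the reduction
+; expression can extend it together with the loaded value (see the EXPRESSION
+; line in the checks below).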
+define i32 @print_mulacc_extended_const(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_mulacc_extended_const'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1>
+; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9>
+; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7>
+; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<%8>
+; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32))
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
+; CHECK-NEXT: IR %l.ext = zext i8 %l to i32
+; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63
+; CHECK-NEXT: IR %red.next = add i32 %red, %mul
+; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in ir<%1> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64
+; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64
+; CHECK-NEXT: IR %0 = add i64 %end1, 1
+; CHECK-NEXT: IR %1 = sub i64 %0, %start2
+; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4>
+; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check>
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4>
+; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf>
+; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1>
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep>
+; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<63>
+; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>)
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<scalar.ph>:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
+; CHECK-NEXT: IR %l.ext = zext i8 %l to i32
+; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63
+; CHECK-NEXT: IR %red.next = add i32 %red, %mul
+; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+ br label %loop
+
+loop:
+ %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+ %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+ %l = load i8, ptr %ptr.iv, align 1
+ %l.ext = zext i8 %l to i32
+ %mul = mul i32 %l.ext, 63
+ %red.next = add i32 %red, %mul
+ %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+ %ec = icmp eq ptr %ptr.iv, %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i32 %red.next
+}
+
+; Constants >= 128 cannot be treated as sign-extended from i8, so the expression must not fold an extend of 128.
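+; (An i8 sign-extended to i32 covers only [-128, 127], which excludes 128.)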
+define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_mulacc_not_extended_const'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1>
+; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9>
+; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7>
+; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<%8>
+; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
+; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul ir<%l.ext>, ir<128>)
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
+; CHECK-NEXT: IR %l.ext = sext i8 %l to i32
+; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128
+; CHECK-NEXT: IR %red.next = add i32 %red, %mul
+; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in ir<%1> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64
+; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64
+; CHECK-NEXT: IR %0 = add i64 %end1, 1
+; CHECK-NEXT: IR %1 = sub i64 %0, %start2
+; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4>
+; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check>
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4>
+; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf>
+; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1>
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep>
+; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128>
+; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>)
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<scalar.ph>:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
+; CHECK-NEXT: IR %l.ext = sext i8 %l to i32
+; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128
+; CHECK-NEXT: IR %red.next = add i32 %red, %mul
+; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+ br label %loop
+
+loop:
+ %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+ %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+ %l = load i8, ptr %ptr.iv, align 1
+ %l.ext = sext i8 %l to i32
+ %mul = mul i32 %l.ext, 128
+ %red.next = add i32 %red, %mul
+ %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+ %ec = icmp eq ptr %ptr.iv, %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ %red.next.lcssa = phi i32 [ %red.next, %loop ]
+ ret i32 %red.next.lcssa
+}
+
+define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_ext_mulacc_extended_const'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1>
+; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9>
+; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7>
+; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<%8>
+; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64))
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
+; CHECK-NEXT: IR %l.ext = zext i8 %l to i32
+; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63
+; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64
+; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext
+; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in ir<%1> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64
+; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64
+; CHECK-NEXT: IR %0 = add i64 %end1, 1
+; CHECK-NEXT: IR %1 = sub i64 %0, %start2
+; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4>
+; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check>
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4>
+; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf>
+; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1>
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep>
+; CHECK-NEXT: WIDEN-CAST vp<%4> = zext ir<%l> to i64
+; CHECK-NEXT: WIDEN ir<%mul> = mul vp<%4>, ir<63>
+; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>)
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%6> = compute-reduction-result ir<%red>, ir<%red.next>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%6> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<scalar.ph>:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%6>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
+; CHECK-NEXT: IR %l.ext = zext i8 %l to i32
+; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63
+; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64
+; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext
+; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+ br label %loop
+
+loop:
+ %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+ %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+ %l = load i8, ptr %ptr.iv, align 1
+ %l.ext = zext i8 %l to i32
+ %mul = mul i32 %l.ext, 63
+ %mul.ext = zext i32 %mul to i64
+ %red.next = add i64 %red, %mul.ext
+ %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+ %ec = icmp eq ptr %ptr.iv, %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i64 %red.next
+}
+
+; Constants >= 128 cannot be treated as sign-extended from i8, so the expression must not fold an extend of 128.
+define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_ext_mulacc_not_extended_const'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1>
+; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9>
+; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7>
+; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<%8>
+; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128>
+; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (ir<%mul> sext to i64)
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
+; CHECK-NEXT: IR %l.ext = sext i8 %l to i32
+; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128
+; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64
+; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext
+; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in ir<%1> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64
+; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64
+; CHECK-NEXT: IR %0 = add i64 %end1, 1
+; CHECK-NEXT: IR %1 = sub i64 %0, %start2
+; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4>
+; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check>
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4>
+; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf>
+; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1>
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep>
+; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128>
+; CHECK-NEXT: WIDEN-CAST ir<%mul.ext> = sext ir<%mul> to i64
+; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul.ext>)
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<scalar.ph>:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
+; CHECK-NEXT: IR %l.ext = sext i8 %l to i32
+; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128
+; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64
+; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext
+; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+ br label %loop
+
+loop:
+ %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+ %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+ %l = load i8, ptr %ptr.iv, align 1
+ %l.ext = sext i8 %l to i32
+ %mul = mul i32 %l.ext, 128
+ %mul.ext = sext i32 %mul to i64
+ %red.next = add i64 %red, %mul.ext
+ %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+ %ec = icmp eq ptr %ptr.iv, %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ %red.next.lcssa = phi i64 [ %red.next, %loop ]
+ ret i64 %red.next.lcssa
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll
new file mode 100644
index 0000000..a804225
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -S | FileCheck %s
+
+; Test whether UTC formats the switch cases correctly; they require TWO extra spaces of indentation.
+
+define i8 @testi8(i8 %x) {
+ switch i8 %x, label %default [
+ i8 0, label %case1
+ i8 1, label %case2
+ i8 2, label %case3
+ i8 3, label %case3
+ ]
+default:
+ ret i8 0
+case1:
+ ret i8 1
+case2:
+ ret i8 2
+case3:
+ ret i8 3
+}
+
+define i32 @testi32(i32 %x) {
+ switch i32 %x, label %default [
+ i32 0, label %case1
+ i32 1, label %case2
+ i32 2, label %case3
+ i32 3, label %case3
+ ]
+default:
+ ret i32 0
+case1:
+ ret i32 1
+case2:
+ ret i32 2
+case3:
+ ret i32 3
+}
+
+define i128 @testi128(i128 %x) {
+ switch i128 %x, label %default [
+ i128 0, label %case1
+ i128 1, label %case2
+ i128 2, label %case3
+ i128 3, label %case3
+ ]
+default:
+ ret i128 0
+case1:
+ ret i128 1
+case2:
+ ret i128 2
+case3:
+ ret i128 3
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected
new file mode 100644
index 0000000..b1977e7
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 7
+; RUN: opt < %s -S | FileCheck %s
+
+; Test whether UTC formats the switch cases correctly; they require TWO extra spaces of indentation.
+
+define i8 @testi8(i8 %x) {
+; CHECK-LABEL: define i8 @testi8(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: switch i8 [[X]], label %[[DEFAULT:.*]] [
+; CHECK-NEXT: i8 0, label %[[CASE1:.*]]
+; CHECK-NEXT: i8 1, label %[[CASE2:.*]]
+; CHECK-NEXT: i8 2, label %[[CASE3:.*]]
+; CHECK-NEXT: i8 3, label %[[CASE3]]
+; CHECK-NEXT: ]
+; CHECK: [[DEFAULT]]:
+; CHECK-NEXT: ret i8 0
+; CHECK: [[CASE1]]:
+; CHECK-NEXT: ret i8 1
+; CHECK: [[CASE2]]:
+; CHECK-NEXT: ret i8 2
+; CHECK: [[CASE3]]:
+; CHECK-NEXT: ret i8 3
+;
+ switch i8 %x, label %default [
+ i8 0, label %case1
+ i8 1, label %case2
+ i8 2, label %case3
+ i8 3, label %case3
+ ]
+default:
+ ret i8 0
+case1:
+ ret i8 1
+case2:
+ ret i8 2
+case3:
+ ret i8 3
+}
+
+define i32 @testi32(i32 %x) {
+; CHECK-LABEL: define i32 @testi32(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: switch i32 [[X]], label %[[DEFAULT:.*]] [
+; CHECK-NEXT: i32 0, label %[[CASE1:.*]]
+; CHECK-NEXT: i32 1, label %[[CASE2:.*]]
+; CHECK-NEXT: i32 2, label %[[CASE3:.*]]
+; CHECK-NEXT: i32 3, label %[[CASE3]]
+; CHECK-NEXT: ]
+; CHECK: [[DEFAULT]]:
+; CHECK-NEXT: ret i32 0
+; CHECK: [[CASE1]]:
+; CHECK-NEXT: ret i32 1
+; CHECK: [[CASE2]]:
+; CHECK-NEXT: ret i32 2
+; CHECK: [[CASE3]]:
+; CHECK-NEXT: ret i32 3
+;
+ switch i32 %x, label %default [
+ i32 0, label %case1
+ i32 1, label %case2
+ i32 2, label %case3
+ i32 3, label %case3
+ ]
+default:
+ ret i32 0
+case1:
+ ret i32 1
+case2:
+ ret i32 2
+case3:
+ ret i32 3
+}
+
+define i128 @testi128(i128 %x) {
+; CHECK-LABEL: define i128 @testi128(
+; CHECK-SAME: i128 [[X:%.*]]) {
+; CHECK-NEXT: switch i128 [[X]], label %[[DEFAULT:.*]] [
+; CHECK-NEXT: i128 0, label %[[CASE1:.*]]
+; CHECK-NEXT: i128 1, label %[[CASE2:.*]]
+; CHECK-NEXT: i128 2, label %[[CASE3:.*]]
+; CHECK-NEXT: i128 3, label %[[CASE3]]
+; CHECK-NEXT: ]
+; CHECK: [[DEFAULT]]:
+; CHECK-NEXT: ret i128 0
+; CHECK: [[CASE1]]:
+; CHECK-NEXT: ret i128 1
+; CHECK: [[CASE2]]:
+; CHECK-NEXT: ret i128 2
+; CHECK: [[CASE3]]:
+; CHECK-NEXT: ret i128 3
+;
+ switch i128 %x, label %default [
+ i128 0, label %case1
+ i128 1, label %case2
+ i128 2, label %case3
+ i128 3, label %case3
+ ]
+default:
+ ret i128 0
+case1:
+ ret i128 1
+case2:
+ ret i128 2
+case3:
+ ret i128 3
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test
new file mode 100644
index 0000000..891dbe0
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test
@@ -0,0 +1,3 @@
+## switch_case test checking that update_test_checks.py indents switch cases correctly
+# RUN: cp -f %S/Inputs/switch_case.ll %t.ll && %update_test_checks %t.ll --version 7
+# RUN: diff -u %t.ll %S/Inputs/switch_case.ll.expected
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s b/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s
index becd9d1..519edf04 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s
@@ -1,6 +1,12 @@
## This test uses TU index for type parsing in dwp and makes sure the DWARF4 type is
## successfully retrieved.
+## cd to a unique dir so we can refer to the file as just "test.dwo" in the
+## assembly test input below.
+# RUN: rm -rf %t
+# RUN: mkdir %t
+# RUN: cd %t
+
# RUN: llvm-mc %s --split-dwarf-file=test.dwo -filetype obj -triple x86_64 -o test.o
# RUN: llvm-dwp -e test.o -o test.dwp
# RUN: llvm-dwarfdump test.dwp | FileCheck %s
diff --git a/llvm/unittests/IR/AbstractCallSiteTest.cpp b/llvm/unittests/IR/AbstractCallSiteTest.cpp
index ddb1091..623d1b3 100644
--- a/llvm/unittests/IR/AbstractCallSiteTest.cpp
+++ b/llvm/unittests/IR/AbstractCallSiteTest.cpp
@@ -6,8 +6,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/AbstractCallSite.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
@@ -51,5 +52,96 @@ TEST(AbstractCallSite, CallbackCall) {
EXPECT_TRUE(ACS);
EXPECT_TRUE(ACS.isCallbackCall());
EXPECT_TRUE(ACS.isCallee(CallbackUse));
+ EXPECT_EQ(ACS.getCalleeUseForCallback(), *CallbackUse);
EXPECT_EQ(ACS.getCalledFunction(), Callback);
+
+ // The callback metadata encodes {CallbackNo, Arg0No, ..., isVarArg} = {1, -1, true}.
+ EXPECT_EQ(ACS.getCallArgOperandNoForCallee(), 1);
+ // Although the callback metadata specifies only ONE non-fixed argument
+ // number, the callback callee is vararg, so the third call argument is
+ // also counted as an argument of the callback.
+ EXPECT_EQ(ACS.getNumArgOperands(), 2u);
+ Argument *Param0 = Callback->getArg(0), *Param1 = Callback->getArg(1);
+ ASSERT_TRUE(Param0 && Param1);
+ EXPECT_EQ(ACS.getCallArgOperandNo(*Param0), -1);
+ EXPECT_EQ(ACS.getCallArgOperandNo(*Param1), 2);
+}
+
+TEST(AbstractCallSite, DirectCall) {
+ LLVMContext C;
+
+ const char *IR = "declare void @bar(i32 %x, i32 %y)\n"
+ "define void @foo() {\n"
+ " call void @bar(i32 1, i32 2)\n"
+ " ret void\n"
+ "}\n";
+
+ std::unique_ptr<Module> M = parseIR(C, IR);
+ ASSERT_TRUE(M);
+
+ Function *Callee = M->getFunction("bar");
+ ASSERT_NE(Callee, nullptr);
+
+ const Use *DirectCallUse = Callee->getSingleUndroppableUse();
+ ASSERT_NE(DirectCallUse, nullptr);
+
+ AbstractCallSite ACS(DirectCallUse);
+ EXPECT_TRUE(ACS);
+ EXPECT_TRUE(ACS.isDirectCall());
+ EXPECT_TRUE(ACS.isCallee(DirectCallUse));
+ EXPECT_EQ(ACS.getCalledFunction(), Callee);
+ EXPECT_EQ(ACS.getNumArgOperands(), 2u);
+ Argument *ArgX = Callee->getArg(0);
+ ASSERT_NE(ArgX, nullptr);
+ Value *CAO1 = ACS.getCallArgOperand(*ArgX);
+ Value *CAO2 = ACS.getCallArgOperand(0);
+ ASSERT_NE(CAO2, nullptr);
+ // The two call arg operands should be the same object, since they are both
+ // the first argument of the call.
+ EXPECT_EQ(CAO2, CAO1);
+
+ ConstantInt *FirstArgInt = dyn_cast<ConstantInt>(CAO2);
+ ASSERT_NE(FirstArgInt, nullptr);
+ EXPECT_EQ(FirstArgInt->getZExtValue(), 1ull);
+
+ EXPECT_EQ(ACS.getCallArgOperandNo(*ArgX), 0);
+ EXPECT_EQ(ACS.getCallArgOperandNo(0), 0);
+ EXPECT_EQ(ACS.getCallArgOperandNo(1), 1);
+}
+
+TEST(AbstractCallSite, IndirectCall) {
+ LLVMContext C;
+
+ const char *IR = "define void @foo(ptr %0) {\n"
+ " call void %0(i32 1, i32 2)\n"
+ " ret void\n"
+ "}\n";
+
+ std::unique_ptr<Module> M = parseIR(C, IR);
+ ASSERT_TRUE(M);
+
+ Function *Fun = M->getFunction("foo");
+ ASSERT_NE(Fun, nullptr);
+
+ Argument *ArgAsCallee = Fun->getArg(0);
+ ASSERT_NE(ArgAsCallee, nullptr);
+
+ const Use *IndCallUse = ArgAsCallee->getSingleUndroppableUse();
+ ASSERT_NE(IndCallUse, nullptr);
+
+ AbstractCallSite ACS(IndCallUse);
+ EXPECT_TRUE(ACS);
+ EXPECT_TRUE(ACS.isIndirectCall());
+ EXPECT_TRUE(ACS.isCallee(IndCallUse));
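+ // An indirect call has no statically known callee Function, so
+ // getCalledFunction() is null while getCalledOperand() is the callee value.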
+ EXPECT_EQ(ACS.getCalledFunction(), nullptr);
+ EXPECT_EQ(ACS.getCalledOperand(), ArgAsCallee);
+ EXPECT_EQ(ACS.getNumArgOperands(), 2u);
+ Value *FirstCallArg = ACS.getCallArgOperand(0);
+ ASSERT_NE(FirstCallArg, nullptr);
+ ConstantInt *FirstArgInt = dyn_cast<ConstantInt>(FirstCallArg);
+ ASSERT_NE(FirstArgInt, nullptr);
+ EXPECT_EQ(FirstArgInt->getZExtValue(), 1ull);
+
+ EXPECT_EQ(ACS.getCallArgOperandNo(0), 0);
+ EXPECT_EQ(ACS.getCallArgOperandNo(1), 1);
}
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index a5e3c39..8cd200c9 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -29,6 +29,7 @@ Version changelog:
'none' and 'all'. 'smart' is the default.
5: Basic block labels are matched by FileCheck expressions
6: The semantics of TBAA checks has been incorporated in the check lines.
+7: Indent switch-cases correctly.
"""
DEFAULT_VERSION = 6
@@ -606,6 +607,7 @@ MARCH_ARG_RE = re.compile(r"-march[= ]([^ ]+)")
DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)")
IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_")
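+# Matches a switch-case body line such as "    i32 0, label %case1".
+# (The pattern expects an integer type and a non-negative decimal case value.)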
+IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\w+")
SCRUB_LEADING_WHITESPACE_RE = re.compile(r"^(\s+)")
SCRUB_WHITESPACE_RE = re.compile(r"(?!^(| \w))[ \t]+", flags=re.M)
diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index f883145..9fba96a 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -945,7 +945,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
path = (
cmd_shenv.env["PATH"] if "PATH" in cmd_shenv.env else shenv.env["PATH"]
)
- executable = lit.util.which(args[0], shenv.env["PATH"])
+ executable = lit.util.which(args[0], path)
if not executable:
raise InternalShellError(j, "%r: command not found" % args[0])
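The one-line fix above is what the new shtest-env-path test below exercises: when a
RUN line rewrites PATH via `env`, command lookup must honor the per-command
environment rather than always falling back to the enclosing shell's. A minimal
sketch of the intended lookup, using the standard-library shutil.which in place of
lit.util.which (the helper and parameter names are illustrative, not lit's API):

    import shutil

    def resolve_command(cmd, cmd_env, shell_env):
        # Prefer the PATH set on the command itself (e.g. via `env PATH=...`),
        # falling back to the outer shell's PATH, as the patched code now does.
        path = cmd_env.get("PATH", shell_env.get("PATH"))
        return shutil.which(cmd, path=path)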
diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg
new file mode 100644
index 0000000..36517f9
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg
@@ -0,0 +1,8 @@
+import lit.formats
+
+config.name = "shtest-env-path"
+config.suffixes = [".txt"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
+config.substitutions.append(("%{python}", '"%s"' % (sys.executable)))
diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt b/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt
new file mode 100644
index 0000000..b36e861
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt
@@ -0,0 +1,8 @@
+## Tests env command for setting the PATH variable.
+
+## Check that test.sh can be found using the configured PATH.
+#
+# RUN: env PATH=%S test.sh | FileCheck --check-prefix=CHECK %s
+#
+
+# CHECK: TEST-ENV-PATH-123
diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh b/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh
new file mode 100755
index 0000000..a1e46fc
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+echo "TEST-ENV-PATH-123"
+
diff --git a/llvm/utils/lit/tests/shtest-env-path.py b/llvm/utils/lit/tests/shtest-env-path.py
new file mode 100644
index 0000000..bf459ae
--- /dev/null
+++ b/llvm/utils/lit/tests/shtest-env-path.py
@@ -0,0 +1,13 @@
+## Tests env command for setting the PATH variable.
+
+# The test uses /bin/sh. Limit to systems known to have /bin/sh.
+# REQUIRES: system-linux
+
+# RUN: %{lit} -a -v %{inputs}/shtest-env-path/path.txt \
+# RUN: | FileCheck -match-full-lines %s
+#
+# END.
+
+# CHECK: -- Testing: 1 tests{{.*}}
+# CHECK: PASS: shtest-env-path :: path.txt (1 of 1)
+# CHECK: --
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index 3d07b16..aef7c09 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -550,6 +550,7 @@ tools/UpdateTestChecks/update_test_checks/stable_ir_values5.test
tools/UpdateTestChecks/update_test_checks/stable_ir_values6.test
tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test
tools/UpdateTestChecks/update_test_checks/stable_ir_values.test
+tools/UpdateTestChecks/update_test_checks/switch_case.test
tools/UpdateTestChecks/update_test_checks/tbaa-semantics-checks.test
tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test
Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
diff --git a/llvm/utils/update_test_checks.py b/llvm/utils/update_test_checks.py
index 3b562fb..42227b2 100755
--- a/llvm/utils/update_test_checks.py
+++ b/llvm/utils/update_test_checks.py
@@ -260,9 +260,17 @@ def update_test(ti: common.TestInfo):
skip_same_checks=dropped_previous_line,
):
# This input line of the function body will go as-is into the output.
- # Except make leading whitespace uniform: 2 spaces. 4 for debug records.
+ # Except make leading whitespace uniform: 2 spaces; 4 for debug records and switch cases.
indent = (
- " " if not common.IS_DEBUG_RECORD_RE.match(input_line) else " "
+ " " * 4
+ if (
+ common.IS_DEBUG_RECORD_RE.match(input_line)
+ or (
+ ti.args.version > 6
+ and common.IS_SWITCH_CASE_RE.match(input_line)
+ )
+ )
+ else " " * 2
)
input_line = common.SCRUB_LEADING_WHITESPACE_RE.sub(indent, input_line)
output_lines.append(input_line)
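Taken together, IS_SWITCH_CASE_RE and the version gate reduce to a small indent
choice. A condensed sketch of the logic added above (assuming the regexes from
common.py are in scope; choose_indent is an illustrative name, not the tool's API):

    def choose_indent(input_line, version):
        # Debug records always get 4 spaces; switch cases only do so from
        # check-line version 7 onward, so older expected files stay stable.
        if common.IS_DEBUG_RECORD_RE.match(input_line) or (
            version > 6 and common.IS_SWITCH_CASE_RE.match(input_line)
        ):
            return " " * 4
        return " " * 2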