115 files changed, 3135 insertions, 2296 deletions
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d2e5bd2..ab536ad 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -250,6 +250,8 @@ Non-comprehensive list of changes in this release - ``__builtin_assume_dereferenceable`` now accepts non-constant size operands. +- Fixed a crash when the second argument to ``__builtin_assume_aligned`` was not constant (#GH161314) + New Compiler Flags ------------------ - New option ``-fno-sanitize-debug-trap-reasons`` added to disable emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``). @@ -452,6 +454,7 @@ Bug Fixes to AST Handling Miscellaneous Bug Fixes ^^^^^^^^^^^^^^^^^^^^^^^ +- Fixed missing diagnostics of ``diagnose_if`` on templates involved in initialization. (#GH160776) Miscellaneous Clang Crashes Fixed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index f319b17..c961222 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -465,11 +465,16 @@ public: return nullptr; if (Value *Result = ConstantEmitter(CGF).tryEmitConstantExpr(E)) { - if (E->isGLValue()) + if (E->isGLValue()) { + // This was already converted to an rvalue when it was constant + // evaluated. + if (E->hasAPValueResult() && !E->getAPValueResult().isLValue()) + return Result; return CGF.EmitLoadOfScalar( Address(Result, CGF.convertTypeForLoadStore(E->getType()), CGF.getContext().getTypeAlignInChars(E->getType())), /*Volatile*/ false, E->getType(), E->getExprLoc()); + } return Result; } return Visit(E->getSubExpr()); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 7ce3513..3cc61b1 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5954,6 +5954,9 @@ bool Sema::BuiltinAssumeAligned(CallExpr *TheCall) { if (Result > Sema::MaximumAlignment) Diag(TheCall->getBeginLoc(), diag::warn_assume_aligned_too_great) << SecondArg->getSourceRange() << Sema::MaximumAlignment; + + TheCall->setArg(1, + ConstantExpr::Create(Context, SecondArg, APValue(Result))); } if (NumArgs > 2) { diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 8413090..11d2d5c 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -264,14 +264,6 @@ class HashParameterMapping : public RecursiveASTVisitor<HashParameterMapping> { UnsignedOrNone OuterPackSubstIndex; - TemplateArgument getPackSubstitutedTemplateArgument(TemplateArgument Arg) { - assert(*SemaRef.ArgPackSubstIndex < Arg.pack_size()); - Arg = Arg.pack_begin()[*SemaRef.ArgPackSubstIndex]; - if (Arg.isPackExpansion()) - Arg = Arg.getPackExpansionPattern(); - return Arg; - } - bool shouldVisitTemplateInstantiations() const { return true; } public: @@ -294,7 +286,7 @@ public: assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); - Arg = getPackSubstitutedTemplateArgument(Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } UsedTemplateArgs.push_back( @@ -312,7 +304,7 @@ public: if (NTTP->isParameterPack() && SemaRef.ArgPackSubstIndex) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); - Arg = getPackSubstitutedTemplateArgument(Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } UsedTemplateArgs.push_back( @@ -325,8 +317,11 @@ public: } bool TraverseDecl(Decl *D) { - if (auto *VD = dyn_cast<ValueDecl>(D)) + if (auto *VD = dyn_cast<ValueDecl>(D)) { + if 
(auto *Var = dyn_cast<VarDecl>(VD)) + TraverseStmt(Var->getInit()); return TraverseType(VD->getType()); + } return inherited::TraverseDecl(D); } @@ -363,6 +358,14 @@ public: return inherited::TraverseTemplateArgument(Arg); } + bool TraverseSizeOfPackExpr(SizeOfPackExpr *SOPE) { + return TraverseDecl(SOPE->getPack()); + } + + bool VisitSubstNonTypeTemplateParmExpr(SubstNonTypeTemplateParmExpr *E) { + return inherited::TraverseStmt(E->getReplacement()); + } + void VisitConstraint(const NormalizedConstraintWithParamMapping &Constraint) { if (!Constraint.hasParameterMapping()) { for (const auto &List : TemplateArgs) @@ -2083,8 +2086,8 @@ bool SubstituteParameterMappings::substitute(ConceptIdConstraint &CC) { /*UpdateArgsWithConversions=*/false)) return true; auto TemplateArgs = *MLTAL; - TemplateArgs.replaceOutermostTemplateArguments( - TemplateArgs.getAssociatedDecl(0).first, CTAI.SugaredConverted); + TemplateArgs.replaceOutermostTemplateArguments(CSE->getNamedConcept(), + CTAI.SugaredConverted); return SubstituteParameterMappings(SemaRef, &TemplateArgs, ArgsAsWritten, InFoldExpr) .substitute(CC.getNormalizedConstraint()); diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 0d0d2c0..922fcac 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -7539,7 +7539,7 @@ PerformConstructorInitialization(Sema &S, // Only check access if all of that succeeded. S.CheckConstructorAccess(Loc, Constructor, Step.Function.FoundDecl, Entity); - if (S.DiagnoseUseOfDecl(Step.Function.FoundDecl, Loc)) + if (S.DiagnoseUseOfOverloadedDecl(Constructor, Loc)) return ExprError(); if (const ArrayType *AT = S.Context.getAsArrayType(Entity.getType())) @@ -8092,7 +8092,7 @@ ExprResult InitializationSequence::Perform(Sema &S, S.CheckConstructorAccess(Kind.getLocation(), Constructor, FoundFn, Entity); - if (S.DiagnoseUseOfDecl(FoundFn, Kind.getLocation())) + if (S.DiagnoseUseOfOverloadedDecl(Constructor, Kind.getLocation())) return ExprError(); CastKind = CK_ConstructorConversion; @@ -8102,7 +8102,7 @@ ExprResult InitializationSequence::Perform(Sema &S, CXXConversionDecl *Conversion = cast<CXXConversionDecl>(Fn); S.CheckMemberOperatorAccess(Kind.getLocation(), CurInit.get(), nullptr, FoundFn); - if (S.DiagnoseUseOfDecl(FoundFn, Kind.getLocation())) + if (S.DiagnoseUseOfOverloadedDecl(Conversion, Kind.getLocation())) return ExprError(); CurInit = S.BuildCXXMemberCallExpr(CurInit.get(), FoundFn, Conversion, diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 6bba505..3baa977 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -6718,6 +6718,10 @@ struct MarkUsedTemplateParameterVisitor : DynamicRecursiveASTVisitor { } return true; } + + bool TraverseSizeOfPackExpr(SizeOfPackExpr *SOPE) override { + return TraverseDecl(SOPE->getPack()); + } }; } diff --git a/clang/test/Parser/cxx0x-lambda-expressions.cpp b/clang/test/Parser/cxx0x-lambda-expressions.cpp index f90f8ce..5b57c7f 100644 --- a/clang/test/Parser/cxx0x-lambda-expressions.cpp +++ b/clang/test/Parser/cxx0x-lambda-expressions.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++03 -Wno-c99-designator %s -Wno-c++11-extensions +// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++03 -Wno-c99-designator %s -Wno-c++11-extensions -Wno-local-type-template-args // RUN: %clang_cc1 -fsyntax-only 
-Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++11 -Wno-c99-designator %s // RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx17ext,cxx20ext,cxx23ext -std=c++14 -Wno-c99-designator %s // RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx20ext,cxx23ext -std=c++17 -Wno-c99-designator %s diff --git a/clang/test/SemaCXX/builtin-assume-aligned.cpp b/clang/test/SemaCXX/builtin-assume-aligned.cpp index 48bd841..30296c7 100644 --- a/clang/test/SemaCXX/builtin-assume-aligned.cpp +++ b/clang/test/SemaCXX/builtin-assume-aligned.cpp @@ -47,3 +47,16 @@ constexpr void *s1 = __builtin_assume_aligned(x, 32); constexpr void *s2 = __builtin_assume_aligned(x, 32, 5); constexpr void *s3 = __builtin_assume_aligned(x, 32, -1); + +constexpr int add(int a, int b) { + return a+b; +} +constexpr void *c1 = __builtin_assume_aligned(p, add(1,1)); +constexpr void *c2 = __builtin_assume_aligned(p, add(2,1)); // expected-error {{not a power of 2}} + +constexpr long kAlignment = 128; +long AllocateAlignedBytes_payload; +void AllocateAlignedBytes() { + void *m = __builtin_assume_aligned( + reinterpret_cast<void *>(AllocateAlignedBytes_payload), kAlignment); +} diff --git a/clang/test/SemaCXX/diagnose_if.cpp b/clang/test/SemaCXX/diagnose_if.cpp index 1b9e660..0af8bb7 100644 --- a/clang/test/SemaCXX/diagnose_if.cpp +++ b/clang/test/SemaCXX/diagnose_if.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 %s -verify -fno-builtin -std=c++14 +// RUN: %clang_cc1 %s -verify -fno-builtin -std=c++20 -verify=expected,cxx20 // RUN: %clang_cc1 %s -verify -fno-builtin -std=c++14 -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 %s -verify -fno-builtin -std=c++20 -verify=expected,cxx20 -fexperimental-new-constant-interpreter #define _diagnose_if(...) 
__attribute__((diagnose_if(__VA_ARGS__))) @@ -665,3 +667,28 @@ void run() { switch (constexpr Foo i = 2) { default: break; } // expected-error{{oh no}} } } + +namespace GH160776 { + +struct ConstructorTemplate { + template <class T> + explicit ConstructorTemplate(T x) + _diagnose_if(sizeof(T) == sizeof(char), "oh no", "error") {} // expected-note {{diagnose_if}} + + template <class T> +#if __cplusplus >= 202002L + requires (sizeof(T) == 1) // cxx20-note {{evaluated to false}} +#endif + operator T() _diagnose_if(sizeof(T) == sizeof(char), "oh no", "error") { // expected-note {{diagnose_if}} \ + // cxx20-note {{constraints not satisfied}} + return T{}; + } +}; + +void run() { + ConstructorTemplate x('1'); // expected-error {{oh no}} + char y = x; // expected-error {{oh no}} + int z = x; // cxx20-error {{no viable conversion}} +} + +} diff --git a/clang/test/SemaCXX/lambda-expressions.cpp b/clang/test/SemaCXX/lambda-expressions.cpp index 8ea8e32..f9d7cfc 100644 --- a/clang/test/SemaCXX/lambda-expressions.cpp +++ b/clang/test/SemaCXX/lambda-expressions.cpp @@ -149,7 +149,8 @@ namespace PR12031 { void f(int i, X x); void g() { const int v = 10; - f(v, [](){}); + f(v, [](){}); // cxx03-warning {{template argument uses local type}} \ + // cxx03-note {{while substituting}} } } @@ -572,26 +573,37 @@ namespace PR27994 { struct A { template <class T> A(T); }; template <class T> -struct B { +struct B { // #PR27994_B int x; - A a = [&] { int y = x; }; - A b = [&] { [&] { [&] { int y = x; }; }; }; - A d = [&](auto param) { int y = x; }; // cxx03-cxx11-error {{'auto' not allowed in lambda parameter}} - A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; }; // cxx03-cxx11-error 2 {{'auto' not allowed in lambda parameter}} + A a = [&] { int y = x; }; // cxx03-warning {{template argument uses unnamed type}} \ + // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}} + A b = [&] { [&] { [&] { int y = x; }; }; }; // cxx03-warning {{template argument uses unnamed type}} \ + // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}} + A d = [&](auto param) { int y = x; }; // cxx03-cxx11-error {{'auto' not allowed in lambda parameter}} \ + // cxx03-warning {{template argument uses unnamed type}} \ + // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}} + A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; }; // cxx03-cxx11-error 2 {{'auto' not allowed in lambda parameter}} \ + // cxx03-warning {{template argument uses unnamed type}} \ + // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}} }; B<int> b; +// cxx03-note@#PR27994_B 4{{in instantiation of default member initializer}} +// cxx03-note@-2 4{{in evaluation of exception}} template <class T> struct C { struct D { + // cxx03-note@-1 {{in instantiation of default member initializer}} int x; - A f = [&] { int y = x; }; + A f = [&] { int y = x; }; // cxx03-warning {{template argument uses unnamed type}} \ + // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}} }; }; int func() { C<int> a; decltype(a)::D b; + // cxx03-note@-1 {{in evaluation of exception}} } } @@ -606,8 +618,12 @@ struct S1 { void foo1() { auto s0 = S1([name=]() {}); // expected-error {{expected expression}} + // cxx03-warning@-1 {{template argument uses local type}} \ + // cxx03-note@-1 {{while substituting deduced template arguments}} auto s1 = S1([name=name]() {}); // expected-error {{use of undeclared identifier 'name'; did you mean 'name1'?}} // cxx03-cxx11-warning@-1 {{initialized lambda 
captures are a C++14 extension}} + // cxx03-warning@-2 {{template argument uses local type}} \ + // cxx03-note@-2 {{while substituting deduced template arguments}} } } diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index 6d29f8b..e5e081f 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -1333,4 +1333,75 @@ static_assert(__cpp17_iterator<not_move_constructible>); \ // expected-note@#is_move_constructible_v {{because 'is_move_constructible_v<parameter_mapping_regressions::case3::not_move_constructible>' evaluated to false}} } +namespace case4 { + +template<bool b> +concept bool_ = b; + +template<typename... Ts> +concept unary = bool_<sizeof...(Ts) == 1>; + +static_assert(!unary<>); +static_assert(unary<void>); + +} + +namespace case5 { + +template<int size> +concept true1 = size == size; + +template<typename... Ts> +concept true2 = true1<sizeof...(Ts)>; + +template<typename... Ts> +concept true3 = true2<Ts...>; + +static_assert(true3<void>); + +} + +namespace case6 { + +namespace std { +template <int __v> +struct integral_constant { + static const int value = __v; +}; + +template <class _Tp, class... _Args> +constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...); + +template <class _From, class _To> +constexpr bool is_convertible_v = __is_convertible(_From, _To); + +template <class> +struct tuple_size; + +template <class _Tp> +constexpr decltype(sizeof(int)) tuple_size_v = tuple_size<_Tp>::value; +} // namespace std + +template <int N, int X> +concept FixedExtentConstructibleFromExtent = X == N; + +template <int Extent> +struct span { + int static constexpr extent = Extent; + template <typename R, int N = std::tuple_size_v<R>> + requires(FixedExtentConstructibleFromExtent<extent, N>) + span(R); +}; + +template <class, int> +struct array {}; + +template <class _Tp, decltype(sizeof(int)) _Size> +struct std::tuple_size<array<_Tp, _Size>> : integral_constant<_Size> {}; + +static_assert(std::is_convertible_v<array<int, 3>, span<3>>); +static_assert(!std::is_constructible_v<span<4>, array<int, 3>>); + +} + } diff --git a/libc/shared/math.h b/libc/shared/math.h index 4b2a0d8..924d0cb 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -47,6 +47,7 @@ #include "math/exp10f16.h" #include "math/exp10m1f.h" #include "math/exp10m1f16.h" +#include "math/exp2.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp2.h b/libc/shared/math/exp2.h new file mode 100644 index 0000000..6f1e143 --- /dev/null +++ b/libc/shared/math/exp2.h @@ -0,0 +1,23 @@ +//===-- Shared exp2 function ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP2_H +#define LLVM_LIBC_SHARED_MATH_EXP2_H + +#include "shared/libc_common.h" +#include "src/__support/math/exp2.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp2; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_EXP2_H diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 98f9bb42..4130fdf 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -374,6 +374,15 @@ add_header_library( ) add_header_library( + common_constants + HDRS + common_constants.h + DEPENDS + libc.src.__support.macros.config + libc.src.__support.number_pair +) + +add_header_library( cos HDRS cos.h @@ -705,6 +714,28 @@ add_header_library( ) add_header_library( + exp2 + HDRS + exp2.h + DEPENDS + .common_constants + .exp_utils + libc.src.__support.CPP.bit + libc.src.__support.CPP.optional + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.FPUtil.triple_double + libc.src.__support.integer_literals + libc.src.__support.macros.optimization + libc.src.errno.errno +) + +add_header_library( exp10 HDRS exp10.h diff --git a/libc/src/math/generic/common_constants.cpp b/libc/src/__support/math/common_constants.h index 2a15df2..53abbfe 100644 --- a/libc/src/math/generic/common_constants.cpp +++ b/libc/src/__support/math/common_constants.h @@ -6,12 +6,29 @@ // //===----------------------------------------------------------------------===// -#include "common_constants.h" +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_COMMON_CONSTANTS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_COMMON_CONSTANTS_H + #include "src/__support/macros/config.h" #include "src/__support/number_pair.h" namespace LIBC_NAMESPACE_DECL { +namespace common_constants_internal { + +// log(2) generated by Sollya with: +// > a = 2^-43 * nearestint(2^43*log(2)); +// LSB = 2^-43 is chosen so that e_x * LOG_2_HI is exact for -1075 < e_x < 1024. +static constexpr double LOG_2_HI = 0x1.62e42fefa38p-1; // LSB = 2^-43 +// > b = round(log10(2) - a, D, RN); +static constexpr double LOG_2_LO = 0x1.ef35793c7673p-45; // LSB = 2^-97 + +// Minimax polynomial for (log(1 + x) - x)/x^2, generated by sollya with: +// > P = fpminimax((log(1 + x) - x)/x^2, 5, [|D...|], [-2^-8, 2^-7]); +constexpr double LOG_COEFFS[6] = {-0x1.fffffffffffffp-2, 0x1.5555555554a9bp-2, + -0x1.0000000094567p-2, 0x1.99999dcc9823cp-3, + -0x1.55550ac2e537ap-3, 0x1.21a02c4e624d7p-3}; + // Range reduction constants for logarithms. // r(0) = 1, r(127) = 0.5 // r(k) = 2^-8 * ceil(2^8 * (1 - 2^-8) / (1 + k*2^-7)) @@ -19,7 +36,7 @@ namespace LIBC_NAMESPACE_DECL { // precision, and -2^-8 <= v < 2^-7. // TODO(lntue): Add reference to how the constants are derived after the // resulting paper is ready. 
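A minimal standalone sketch that cross-checks the closed form above, assuming only the formula and the special case r(0) = 1 stated in the comment (it is not part of this commit): it rebuilds each r(k) and verifies the claimed bound -2^-8 <= v < 2^-7 for v = r*(1 + k*2^-7) - 1.

#include <cmath>
#include <cstdio>

int main() {
  for (int k = 0; k < 128; ++k) {
    // r(k) = 2^-8 * ceil(2^8 * (1 - 2^-8) / (1 + k*2^-7)), with r(0) = 1
    // special-cased as in the comment.
    double r = (k == 0) ? 1.0
                        : 0x1p-8 * std::ceil(0x1p8 * (1.0 - 0x1p-8) /
                                             (1.0 + k * 0x1p-7));
    // r and (1 + k*2^-7) each need far fewer than 26 significant bits, so the
    // product and the subtraction below are exact in double precision.
    double v = r * (1.0 + k * 0x1p-7) - 1.0;
    if (v < -0x1p-8 || v >= 0x1p-7)
      std::printf("bound violated at k = %d (v = %a)\n", k, v);
  }
  return 0;
}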
-alignas(8) const float R[128] = { +alignas(8) static constexpr float R[128] = { 0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.e8p-1, 0x1.e4p-1, 0x1.ep-1, 0x1.dep-1, 0x1.dap-1, 0x1.d6p-1, 0x1.d4p-1, 0x1.dp-1, 0x1.ccp-1, 0x1.cap-1, 0x1.c6p-1, 0x1.c4p-1, 0x1.cp-1, 0x1.bep-1, 0x1.bap-1, @@ -40,7 +57,7 @@ alignas(8) const float R[128] = { 0x1.0ap-1, 0x1.08p-1, 0x1.08p-1, 0x1.06p-1, 0x1.06p-1, 0x1.04p-1, 0x1.04p-1, 0x1.02p-1, 0x1.0p-1}; -const double RD[128] = { +static constexpr double RD[128] = { 0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.e8p-1, 0x1.e4p-1, 0x1.ep-1, 0x1.dep-1, 0x1.dap-1, 0x1.d6p-1, 0x1.d4p-1, 0x1.dp-1, 0x1.ccp-1, 0x1.cap-1, 0x1.c6p-1, 0x1.c4p-1, 0x1.cp-1, 0x1.bep-1, 0x1.bap-1, @@ -65,7 +82,7 @@ const double RD[128] = { // available. // Generated by Sollya with the formula: CD[i] = RD[i]*(1 + i*2^-7) - 1 // for RD[i] defined on the table above. -const double CD[128] = { +static constexpr double CD[128] = { 0.0, -0x1p-14, -0x1p-12, -0x1.2p-11, -0x1p-10, -0x1.9p-10, -0x1.2p-9, -0x1.88p-9, -0x1p-8, -0x1.9p-11, -0x1.fp-10, -0x1.9cp-9, -0x1p-12, -0x1.cp-10, -0x1.bp-9, -0x1.5p-11, -0x1.4p-9, 0x1p-14, @@ -90,7 +107,7 @@ const double CD[128] = { -0x1p-14, -0x1p-8, }; -const double LOG_R[128] = { +static constexpr double LOG_R[128] = { 0x0.0000000000000p0, 0x1.010157588de71p-7, 0x1.0205658935847p-6, 0x1.8492528c8cabfp-6, 0x1.0415d89e74444p-5, 0x1.466aed42de3eap-5, 0x1.894aa149fb343p-5, 0x1.ccb73cdddb2ccp-5, 0x1.08598b59e3a07p-4, @@ -135,7 +152,7 @@ const double LOG_R[128] = { 0x1.5707a26bb8c66p-1, 0x1.5af405c3649ep-1, 0x1.5af405c3649ep-1, 0x1.5ee82aa24192p-1, 0x0.000000000000p0}; -const double LOG2_R[128] = { +static constexpr double LOG2_R[128] = { 0x0.0000000000000p+0, 0x1.72c7ba20f7327p-7, 0x1.743ee861f3556p-6, 0x1.184b8e4c56af8p-5, 0x1.77394c9d958d5p-5, 0x1.d6ebd1f1febfep-5, 0x1.1bb32a600549dp-4, 0x1.4c560fe68af88p-4, 0x1.7d60496cfbb4cp-4, @@ -188,7 +205,7 @@ const double LOG2_R[128] = { // print("{", -c, ",", -b, "},"); // }; // We replace LOG_R[0] with log10(1.0) == 0.0 -alignas(16) const NumberPair<double> LOG_R_DD[128] = { +alignas(16) static constexpr NumberPair<double> LOG_R_DD[128] = { {0.0, 0.0}, {-0x1.0c76b999d2be8p-46, 0x1.010157589p-7}, {-0x1.3dc5b06e2f7d2p-45, 0x1.0205658938p-6}, @@ -324,7 +341,7 @@ alignas(16) const NumberPair<double> LOG_R_DD[128] = { // Output range: // [-0x1.3ffcp-15, 0x1.3e3dp-15] // We store S2[i] = 2^16 (r(i - 2^6) - 1). -alignas(8) const int S2[193] = { +alignas(8) static constexpr int S2[193] = { 0x101, 0xfd, 0xf9, 0xf5, 0xf1, 0xed, 0xe9, 0xe5, 0xe1, 0xdd, 0xd9, 0xd5, 0xd1, 0xcd, 0xc9, 0xc5, 0xc1, 0xbd, 0xb9, 0xb4, 0xb0, 0xac, 0xa8, 0xa4, 0xa0, 0x9c, 0x98, @@ -348,7 +365,7 @@ alignas(8) const int S2[193] = { -0x1cd, -0x1d1, -0x1d5, -0x1d9, -0x1dd, -0x1e0, -0x1e4, -0x1e8, -0x1ec, -0x1f0, -0x1f4, -0x1f8, -0x1fc}; -const double R2[193] = { +static constexpr double R2[193] = { 0x1.0101p0, 0x1.00fdp0, 0x1.00f9p0, 0x1.00f5p0, 0x1.00f1p0, 0x1.00edp0, 0x1.00e9p0, 0x1.00e5p0, 0x1.00e1p0, 0x1.00ddp0, 0x1.00d9p0, 0x1.00d5p0, 0x1.00d1p0, 0x1.00cdp0, 0x1.00c9p0, @@ -395,7 +412,7 @@ const double R2[193] = { // Output range: // [-0x1.01928p-22 , 0x1p-22] // We store S[i] = 2^21 (r(i - 80) - 1). 
-alignas(8) const int S3[161] = { +alignas(8) static constexpr int S3[161] = { 0x50, 0x4f, 0x4e, 0x4d, 0x4c, 0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45, 0x44, 0x43, 0x42, 0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30, @@ -418,7 +435,7 @@ alignas(8) const int S3[161] = { // Output range: // [-0x1.0002143p-29 , 0x1p-29] // We store S[i] = 2^28 (r(i - 65) - 1). -alignas(8) const int S4[130] = { +alignas(8) static constexpr int S4[130] = { 0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, @@ -439,7 +456,7 @@ alignas(8) const int S4[130] = { // Table is generated with Sollya as follow: // > display = hexadecimal; // > for i from -104 to 89 do { D(exp(i)); }; -const double EXP_M1[195] = { +static constexpr double EXP_M1[195] = { 0x1.f1e6b68529e33p-151, 0x1.525be4e4e601dp-149, 0x1.cbe0a45f75eb1p-148, 0x1.3884e838aea68p-146, 0x1.a8c1f14e2af5dp-145, 0x1.20a717e64a9bdp-143, 0x1.8851d84118908p-142, 0x1.0a9bdfb02d240p-140, 0x1.6a5bea046b42ep-139, @@ -511,7 +528,7 @@ const double EXP_M1[195] = { // Table is generated with Sollya as follow: // > display = hexadecimal; // > for i from 0 to 127 do { D(exp(i / 128)); }; -const double EXP_M2[128] = { +static constexpr double EXP_M2[128] = { 0x1.0000000000000p0, 0x1.0202015600446p0, 0x1.04080ab55de39p0, 0x1.06122436410ddp0, 0x1.08205601127edp0, 0x1.0a32a84e9c1f6p0, 0x1.0c49236829e8cp0, 0x1.0e63cfa7ab09dp0, 0x1.1082b577d34edp0, @@ -557,4 +574,8 @@ const double EXP_M2[128] = { 0x1.568bb722dd593p1, 0x1.593b7d72305bbp1, }; +} // namespace common_constants_internal + } // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_COMMON_CONSTANTS_H diff --git a/libc/src/__support/math/exp2.h b/libc/src/__support/math/exp2.h new file mode 100644 index 0000000..7eaa465 --- /dev/null +++ b/libc/src/__support/math/exp2.h @@ -0,0 +1,425 @@ +//===-- Implementation header for exp2 --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP2_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP2_H + +#include "common_constants.h" // Lookup tables EXP2_MID1 and EXP_M2. +#include "exp_constants.h" +#include "exp_utils.h" // ziv_test_denorm. 
+#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/optional.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/FPUtil/triple_double.h" +#include "src/__support/common.h" +#include "src/__support/integer_literals.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +namespace exp2_internal { + +using namespace common_constants_internal; + +using fputil::DoubleDouble; +using fputil::TripleDouble; +using Float128 = typename fputil::DyadicFloat<128>; + +using LIBC_NAMESPACE::operator""_u128; + +// Error bounds: +// Errors when using double precision. +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE +constexpr double ERR_D = 0x1.0p-63; +#else +constexpr double ERR_D = 0x1.8p-63; +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +// Errors when using double-double precision. +constexpr double ERR_DD = 0x1.0p-100; +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// Polynomial approximations with double precision. Generated by Sollya with: +// > P = fpminimax((2^x - 1)/x, 3, [|D...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]); +// > P; +// Error bounds: +// | output - (2^dx - 1) / dx | < 1.5 * 2^-52. +LIBC_INLINE static double poly_approx_d(double dx) { + // dx^2 + double dx2 = dx * dx; + double c0 = + fputil::multiply_add(dx, 0x1.ebfbdff82c58ep-3, 0x1.62e42fefa39efp-1); + double c1 = + fputil::multiply_add(dx, 0x1.3b2aba7a95a89p-7, 0x1.c6b08e8fc0c0ep-5); + double p = fputil::multiply_add(dx2, c1, c0); + return p; +} + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +// Polynomial approximation with double-double precision. Generated by Solya +// with: +// > P = fpminimax((2^x - 1)/x, 5, [|DD...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]); +// Error bounds: +// | output - 2^(dx) | < 2^-101 +LIBC_INLINE static constexpr DoubleDouble +poly_approx_dd(const DoubleDouble &dx) { + // Taylor polynomial. + constexpr DoubleDouble COEFFS[] = { + {0, 0x1p0}, + {0x1.abc9e3b39824p-56, 0x1.62e42fefa39efp-1}, + {-0x1.5e43a53e4527bp-57, 0x1.ebfbdff82c58fp-3}, + {-0x1.d37963a9444eep-59, 0x1.c6b08d704a0cp-5}, + {0x1.4eda1a81133dap-62, 0x1.3b2ab6fba4e77p-7}, + {-0x1.c53fd1ba85d14p-64, 0x1.5d87fe7a265a5p-10}, + {0x1.d89250b013eb8p-70, 0x1.430912f86cb8ep-13}, + }; + + DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2], + COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]); + return p; +} + +// Polynomial approximation with 128-bit precision: +// Return exp(dx) ~ 1 + a0 * dx + a1 * dx^2 + ... + a6 * dx^7 +// For |dx| < 2^-13 + 2^-30: +// | output - exp(dx) | < 2^-126. 
+LIBC_INLINE static constexpr Float128 poly_approx_f128(const Float128 &dx) { + constexpr Float128 COEFFS_128[]{ + {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 + {Sign::POS, -128, 0xb17217f7'd1cf79ab'c9e3b398'03f2f6af_u128}, + {Sign::POS, -128, 0x3d7f7bff'058b1d50'de2d60dd'9c9a1d9f_u128}, + {Sign::POS, -132, 0xe35846b8'2505fc59'9d3b15d9'e7fb6897_u128}, + {Sign::POS, -134, 0x9d955b7d'd273b94e'184462f6'bcd2b9e7_u128}, + {Sign::POS, -137, 0xaec3ff3c'53398883'39ea1bb9'64c51a89_u128}, + {Sign::POS, -138, 0x2861225f'345c396a'842c5341'8fa8ae61_u128}, + {Sign::POS, -144, 0xffe5fe2d'109a319d'7abeb5ab'd5ad2079_u128}, + }; + + Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2], + COEFFS_128[3], COEFFS_128[4], COEFFS_128[5], + COEFFS_128[6], COEFFS_128[7]); + return p; +} + +// Compute 2^(x) using 128-bit precision. +// TODO(lntue): investigate triple-double precision implementation for this +// step. +LIBC_INLINE static constexpr Float128 exp2_f128(double x, int hi, int idx1, + int idx2) { + Float128 dx = Float128(x); + + // TODO: Skip recalculating exp_mid1 and exp_mid2. + Float128 exp_mid1 = + fputil::quick_add(Float128(EXP2_MID1[idx1].hi), + fputil::quick_add(Float128(EXP2_MID1[idx1].mid), + Float128(EXP2_MID1[idx1].lo))); + + Float128 exp_mid2 = + fputil::quick_add(Float128(EXP2_MID2[idx2].hi), + fputil::quick_add(Float128(EXP2_MID2[idx2].mid), + Float128(EXP2_MID2[idx2].lo))); + + Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2); + + Float128 p = poly_approx_f128(dx); + + Float128 r = fputil::quick_mul(exp_mid, p); + + r.exponent += hi; + + return r; +} + +// Compute 2^x with double-double precision. +LIBC_INLINE static DoubleDouble +exp2_double_double(double x, const DoubleDouble &exp_mid) { + DoubleDouble dx({0, x}); + + // Degree-6 polynomial approximation in double-double precision. + // | p - 2^x | < 2^-103. + DoubleDouble p = poly_approx_dd(dx); + + // Error bounds: 2^-102. + DoubleDouble r = fputil::quick_mult(exp_mid, p); + + return r; +} +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// When output is denormal. +LIBC_INLINE static double exp2_denorm(double x) { + // Range reduction. + int k = + static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19); + double kd = static_cast<double>(k); + + uint32_t idx1 = (k >> 6) & 0x3f; + uint32_t idx2 = k & 0x3f; + + int hi = k >> 12; + + DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; + DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; + DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); + + // |dx| < 2^-13 + 2^-30. + double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact + + double mid_lo = dx * exp_mid.hi; + + // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. 
+ double p = poly_approx_d(dx); + + double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + return ziv_test_denorm</*SKIP_ZIV_TEST=*/true>(hi, exp_mid.hi, lo, ERR_D) + .value(); +#else + if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); + LIBC_LIKELY(r.has_value())) + return r.value(); + + // Use double-double + DoubleDouble r_dd = exp2_double_double(dx, exp_mid); + + if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); + LIBC_LIKELY(r.has_value())) + return r.value(); + + // Use 128-bit precision + Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2); + + return static_cast<double>(r_f128); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS +} + +// Check for exceptional cases when: +// * log2(1 - 2^-54) < x < log2(1 + 2^-53) +// * x >= 1024 +// * x <= -1022 +// * x is inf or nan +LIBC_INLINE static constexpr double set_exceptional(double x) { + using FPBits = typename fputil::FPBits<double>; + FPBits xbits(x); + + uint64_t x_u = xbits.uintval(); + uint64_t x_abs = xbits.abs().uintval(); + + // |x| < log2(1 + 2^-53) + if (x_abs <= 0x3ca71547652b82fd) { + // 2^(x) ~ 1 + x/2 + return fputil::multiply_add(x, 0.5, 1.0); + } + + // x <= -1022 || x >= 1024 or inf/nan. + if (x_u > 0xc08ff00000000000) { + // x <= -1075 or -inf/nan + if (x_u >= 0xc090cc0000000000) { + // exp(-Inf) = 0 + if (xbits.is_inf()) + return 0.0; + + // exp(nan) = nan + if (xbits.is_nan()) + return x; + + if (fputil::quick_get_round() == FE_UPWARD) + return FPBits::min_subnormal().get_val(); + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_UNDERFLOW); + return 0.0; + } + + return exp2_denorm(x); + } + + // x >= 1024 or +inf/nan + // x is finite + if (x_u < 0x7ff0'0000'0000'0000ULL) { + int rounding = fputil::quick_get_round(); + if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) + return FPBits::max_normal().get_val(); + + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW); + } + // x is +inf or nan + return x + FPBits::inf().get_val(); +} + +} // namespace exp2_internal + +LIBC_INLINE static constexpr double exp2(double x) { + using namespace exp2_internal; + using FPBits = typename fputil::FPBits<double>; + FPBits xbits(x); + + uint64_t x_u = xbits.uintval(); + + // x < -1022 or x >= 1024 or log2(1 - 2^-54) < x < log2(1 + 2^-53). + if (LIBC_UNLIKELY(x_u > 0xc08ff00000000000 || + (x_u <= 0xbc971547652b82fe && x_u >= 0x4090000000000000) || + x_u <= 0x3ca71547652b82fd)) { + return set_exceptional(x); + } + + // Now -1075 < x <= log2(1 - 2^-54) or log2(1 + 2^-53) < x < 1024 + + // Range reduction: + // Let x = (hi + mid1 + mid2) + lo + // in which: + // hi is an integer + // mid1 * 2^6 is an integer + // mid2 * 2^12 is an integer + // then: + // 2^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 2^(lo). + // With this formula: + // - multiplying by 2^hi is exact and cheap, simply by adding the exponent + // field. + // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables. + // - 2^(lo) ~ 1 + a0*lo + a1 * lo^2 + ... + // + // We compute (hi + mid1 + mid2) together by perform the rounding on x * 2^12. + // Since |x| < |-1075)| < 2^11, + // |x * 2^12| < 2^11 * 2^12 < 2^23, + // So we can fit the rounded result round(x * 2^12) in int32_t. + // Thus, the goal is to be able to use an additional addition and fixed width + // shift to get an int32_t representing round(x * 2^12). 
+ // + // Assuming int32_t using 2-complement representation, since the mantissa part + // of a double precision is unsigned with the leading bit hidden, if we add an + // extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 2^25 to the product, the + // part that are < 2^e2 in resulted mantissa of (x*2^12*L2E + C) can be + // considered as a proper 2-complement representations of x*2^12. + // + // One small problem with this approach is that the sum (x*2^12 + C) in + // double precision is rounded to the least significant bit of the dorminant + // factor C. In order to minimize the rounding errors from this addition, we + // want to minimize e1. Another constraint that we want is that after + // shifting the mantissa so that the least significant bit of int32_t + // corresponds to the unit bit of (x*2^12*L2E), the sign is correct without + // any adjustment. So combining these 2 requirements, we can choose + // C = 2^33 + 2^32, so that the sign bit corresponds to 2^31 bit, and hence + // after right shifting the mantissa, the resulting int32_t has correct sign. + // With this choice of C, the number of mantissa bits we need to shift to the + // right is: 52 - 33 = 19. + // + // Moreover, since the integer right shifts are equivalent to rounding down, + // we can add an extra 0.5 so that it will become round-to-nearest, tie-to- + // +infinity. So in particular, we can compute: + // hmm = x * 2^12 + C, + // where C = 2^33 + 2^32 + 2^-1, then if + // k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19), + // the reduced argument: + // lo = x - 2^-12 * k is bounded by: + // |lo| <= 2^-13 + 2^-12*2^-19 + // = 2^-13 + 2^-31. + // + // Finally, notice that k only uses the mantissa of x * 2^12, so the + // exponent 2^12 is not needed. So we can simply define + // C = 2^(33 - 12) + 2^(32 - 12) + 2^(-13 - 12), and + // k = int32_t(lower 51 bits of double(x + C) >> 19). + + // Rounding errors <= 2^-31. + int k = + static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19); + double kd = static_cast<double>(k); + + uint32_t idx1 = (k >> 6) & 0x3f; + uint32_t idx2 = k & 0x3f; + + int hi = k >> 12; + + DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; + DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; + DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); + + // |dx| < 2^-13 + 2^-30. + double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact + + // We use the degree-4 polynomial to approximate 2^(lo): + // 2^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4 = 1 + lo * P(lo) + // So that the errors are bounded by: + // |P(lo) - (2^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58 + // Let P_ be an evaluation of P where all intermediate computations are in + // double precision. Using either Horner's or Estrin's schemes, the evaluated + // errors can be bounded by: + // |P_(lo) - P(lo)| < 2^-51 + // => |lo * P_(lo) - (2^lo - 1) | < 2^-64 + // => 2^(mid1 + mid2) * |lo * P_(lo) - expm1(lo)| < 2^-63. + // Since we approximate + // 2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo, + // We use the expression: + // (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~ + // ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo) + // with errors bounded by 2^-63. + + double mid_lo = dx * exp_mid.hi; + + // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. 
+ double p = poly_approx_d(dx); + + double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + // To multiply by 2^hi, a fast way is to simply add hi to the exponent + // field. + int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; + double r = + cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(exp_mid.hi + lo)); + return r; +#else + double upper = exp_mid.hi + (lo + ERR_D); + double lower = exp_mid.hi + (lo - ERR_D); + + if (LIBC_LIKELY(upper == lower)) { + // To multiply by 2^hi, a fast way is to simply add hi to the exponent + // field. + int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; + double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper)); + return r; + } + + // Use double-double + DoubleDouble r_dd = exp2_double_double(dx, exp_mid); + + double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); + double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); + + if (LIBC_LIKELY(upper_dd == lower_dd)) { + // To multiply by 2^hi, a fast way is to simply add hi to the exponent + // field. + int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; + double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper_dd)); + return r; + } + + // Use 128-bit precision + Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2); + + return static_cast<double>(r_f128); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP2_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 99c1b08..28ea475 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1448,21 +1448,7 @@ add_entrypoint_object( HDRS ../exp2.h DEPENDS - .common_constants - libc.src.__support.CPP.bit - libc.src.__support.CPP.optional - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.FPUtil.triple_double - libc.src.__support.integer_literals - libc.src.__support.macros.optimization - libc.src.__support.math.exp_utils - libc.src.errno.errno + libc.src.__support.math.exp2 ) add_header_library( @@ -1613,7 +1599,6 @@ add_entrypoint_object( HDRS ../expm1.h DEPENDS - .common_constants libc.src.__support.CPP.bit libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl @@ -1624,6 +1609,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.triple_double libc.src.__support.integer_literals libc.src.__support.macros.optimization + libc.src.__support.math.common_constants libc.src.errno.errno ) @@ -1634,7 +1620,6 @@ add_entrypoint_object( HDRS ../expm1f.h DEPENDS - .common_constants libc.src.__support.FPUtil.basic_operations libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits @@ -1643,6 +1628,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization + libc.src.__support.math.common_constants libc.src.errno.errno ) @@ -1673,7 +1659,6 @@ add_entrypoint_object( HDRS ../powf.h DEPENDS - .common_constants .exp2f_impl libc.src.__support.math.exp10f libc.src.__support.CPP.bit @@ -1685,6 +1670,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.sqrt libc.src.__support.FPUtil.triple_double libc.src.__support.macros.optimization + 
libc.src.__support.math.common_constants libc.src.errno.errno ) @@ -1695,7 +1681,6 @@ add_entrypoint_object( HDRS ../pow.h DEPENDS - .common_constants libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.__support.CPP.bit @@ -1707,6 +1692,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.sqrt libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2043,26 +2029,14 @@ add_entrypoint_object( libc.src.__support.macros.properties.types ) -add_object_library( - common_constants - HDRS - common_constants.h - SRCS - common_constants.cpp - DEPENDS - libc.src.__support.math.exp_constants - libc.src.__support.math.acosh_float_constants - libc.src.__support.number_pair -) - add_header_library( log_range_reduction HDRS log_range_reduction.h DEPENDS - .common_constants - libc.src.__support.uint128 libc.src.__support.FPUtil.dyadic_float + libc.src.__support.math.common_constants + libc.src.__support.uint128 ) add_entrypoint_object( @@ -2072,7 +2046,6 @@ add_entrypoint_object( HDRS ../log10.h DEPENDS - .common_constants .log_range_reduction libc.src.__support.FPUtil.double_double libc.src.__support.FPUtil.dyadic_float @@ -2082,6 +2055,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.integer_literals libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2091,12 +2065,12 @@ add_entrypoint_object( HDRS ../log10f.h DEPENDS - .common_constants libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fma libc.src.__support.FPUtil.polyeval + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2126,7 +2100,6 @@ add_entrypoint_object( HDRS ../log1p.h DEPENDS - .common_constants libc.src.__support.FPUtil.double_double libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl @@ -2135,6 +2108,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.integer_literals libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2144,13 +2118,13 @@ add_entrypoint_object( HDRS ../log1pf.h DEPENDS - .common_constants libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fma libc.src.__support.FPUtil.polyeval libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2160,7 +2134,6 @@ add_entrypoint_object( HDRS ../log2.h DEPENDS - .common_constants .log_range_reduction libc.src.__support.FPUtil.double_double libc.src.__support.FPUtil.dyadic_float @@ -2170,6 +2143,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.integer_literals libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2179,13 +2153,13 @@ add_entrypoint_object( HDRS ../log2f.h DEPENDS - .common_constants libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fma libc.src.__support.FPUtil.polyeval libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2215,7 +2189,6 @@ add_entrypoint_object( HDRS ../log.h DEPENDS - .common_constants .log_range_reduction libc.src.__support.FPUtil.double_double 
libc.src.__support.FPUtil.dyadic_float @@ -2225,6 +2198,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.integer_literals libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2234,7 +2208,6 @@ add_entrypoint_object( HDRS ../logf.h DEPENDS - .common_constants libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits @@ -2242,6 +2215,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.macros.optimization libc.src.__support.macros.properties.cpu_features + libc.src.__support.math.common_constants ) add_entrypoint_object( diff --git a/libc/src/math/generic/common_constants.h b/libc/src/math/generic/common_constants.h deleted file mode 100644 index 9ee31f0..0000000 --- a/libc/src/math/generic/common_constants.h +++ /dev/null @@ -1,73 +0,0 @@ -//===-- Common constants for math functions ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H -#define LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H - -#include "src/__support/FPUtil/triple_double.h" -#include "src/__support/macros/config.h" -#include "src/__support/math/acosh_float_constants.h" -#include "src/__support/math/exp_constants.h" -#include "src/__support/number_pair.h" - -namespace LIBC_NAMESPACE_DECL { - -// Lookup table for range reduction constants r for logarithms. -extern const float R[128]; - -// Lookup table for range reduction constants r for logarithms. -extern const double RD[128]; - -// Lookup table for compensated constants for exact range reduction when FMA -// instructions are not available. -extern const double CD[128]; - -// Lookup table for -log(r) -extern const double LOG_R[128]; -extern const NumberPair<double> LOG_R_DD[128]; - -// Lookup table for -log2(r) -extern const double LOG2_R[128]; - -// Minimax polynomial for (log(1 + x) - x)/x^2, generated by sollya with: -// > P = fpminimax((log(1 + x) - x)/x^2, 5, [|D...|], [-2^-8, 2^-7]); -constexpr double LOG_COEFFS[6] = {-0x1.fffffffffffffp-2, 0x1.5555555554a9bp-2, - -0x1.0000000094567p-2, 0x1.99999dcc9823cp-3, - -0x1.55550ac2e537ap-3, 0x1.21a02c4e624d7p-3}; - -// Logarithm Range Reduction - Step 2, 3, and 4. -extern const int S2[193]; -extern const int S3[161]; -extern const int S4[130]; - -extern const double R2[193]; - -// log(2) generated by Sollya with: -// > a = 2^-43 * nearestint(2^43*log(2)); -// LSB = 2^-43 is chosen so that e_x * LOG_2_HI is exact for -1075 < e_x < 1024. -constexpr double LOG_2_HI = 0x1.62e42fefa38p-1; // LSB = 2^-43 -// > b = round(log10(2) - a, D, RN); -constexpr double LOG_2_LO = 0x1.ef35793c7673p-45; // LSB = 2^-97 - -// Lookup table for exp(m) with m = -104, ..., 89. -// -104 = floor(log(single precision's min denormal)) -// 89 = ceil(log(single precision's max normal)) -// Table is generated with Sollya as follow: -// > display = hexadecimal; -// > for i from -104 to 89 do { D(exp(i)); }; -extern const double EXP_M1[195]; - -// Lookup table for exp(m * 2^(-7)) with m = 0, ..., 127. 
-// Table is generated with Sollya as follow: -// > display = hexadecimal; -// > for i from 0 to 127 do { D(exp(i / 128)); }; -extern const double EXP_M2[128]; - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp index 154154f..20e1ff5 100644 --- a/libc/src/math/generic/exp2.cpp +++ b/libc/src/math/generic/exp2.cpp @@ -7,404 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/exp2.h" -#include "common_constants.h" // Lookup tables EXP2_MID1 and EXP_M2. -#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/optional.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/dyadic_float.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/FPUtil/triple_double.h" -#include "src/__support/common.h" -#include "src/__support/integer_literals.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "src/__support/math/exp_utils.h" // ziv_test_denorm. +#include "src/__support/math/exp2.h" namespace LIBC_NAMESPACE_DECL { -using fputil::DoubleDouble; -using fputil::TripleDouble; -using Float128 = typename fputil::DyadicFloat<128>; - -using LIBC_NAMESPACE::operator""_u128; - -// Error bounds: -// Errors when using double precision. -#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE -constexpr double ERR_D = 0x1.0p-63; -#else -constexpr double ERR_D = 0x1.8p-63; -#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -// Errors when using double-double precision. -constexpr double ERR_DD = 0x1.0p-100; -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -namespace { - -// Polynomial approximations with double precision. Generated by Sollya with: -// > P = fpminimax((2^x - 1)/x, 3, [|D...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]); -// > P; -// Error bounds: -// | output - (2^dx - 1) / dx | < 1.5 * 2^-52. -LIBC_INLINE double poly_approx_d(double dx) { - // dx^2 - double dx2 = dx * dx; - double c0 = - fputil::multiply_add(dx, 0x1.ebfbdff82c58ep-3, 0x1.62e42fefa39efp-1); - double c1 = - fputil::multiply_add(dx, 0x1.3b2aba7a95a89p-7, 0x1.c6b08e8fc0c0ep-5); - double p = fputil::multiply_add(dx2, c1, c0); - return p; -} - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -// Polynomial approximation with double-double precision. Generated by Solya -// with: -// > P = fpminimax((2^x - 1)/x, 5, [|DD...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]); -// Error bounds: -// | output - 2^(dx) | < 2^-101 -DoubleDouble poly_approx_dd(const DoubleDouble &dx) { - // Taylor polynomial. - constexpr DoubleDouble COEFFS[] = { - {0, 0x1p0}, - {0x1.abc9e3b39824p-56, 0x1.62e42fefa39efp-1}, - {-0x1.5e43a53e4527bp-57, 0x1.ebfbdff82c58fp-3}, - {-0x1.d37963a9444eep-59, 0x1.c6b08d704a0cp-5}, - {0x1.4eda1a81133dap-62, 0x1.3b2ab6fba4e77p-7}, - {-0x1.c53fd1ba85d14p-64, 0x1.5d87fe7a265a5p-10}, - {0x1.d89250b013eb8p-70, 0x1.430912f86cb8ep-13}, - }; - - DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2], - COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]); - return p; -} - -// Polynomial approximation with 128-bit precision: -// Return exp(dx) ~ 1 + a0 * dx + a1 * dx^2 + ... 
+ a6 * dx^7 -// For |dx| < 2^-13 + 2^-30: -// | output - exp(dx) | < 2^-126. -Float128 poly_approx_f128(const Float128 &dx) { - constexpr Float128 COEFFS_128[]{ - {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 - {Sign::POS, -128, 0xb17217f7'd1cf79ab'c9e3b398'03f2f6af_u128}, - {Sign::POS, -128, 0x3d7f7bff'058b1d50'de2d60dd'9c9a1d9f_u128}, - {Sign::POS, -132, 0xe35846b8'2505fc59'9d3b15d9'e7fb6897_u128}, - {Sign::POS, -134, 0x9d955b7d'd273b94e'184462f6'bcd2b9e7_u128}, - {Sign::POS, -137, 0xaec3ff3c'53398883'39ea1bb9'64c51a89_u128}, - {Sign::POS, -138, 0x2861225f'345c396a'842c5341'8fa8ae61_u128}, - {Sign::POS, -144, 0xffe5fe2d'109a319d'7abeb5ab'd5ad2079_u128}, - }; - - Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2], - COEFFS_128[3], COEFFS_128[4], COEFFS_128[5], - COEFFS_128[6], COEFFS_128[7]); - return p; -} - -// Compute 2^(x) using 128-bit precision. -// TODO(lntue): investigate triple-double precision implementation for this -// step. -Float128 exp2_f128(double x, int hi, int idx1, int idx2) { - Float128 dx = Float128(x); - - // TODO: Skip recalculating exp_mid1 and exp_mid2. - Float128 exp_mid1 = - fputil::quick_add(Float128(EXP2_MID1[idx1].hi), - fputil::quick_add(Float128(EXP2_MID1[idx1].mid), - Float128(EXP2_MID1[idx1].lo))); - - Float128 exp_mid2 = - fputil::quick_add(Float128(EXP2_MID2[idx2].hi), - fputil::quick_add(Float128(EXP2_MID2[idx2].mid), - Float128(EXP2_MID2[idx2].lo))); - - Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2); - - Float128 p = poly_approx_f128(dx); - - Float128 r = fputil::quick_mul(exp_mid, p); - - r.exponent += hi; - - return r; -} - -// Compute 2^x with double-double precision. -DoubleDouble exp2_double_double(double x, const DoubleDouble &exp_mid) { - DoubleDouble dx({0, x}); - - // Degree-6 polynomial approximation in double-double precision. - // | p - 2^x | < 2^-103. - DoubleDouble p = poly_approx_dd(dx); - - // Error bounds: 2^-102. - DoubleDouble r = fputil::quick_mult(exp_mid, p); - - return r; -} -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -// When output is denormal. -double exp2_denorm(double x) { - // Range reduction. - int k = - static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19); - double kd = static_cast<double>(k); - - uint32_t idx1 = (k >> 6) & 0x3f; - uint32_t idx2 = k & 0x3f; - - int hi = k >> 12; - - DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; - DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; - DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); - - // |dx| < 2^-13 + 2^-30. - double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact - - double mid_lo = dx * exp_mid.hi; - - // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. 
- double p = poly_approx_d(dx); - - double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - return ziv_test_denorm</*SKIP_ZIV_TEST=*/true>(hi, exp_mid.hi, lo, ERR_D) - .value(); -#else - if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); - LIBC_LIKELY(r.has_value())) - return r.value(); - - // Use double-double - DoubleDouble r_dd = exp2_double_double(dx, exp_mid); - - if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); - LIBC_LIKELY(r.has_value())) - return r.value(); - - // Use 128-bit precision - Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2); - - return static_cast<double>(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} - -// Check for exceptional cases when: -// * log2(1 - 2^-54) < x < log2(1 + 2^-53) -// * x >= 1024 -// * x <= -1022 -// * x is inf or nan -double set_exceptional(double x) { - using FPBits = typename fputil::FPBits<double>; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - uint64_t x_abs = xbits.abs().uintval(); - - // |x| < log2(1 + 2^-53) - if (x_abs <= 0x3ca71547652b82fd) { - // 2^(x) ~ 1 + x/2 - return fputil::multiply_add(x, 0.5, 1.0); - } - - // x <= -1022 || x >= 1024 or inf/nan. - if (x_u > 0xc08ff00000000000) { - // x <= -1075 or -inf/nan - if (x_u >= 0xc090cc0000000000) { - // exp(-Inf) = 0 - if (xbits.is_inf()) - return 0.0; - - // exp(nan) = nan - if (xbits.is_nan()) - return x; - - if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_subnormal().get_val(); - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_UNDERFLOW); - return 0.0; - } - - return exp2_denorm(x); - } - - // x >= 1024 or +inf/nan - // x is finite - if (x_u < 0x7ff0'0000'0000'0000ULL) { - int rounding = fputil::quick_get_round(); - if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal().get_val(); - - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW); - } - // x is +inf or nan - return x + FPBits::inf().get_val(); -} - -} // namespace - -LLVM_LIBC_FUNCTION(double, exp2, (double x)) { - using FPBits = typename fputil::FPBits<double>; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - - // x < -1022 or x >= 1024 or log2(1 - 2^-54) < x < log2(1 + 2^-53). - if (LIBC_UNLIKELY(x_u > 0xc08ff00000000000 || - (x_u <= 0xbc971547652b82fe && x_u >= 0x4090000000000000) || - x_u <= 0x3ca71547652b82fd)) { - return set_exceptional(x); - } - - // Now -1075 < x <= log2(1 - 2^-54) or log2(1 + 2^-53) < x < 1024 - - // Range reduction: - // Let x = (hi + mid1 + mid2) + lo - // in which: - // hi is an integer - // mid1 * 2^6 is an integer - // mid2 * 2^12 is an integer - // then: - // 2^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 2^(lo). - // With this formula: - // - multiplying by 2^hi is exact and cheap, simply by adding the exponent - // field. - // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables. - // - 2^(lo) ~ 1 + a0*lo + a1 * lo^2 + ... - // - // We compute (hi + mid1 + mid2) together by perform the rounding on x * 2^12. - // Since |x| < |-1075)| < 2^11, - // |x * 2^12| < 2^11 * 2^12 < 2^23, - // So we can fit the rounded result round(x * 2^12) in int32_t. - // Thus, the goal is to be able to use an additional addition and fixed width - // shift to get an int32_t representing round(x * 2^12). 
-  //
-  // Assuming int32_t uses two's-complement representation, since the mantissa
-  // part of a double-precision value is unsigned with the leading bit hidden,
-  // if we add an extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 25 to the
-  // product, the part that is < 2^e2 in the resulting mantissa of (x*2^12 + C)
-  // can be considered a proper two's-complement representation of x*2^12.
-  //
-  // One small problem with this approach is that the sum (x*2^12 + C) in
-  // double precision is rounded to the least significant bit of the dominant
-  // factor C. In order to minimize the rounding errors from this addition, we
-  // want to minimize e1. Another constraint that we want is that after
-  // shifting the mantissa so that the least significant bit of int32_t
-  // corresponds to the unit bit of (x*2^12), the sign is correct without
-  // any adjustment. So combining these 2 requirements, we can choose
-  // C = 2^33 + 2^32, so that the sign bit corresponds to the 2^31 bit, and
-  // hence after right shifting the mantissa, the resulting int32_t has the
-  // correct sign.
-  // With this choice of C, the number of mantissa bits we need to shift to the
-  // right is: 52 - 33 = 19.
-  //
-  // Moreover, since the integer right shifts are equivalent to rounding down,
-  // we can add an extra 0.5 so that it will become round-to-nearest, tie-to-
-  // +infinity. So in particular, we can compute:
-  //   hmm = x * 2^12 + C,
-  // where C = 2^33 + 2^32 + 2^-1, then if
-  //   k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19),
-  // the reduced argument:
-  //   lo = x - 2^-12 * k is bounded by:
-  //   |lo| <= 2^-13 + 2^-12*2^-19
-  //        =  2^-13 + 2^-31.
-  //
-  // Finally, notice that k only uses the mantissa of x * 2^12, so the
-  // exponent 2^12 is not needed. So we can simply define
-  //   C = 2^(33 - 12) + 2^(32 - 12) + 2^(-1 - 12), and
-  //   k = int32_t(lower 51 bits of double(x + C) >> 19).
-
-  // Rounding errors <= 2^-31.
-  int k =
-      static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19);
-  double kd = static_cast<double>(k);
-
-  uint32_t idx1 = (k >> 6) & 0x3f;
-  uint32_t idx2 = k & 0x3f;
-
-  int hi = k >> 12;
-
-  DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi};
-  DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi};
-  DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2);
-
-  // |dx| < 2^-13 + 2^-30.
-  double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact
-
-  // We use the degree-4 polynomial to approximate 2^(lo):
-  //   2^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4 = 1 + lo * P(lo)
-  // so that the errors are bounded by:
-  //   |P(lo) - (2^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58
-  // Let P_ be an evaluation of P where all intermediate computations are in
-  // double precision. Using either Horner's or Estrin's scheme, the evaluation
-  // errors can be bounded by:
-  //   |P_(lo) - P(lo)| < 2^-51
-  //   => |lo * P_(lo) - (2^lo - 1)| < 2^-64
-  //   => 2^(mid1 + mid2) * |lo * P_(lo) - (2^lo - 1)| < 2^-63.
-  // Since we approximate
-  //   2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo,
-  // we use the expression:
-  //   (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~
-  //   ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo)
-  // with errors bounded by 2^-63.
-
-  double mid_lo = dx * exp_mid.hi;
-
-  // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4.
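The magic-constant rounding deleted above can be reproduced standalone. A minimal sketch, assuming C++20 (std::bit_cast stands in for cpp::bit_cast) and |x| < 1075 so that x + C stays in [2^21, 2^22); the helper name is hypothetical:

    #include <bit>
    #include <cstdint>

    // round(x * 2^12), with ties rounded toward +infinity.
    // C = 2^21 + 2^20 + 2^-13 pins the sum's exponent, so the mantissa LSB is
    // worth 2^-31 and the unit bit of x * 2^12 (worth 2^-12) sits 19 bits
    // above it; the low 32 bits of the shifted encoding are then a
    // two's-complement representation of round(x * 2^12).
    int round_times_4096(double x) {
      return static_cast<int>(
          std::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19);
    }

The deleted fast path then resumes with the polynomial evaluation on the reduced argument: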
- double p = poly_approx_d(dx); - - double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. - int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; - double r = - cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(exp_mid.hi + lo)); - return r; -#else - double upper = exp_mid.hi + (lo + ERR_D); - double lower = exp_mid.hi + (lo - ERR_D); - - if (LIBC_LIKELY(upper == lower)) { - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. - int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; - double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper)); - return r; - } - - // Use double-double - DoubleDouble r_dd = exp2_double_double(dx, exp_mid); - - double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); - double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); - - if (LIBC_LIKELY(upper_dd == lower_dd)) { - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. - int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; - double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper_dd)); - return r; - } - - // Use 128-bit precision - Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2); - - return static_cast<double>(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} +LLVM_LIBC_FUNCTION(double, exp2, (double x)) { return math::exp2(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp index c360554..a3d0c1a 100644 --- a/libc/src/math/generic/expm1.cpp +++ b/libc/src/math/generic/expm1.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/expm1.h" -#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. #include "src/__support/CPP/bit.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" @@ -22,6 +21,8 @@ #include "src/__support/integer_literals.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2. +#include "src/__support/math/exp_constants.h" #if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) #define LIBC_MATH_EXPM1_SKIP_ACCURATE_PASS @@ -59,6 +60,8 @@ constexpr double MLOG_2_EXP2_M12_LO = 0x1.b0e2633fe0685p-79; namespace { +using namespace common_constants_internal; + // Polynomial approximations with double precision: // Return expm1(dx) / x ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24. // For |dx| < 2^-13 + 2^-30: diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp index b2967e2..72c8aa3 100644 --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/expm1f.h" -#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. #include "src/__support/FPUtil/BasicOperations.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FMA.h" @@ -20,10 +19,12 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2. 
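Returning briefly to the exp2 fast path deleted above: its "add hi to the exponent field" step is, in isolation, the following. A sketch assuming C++20 and that both the input and the scaled result are normal doubles (the earlier exceptional-case filtering ensures this before the fast path runs); the helper name is hypothetical:

    #include <bit>
    #include <cstdint>

    // Multiply a normal double by 2^hi by adding hi directly to the biased
    // exponent field (bits 52..62 of the IEEE-754 encoding); 52 is the
    // double-precision fraction length.
    double scale_by_exp2(double v, int hi) {
      int64_t bits =
          std::bit_cast<int64_t>(v) + (static_cast<int64_t>(hi) << 52);
      return std::bit_cast<double>(bits);
    }

The expm1f changes continue below.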
namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { + using namespace common_constants_internal; using FPBits = typename fputil::FPBits<float>; FPBits xbits(x); diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp index 0cd4424..66ce059 100644 --- a/libc/src/math/generic/log.cpp +++ b/libc/src/math/generic/log.cpp @@ -18,8 +18,8 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "common_constants.h" #include "log_range_reduction.h" +#include "src/__support/math/common_constants.h" namespace LIBC_NAMESPACE_DECL { @@ -30,6 +30,8 @@ using LIBC_NAMESPACE::operator""_u128; namespace { +using namespace common_constants_internal; + #ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS // A simple upper bound for the error of e_x * log(2) - log(r). constexpr double HI_ERR = 0x1.0p-85; diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp index 1c4e559..95f24fa 100644 --- a/libc/src/math/generic/log10.cpp +++ b/libc/src/math/generic/log10.cpp @@ -18,8 +18,8 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "common_constants.h" #include "log_range_reduction.h" +#include "src/__support/math/common_constants.h" namespace LIBC_NAMESPACE_DECL { @@ -30,6 +30,8 @@ using LIBC_NAMESPACE::operator""_u128; namespace { +using namespace common_constants_internal; + constexpr fputil::DoubleDouble LOG10_E = {0x1.95355baaafad3p-57, 0x1.bcb7b1526e50ep-2}; @@ -739,6 +741,7 @@ double log10_accurate(int e_x, int index, double m_x) { } // namespace LLVM_LIBC_FUNCTION(double, log10, (double x)) { + using namespace common_constants_internal; using FPBits_t = typename fputil::FPBits<double>; FPBits_t xbits(x); diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp index 81e7cdb..6b9cc5d 100644 --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/log10f.h" -#include "common_constants.h" // Lookup table for (1/f) #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FMA.h" #include "src/__support/FPUtil/FPBits.h" @@ -18,6 +17,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/common_constants.h" // Lookup table for (1/f) // This is an algorithm for log10(x) in single precision which is // correctly rounded for all rounding modes, based on the implementation of @@ -104,6 +104,7 @@ static constexpr double LOG10_R[128] = { 0x1.30cb3a7bb3625p-2, 0x1.34413509f79ffp-2}; LLVM_LIBC_FUNCTION(float, log10f, (float x)) { + using namespace common_constants_internal; constexpr double LOG10_2 = 0x1.34413509f79ffp-2; using FPBits = typename fputil::FPBits<float>; diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp index 09f465a..1595981 100644 --- a/libc/src/math/generic/log1p.cpp +++ b/libc/src/math/generic/log1p.cpp @@ -18,7 +18,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "common_constants.h" +#include "src/__support/math/common_constants.h" namespace LIBC_NAMESPACE_DECL { @@ -29,6 +29,8 @@ using LIBC_NAMESPACE::operator""_u128; namespace { +using namespace common_constants_internal; + // R1[i] = 2^-8 * nearestint( 
2^8 / (1 + i * 2^-7) ) constexpr double R1[129] = { 0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.eap-1, diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp index 16b1b34..f0289c2 100644 --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/log1pf.h" -#include "common_constants.h" // Lookup table for (1/f) and log(f) #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FMA.h" #include "src/__support/FPUtil/FPBits.h" @@ -18,6 +17,8 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/acosh_float_constants.h" +#include "src/__support/math/common_constants.h" // Lookup table for (1/f) and log(f) // This is an algorithm for log10(x) in single precision which is // correctly rounded for all rounding modes. @@ -38,6 +39,7 @@ namespace internal { // We don't need to treat denormal and 0 LIBC_INLINE float log(double x) { using namespace acoshf_internal; + using namespace common_constants_internal; constexpr double LOG_2 = 0x1.62e42fefa39efp-1; using FPBits = typename fputil::FPBits<double>; diff --git a/libc/src/math/generic/log2.cpp b/libc/src/math/generic/log2.cpp index 27ca2fc..f0c0ae3 100644 --- a/libc/src/math/generic/log2.cpp +++ b/libc/src/math/generic/log2.cpp @@ -18,8 +18,8 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "common_constants.h" #include "log_range_reduction.h" +#include "src/__support/math/common_constants.h" namespace LIBC_NAMESPACE_DECL { @@ -30,6 +30,8 @@ using LIBC_NAMESPACE::operator""_u128; namespace { +using namespace common_constants_internal; + constexpr fputil::DoubleDouble LOG2_E = {0x1.777d0ffda0d24p-56, 0x1.71547652b82fep0}; @@ -859,6 +861,7 @@ double log2_accurate(int e_x, int index, double m_x) { } // namespace LLVM_LIBC_FUNCTION(double, log2, (double x)) { + using namespace common_constants_internal; using FPBits_t = typename fputil::FPBits<double>; FPBits_t xbits(x); diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp index cff718e..7353f03 100644 --- a/libc/src/math/generic/log2f.cpp +++ b/libc/src/math/generic/log2f.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/log2f.h" -#include "common_constants.h" // Lookup table for (1/f) #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" @@ -15,7 +14,8 @@ #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/common_constants.h" // Lookup table for (1/f) // This is a correctly-rounded algorithm for log2(x) in single precision with // round-to-nearest, tie-to-even mode from the RLIBM project at: @@ -55,6 +55,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, log2f, (float x)) { + using namespace common_constants_internal; using FPBits = typename fputil::FPBits<float>; FPBits xbits(x); diff --git a/libc/src/math/generic/log_range_reduction.h b/libc/src/math/generic/log_range_reduction.h index 
8c94230..7484506 100644 --- a/libc/src/math/generic/log_range_reduction.h +++ b/libc/src/math/generic/log_range_reduction.h @@ -9,9 +9,9 @@ #ifndef LLVM_LIBC_SRC_MATH_GENERIC_LOG_RANGE_REDUCTION_H #define LLVM_LIBC_SRC_MATH_GENERIC_LOG_RANGE_REDUCTION_H -#include "common_constants.h" #include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/macros/config.h" +#include "src/__support/math/common_constants.h" #include "src/__support/uint128.h" namespace LIBC_NAMESPACE_DECL { @@ -36,6 +36,7 @@ struct LogRR { LIBC_INLINE fputil::DyadicFloat<128> log_range_reduction(double m_x, const LogRR &log_table, fputil::DyadicFloat<128> &sum) { + using namespace common_constants_internal; using Float128 = typename fputil::DyadicFloat<128>; using MType = typename Float128::MantissaType; diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp index e8d2ba2..4d2947d 100644 --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/logf.h" -#include "common_constants.h" // Lookup table for (1/f) and log(f) #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" @@ -17,6 +16,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/common_constants.h" // Lookup table for (1/f) and log(f) // This is an algorithm for log(x) in single precision which is correctly // rounded for all rounding modes, based on the implementation of log(x) from @@ -53,6 +53,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, logf, (float x)) { + using namespace common_constants_internal; constexpr double LOG_2 = 0x1.62e42fefa39efp-1; using FPBits = typename fputil::FPBits<float>; diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp index 43e99a7..c9f685b 100644 --- a/libc/src/math/generic/pow.cpp +++ b/libc/src/math/generic/pow.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/pow.h" -#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "src/__support/CPP/bit.h" @@ -21,6 +20,8 @@ #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2. +#include "src/__support/math/exp_constants.h" // Lookup tables EXP_M1 and EXP_M2. namespace LIBC_NAMESPACE_DECL { @@ -28,6 +29,8 @@ using fputil::DoubleDouble; namespace { +using namespace common_constants_internal; + // Constants for log2(x) range reduction, generated by Sollya with: // > for i from 0 to 127 do { // r = 2^-8 * ceil( 2^8 * (1 - 2^(-8)) / (1 + i*2^-7) ); diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp index a45ef51..12246e9 100644 --- a/libc/src/math/generic/powf.cpp +++ b/libc/src/math/generic/powf.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/powf.h" -#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. 
#include "src/__support/CPP/bit.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" @@ -15,10 +14,13 @@ #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/FPUtil/sqrt.h" // Speedup for powf(x, 1/2) = sqrtf(x) +#include "src/__support/FPUtil/triple_double.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2. #include "src/__support/math/exp10f.h" // Speedup for powf(10, y) = exp10f(y) +#include "src/__support/math/exp_constants.h" #include "exp2f_impl.h" // Speedup for powf(2, y) = exp2f(y) @@ -29,6 +31,8 @@ using fputil::TripleDouble; namespace { +using namespace common_constants_internal; + #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS alignas(16) constexpr DoubleDouble LOG2_R_DD[128] = { {0.0, 0.0}, diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt index ea4634c..040f635 100644 --- a/libc/test/shared/CMakeLists.txt +++ b/libc/test/shared/CMakeLists.txt @@ -40,6 +40,7 @@ add_fp_unittest( libc.src.__support.math.exp10m1f16 libc.src.__support.math.erff libc.src.__support.math.exp + libc.src.__support.math.exp2 libc.src.__support.math.exp10 libc.src.__support.math.exp10f libc.src.__support.math.exp10f16 diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp index 1722193..ef2e7b8 100644 --- a/libc/test/shared/shared_math_test.cpp +++ b/libc/test/shared/shared_math_test.cpp @@ -80,6 +80,7 @@ TEST(LlvmLibcSharedMathTest, AllDouble) { EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::cos(0.0)); EXPECT_FP_EQ(0x0p+0, LIBC_NAMESPACE::shared::dsqrtl(0.0)); EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp(0.0)); + EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp2(0.0)); EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp10(0.0)); } diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index eae693b1..f86c736 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -567,8 +567,11 @@ branch. Just make sure to add the release milestone to the pull request. Getting admin access to CI infrastructure ========================================= -Any individual who is responsible for setting up and/or maintaining CI infrastructure for a LLVM project can -request to be granted the CI/CD role to the LLVM organization admins. The request can be made by creating -`a Github issue <https://github.com/llvm/llvm-project/issues/new>`_ and using the ``infrastructure`` label. -Applicants must include a justification for why the role is being requested. Applications are reviewed on a -case-by-case basis by the LLVM admins and the role can be revoked at any point as the LLVM admins see fit. +Any individual who is responsible for setting up and/or maintaining CI +infrastructure for a LLVM project can request to be granted the CI/CD role by +the LLVM infrastructure area team. The request can be made by creating `a +Github issue <https://github.com/llvm/llvm-project/issues/new>`_ and using the +``infrastructure`` label. Applicants must include a justification for why the +role is being requested. Applications are reviewed on a case-by-case basis by +the LLVM infrastructure area team and the role can be revoked at any point as +the area team sees fit. 
diff --git a/llvm/include/llvm/ADT/TypeSwitch.h b/llvm/include/llvm/ADT/TypeSwitch.h index 5bbbdf2..5657303 100644 --- a/llvm/include/llvm/ADT/TypeSwitch.h +++ b/llvm/include/llvm/ADT/TypeSwitch.h @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include <optional> namespace llvm { @@ -117,11 +118,16 @@ public: return defaultResult; } - [[nodiscard]] operator ResultT() { - assert(result && "Fell off the end of a type-switch"); - return std::move(*result); + /// Declare default as unreachable, making sure that all cases were handled. + [[nodiscard]] ResultT DefaultUnreachable( + const char *message = "Fell off the end of a type-switch") { + if (result) + return std::move(*result); + llvm_unreachable(message); } + [[nodiscard]] operator ResultT() { return DefaultUnreachable(); } + private: /// The pointer to the result of this switch statement, once known, /// null before that. @@ -158,6 +164,13 @@ public: defaultFn(this->value); } + /// Declare default as unreachable, making sure that all cases were handled. + void DefaultUnreachable( + const char *message = "Fell off the end of a type-switch") { + if (!foundMatch) + llvm_unreachable(message); + } + private: /// A flag detailing if we have already found a match. bool foundMatch = false; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 7a4abe9..5d3b233 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1551,12 +1551,6 @@ public: OperandValueInfo OpdInfo = {OK_AnyValue, OP_None}, const Instruction *I = nullptr) const; - /// \return The cost of VP Load and Store instructions. - LLVM_ABI InstructionCost getVPMemoryOpCost( - unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, - const Instruction *I = nullptr) const; - /// \return The cost of masked Load and Store instructions. LLVM_ABI InstructionCost getMaskedMemoryOpCost( unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 566e1cf..4cd607c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -841,14 +841,6 @@ public: return 1; } - virtual InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, - unsigned AddressSpace, - TTI::TargetCostKind CostKind, - const Instruction *I) const { - return 1; - } - virtual InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 6c12cd3..7a2e9ad 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -464,10 +464,10 @@ public: const char *getData64bitsDirective() const { return Data64bitsDirective; } bool supportsSignedData() const { return SupportsSignedData; } - /// Targets can implement this method to specify a section to switch to if the - /// translation unit doesn't have any trampolines that require an executable - /// stack. 
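The new TypeSwitch::DefaultUnreachable above turns the old implicit-conversion assert into an explicit, named claim of exhaustiveness. A hypothetical usage sketch; the two Constant cases stand in for whatever case list a real caller handles:

    #include "llvm/ADT/TypeSwitch.h"
    #include "llvm/IR/Constants.h"

    // 'V' is known by construction to be one of the two cases below;
    // DefaultUnreachable both documents and enforces that assumption.
    static int classify(llvm::Constant *V) {
      return llvm::TypeSwitch<llvm::Constant *, int>(V)
          .Case<llvm::ConstantInt>([](llvm::ConstantInt *) { return 0; })
          .Case<llvm::ConstantFP>([](llvm::ConstantFP *) { return 1; })
          .DefaultUnreachable("unexpected constant kind");
    }

Below, the removed getNonexecutableStackSection accessor follows its old documentation: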
- virtual MCSection *getNonexecutableStackSection(MCContext &Ctx) const { + /// Targets can implement this method to specify a section to switch to + /// depending on whether the translation unit has any trampolines that require + /// an executable stack. + virtual MCSection *getStackSection(MCContext &Ctx, bool Exec) const { return nullptr; } diff --git a/llvm/include/llvm/MC/MCAsmInfoELF.h b/llvm/include/llvm/MC/MCAsmInfoELF.h index e067888..095ee4d 100644 --- a/llvm/include/llvm/MC/MCAsmInfoELF.h +++ b/llvm/include/llvm/MC/MCAsmInfoELF.h @@ -15,7 +15,7 @@ namespace llvm { class MCAsmInfoELF : public MCAsmInfo { virtual void anchor(); - MCSection *getNonexecutableStackSection(MCContext &Ctx) const override; + MCSection *getStackSection(MCContext &Ctx, bool Exec) const override; void printSwitchToSection(const MCSection &, uint32_t, const Triple &, raw_ostream &) const final; bool useCodeAlign(const MCSection &Sec) const final; diff --git a/llvm/include/llvm/MC/TargetRegistry.h b/llvm/include/llvm/MC/TargetRegistry.h index 019ee60..570d4c0 100644 --- a/llvm/include/llvm/MC/TargetRegistry.h +++ b/llvm/include/llvm/MC/TargetRegistry.h @@ -389,6 +389,7 @@ public: /// @name Feature Constructors /// @{ + // TODO(boomanaiden154): Remove this function after LLVM 22 branches. [[deprecated("Use overload accepting Triple instead")]] MCAsmInfo *createMCAsmInfo(const MCRegisterInfo &MRI, StringRef TheTriple, const MCTargetOptions &Options) const { @@ -440,6 +441,7 @@ public: return MCInstrAnalysisCtorFn(Info); } + // TODO(boomanaiden154): Remove this function after LLVM 22 branches. [[deprecated("Use overload accepting Triple instead")]] MCRegisterInfo *createMCRegInfo(StringRef TT) const { if (!MCRegInfoCtorFn) @@ -454,6 +456,7 @@ public: return MCRegInfoCtorFn(TT); } + // TODO(boomanaiden154): Remove this function after LLVM 22 branches. [[deprecated("Use overload accepting Triple instead")]] MCSubtargetInfo *createMCSubtargetInfo(StringRef TheTriple, StringRef CPU, StringRef Features) const { @@ -496,16 +499,6 @@ public: JIT); } - [[deprecated("Use overload accepting Triple instead")]] - TargetMachine *createTargetMachine( - StringRef TT, StringRef CPU, StringRef Features, - const TargetOptions &Options, std::optional<Reloc::Model> RM, - std::optional<CodeModel::Model> CM = std::nullopt, - CodeGenOptLevel OL = CodeGenOptLevel::Default, bool JIT = false) const { - return createTargetMachine(Triple(TT), CPU, Features, Options, RM, CM, OL, - JIT); - } - /// createMCAsmBackend - Create a target specific assembly parser. MCAsmBackend *createMCAsmBackend(const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, @@ -599,6 +592,7 @@ public: return nullptr; } + // TODO(boomanaiden154): Remove this function after LLVM 22 branches. [[deprecated("Use overload accepting Triple instead")]] MCRelocationInfo *createMCRelocationInfo(StringRef TT, MCContext &Ctx) const { return createMCRelocationInfo(Triple(TT), Ctx); @@ -616,6 +610,7 @@ public: return Fn(TT, Ctx); } + // TODO(boomanaiden154): Remove this function after LLVM 22 branches. 
[[deprecated("Use overload accepting Triple instead")]] MCSymbolizer * createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo, diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 07f4a8e..0d978d4 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4164,6 +4164,10 @@ static Value *simplifyFCmpInst(CmpPredicate Pred, Value *LHS, Value *RHS, return ConstantInt::get(RetTy, Pred == CmpInst::FCMP_UNO); } + if (std::optional<bool> Res = + isImpliedByDomCondition(Pred, LHS, RHS, Q.CxtI, Q.DL)) + return ConstantInt::getBool(RetTy, *Res); + const APFloat *C = nullptr; match(RHS, m_APFloatAllowPoison(C)); std::optional<KnownFPClass> FullKnownClassLHS; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 1eda7a7..a42c061 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/ConstantFPRange.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -9474,6 +9475,69 @@ isImpliedCondICmps(CmpPredicate LPred, const Value *L0, const Value *L1, return std::nullopt; } +/// Return true if LHS implies RHS (expanded to its components as "R0 RPred R1") +/// is true. Return false if LHS implies RHS is false. Otherwise, return +/// std::nullopt if we can't infer anything. +static std::optional<bool> +isImpliedCondFCmps(FCmpInst::Predicate LPred, const Value *L0, const Value *L1, + FCmpInst::Predicate RPred, const Value *R0, const Value *R1, + const DataLayout &DL, bool LHSIsTrue) { + // The rest of the logic assumes the LHS condition is true. If that's not the + // case, invert the predicate to make it so. + if (!LHSIsTrue) + LPred = FCmpInst::getInversePredicate(LPred); + + // We can have non-canonical operands, so try to normalize any common operand + // to L0/R0. + if (L0 == R1) { + std::swap(R0, R1); + RPred = FCmpInst::getSwappedPredicate(RPred); + } + if (R0 == L1) { + std::swap(L0, L1); + LPred = FCmpInst::getSwappedPredicate(LPred); + } + if (L1 == R1) { + // If we have L0 == R0 and L1 == R1, then make L1/R1 the constants. + if (L0 != R0 || match(L0, m_ImmConstant())) { + std::swap(L0, L1); + LPred = ICmpInst::getSwappedCmpPredicate(LPred); + std::swap(R0, R1); + RPred = ICmpInst::getSwappedCmpPredicate(RPred); + } + } + + // Can we infer anything when the two compares have matching operands? + if (L0 == R0 && L1 == R1) { + if ((LPred & RPred) == LPred) + return true; + if ((LPred & ~RPred) == LPred) + return false; + } + + // See if we can infer anything if operand-0 matches and we have at least one + // constant. + const APFloat *L1C, *R1C; + if (L0 == R0 && match(L1, m_APFloat(L1C)) && match(R1, m_APFloat(R1C))) { + if (std::optional<ConstantFPRange> DomCR = + ConstantFPRange::makeExactFCmpRegion(LPred, *L1C)) { + if (std::optional<ConstantFPRange> ImpliedCR = + ConstantFPRange::makeExactFCmpRegion(RPred, *R1C)) { + if (ImpliedCR->contains(*DomCR)) + return true; + } + if (std::optional<ConstantFPRange> ImpliedCR = + ConstantFPRange::makeExactFCmpRegion( + FCmpInst::getInversePredicate(RPred), *R1C)) { + if (ImpliedCR->contains(*DomCR)) + return false; + } + } + } + + return std::nullopt; +} + /// Return true if LHS implies RHS is true. Return false if LHS implies RHS is /// false. Otherwise, return std::nullopt if we can't infer anything. 
We
 /// expect the RHS to be an icmp and the LHS to be an 'and', 'or', or a 'select'
@@ -9529,15 +9593,24 @@ llvm::isImpliedCondition(const Value *LHS, CmpPredicate RHSPred,
     LHSIsTrue = !LHSIsTrue;
 
   // Both LHS and RHS are icmps.
-  if (const auto *LHSCmp = dyn_cast<ICmpInst>(LHS))
-    return isImpliedCondICmps(LHSCmp->getCmpPredicate(), LHSCmp->getOperand(0),
-                              LHSCmp->getOperand(1), RHSPred, RHSOp0, RHSOp1,
-                              DL, LHSIsTrue);
-  const Value *V;
-  if (match(LHS, m_NUWTrunc(m_Value(V))))
-    return isImpliedCondICmps(CmpInst::ICMP_NE, V,
-                              ConstantInt::get(V->getType(), 0), RHSPred,
-                              RHSOp0, RHSOp1, DL, LHSIsTrue);
+  if (RHSOp0->getType()->getScalarType()->isIntOrPtrTy()) {
+    if (const auto *LHSCmp = dyn_cast<ICmpInst>(LHS))
+      return isImpliedCondICmps(LHSCmp->getCmpPredicate(),
+                                LHSCmp->getOperand(0), LHSCmp->getOperand(1),
+                                RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue);
+    const Value *V;
+    if (match(LHS, m_NUWTrunc(m_Value(V))))
+      return isImpliedCondICmps(CmpInst::ICMP_NE, V,
+                                ConstantInt::get(V->getType(), 0), RHSPred,
+                                RHSOp0, RHSOp1, DL, LHSIsTrue);
+  } else {
+    assert(RHSOp0->getType()->isFPOrFPVectorTy() &&
+           "Expected floating point type only!");
+    if (const auto *LHSCmp = dyn_cast<FCmpInst>(LHS))
+      return isImpliedCondFCmps(LHSCmp->getPredicate(), LHSCmp->getOperand(0),
+                                LHSCmp->getOperand(1), RHSPred, RHSOp0, RHSOp1,
+                                DL, LHSIsTrue);
+  }
 
   /// The LHS should be an 'or', 'and', or a 'select' instruction. We expect
   /// the RHS to be an icmp.
@@ -9574,6 +9647,13 @@ std::optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
       return InvertRHS ? !*Implied : *Implied;
     return std::nullopt;
   }
+  if (const FCmpInst *RHSCmp = dyn_cast<FCmpInst>(RHS)) {
+    if (auto Implied = isImpliedCondition(
+            LHS, RHSCmp->getPredicate(), RHSCmp->getOperand(0),
+            RHSCmp->getOperand(1), DL, LHSIsTrue, Depth))
+      return InvertRHS ? !*Implied : *Implied;
+    return std::nullopt;
+  }
 
   const Value *V;
   if (match(RHS, m_NUWTrunc(m_Value(V)))) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 11efe49..10df9c1 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2866,9 +2866,11 @@ bool AsmPrinter::doFinalization(Module &M) {
   // If we don't have any trampolines, then we don't require stack memory
   // to be executable. Some targets have a directive to declare this.
   Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
-  if (!InitTrampolineIntrinsic || InitTrampolineIntrinsic->use_empty())
-    if (MCSection *S = MAI->getNonexecutableStackSection(OutContext))
-      OutStreamer->switchSection(S);
+  bool HasTrampolineUses =
+      InitTrampolineIntrinsic && !InitTrampolineIntrinsic->use_empty();
+  MCSection *S = MAI->getStackSection(OutContext, /*Exec=*/HasTrampolineUses);
+  if (S)
+    OutStreamer->switchSection(S);
 
   if (TM.Options.EmitAddrsig) {
     // Emit address-significance attributes for all globals.
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 2f3b7a2..3c41bbe 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1657,12 +1657,6 @@ bool TargetInstrInfo::isReMaterializableImpl(
     // same virtual register, though.
     if (MO.isDef() && Reg != DefReg)
       return false;
-
-    // Don't allow any virtual-register uses. Rematting an instruction with
-    // virtual register uses would lengthen the live ranges of the uses, which
-    // is not necessarily a good idea, certainly not "trivial".
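The ConstantFPRange containment test used by isImpliedCondFCmps above can be exercised directly. A sketch using only calls that appear in the hunk; for instance, a dominating x > 5.0 implies that x > 3.0 is true:

    #include "llvm/ADT/APFloat.h"
    #include "llvm/IR/ConstantFPRange.h"
    #include "llvm/IR/Instructions.h"
    #include <optional>

    static bool impliesTrueExample() {
      using namespace llvm;
      // Exact fcmp regions: (5.0, +inf] for the dominating condition and
      // (3.0, +inf] for the implied one; containment proves the implication,
      // provided both exact regions exist.
      std::optional<ConstantFPRange> Dom =
          ConstantFPRange::makeExactFCmpRegion(FCmpInst::FCMP_OGT, APFloat(5.0));
      std::optional<ConstantFPRange> Imp =
          ConstantFPRange::makeExactFCmpRegion(FCmpInst::FCMP_OGT, APFloat(3.0));
      return Dom && Imp && Imp->contains(*Dom);
    }

The removed TargetInstrInfo lines resume below.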
- if (MO.isUse()) - return false; } // Everything checked out. diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 8b5965b..df0c85b 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -2994,6 +2994,8 @@ LLVMValueRef LLVMIsATerminatorInst(LLVMValueRef Inst) { LLVMDbgRecordRef LLVMGetFirstDbgRecord(LLVMValueRef Inst) { Instruction *Instr = unwrap<Instruction>(Inst); + if (!Instr->DebugMarker) + return nullptr; auto I = Instr->DebugMarker->StoredDbgRecords.begin(); if (I == Instr->DebugMarker->StoredDbgRecords.end()) return nullptr; @@ -3002,6 +3004,8 @@ LLVMDbgRecordRef LLVMGetFirstDbgRecord(LLVMValueRef Inst) { LLVMDbgRecordRef LLVMGetLastDbgRecord(LLVMValueRef Inst) { Instruction *Instr = unwrap<Instruction>(Inst); + if (!Instr->DebugMarker) + return nullptr; auto I = Instr->DebugMarker->StoredDbgRecords.rbegin(); if (I == Instr->DebugMarker->StoredDbgRecords.rend()) return nullptr; diff --git a/llvm/lib/MC/MCAsmInfoELF.cpp b/llvm/lib/MC/MCAsmInfoELF.cpp index cdae9d7..98090d3 100644 --- a/llvm/lib/MC/MCAsmInfoELF.cpp +++ b/llvm/lib/MC/MCAsmInfoELF.cpp @@ -27,12 +27,13 @@ using namespace llvm; void MCAsmInfoELF::anchor() {} -MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const { +MCSection *MCAsmInfoELF::getStackSection(MCContext &Ctx, bool Exec) const { // Solaris doesn't know/doesn't care about .note.GNU-stack sections, so // don't emit them. if (Ctx.getTargetTriple().isOSSolaris()) return nullptr; - return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0); + return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, + Exec ? ELF::SHF_EXECINSTR : 0U); } bool MCAsmInfoELF::useCodeAlign(const MCSection &Sec) const { diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index 2881d7c..1bc1b92 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -54,7 +54,7 @@ void MCELFStreamer::initSections(bool NoExecStack, const MCSubtargetInfo &STI) { &STI); if (NoExecStack) - switchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx)); + switchSection(Ctx.getAsmInfo()->getStackSection(Ctx, /*Exec=*/false)); } void MCELFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) { diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td index 54e27a6..f4326de 100644 --- a/llvm/lib/Target/ARM/ARMInstrCDE.td +++ b/llvm/lib/Target/ARM/ARMInstrCDE.td @@ -268,6 +268,7 @@ class CDE_Vec_Instr<bit acc, dag oops, dag iops, string asm, string cstr, !con(iops, (ins vpred:$vp)), asm, !strconcat(cstr, vpred.vpred_constraint)>, CDE_RequiresQReg { + bits<0> vp; } diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 9dffd94..e244134 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -409,6 +409,7 @@ class MVE_p<dag oops, dag iops, InstrItinClass itin, string iname, !strconcat(iname, "${vp}", !if(!eq(suffix, ""), "", !strconcat(".", suffix))), ops, !strconcat(cstr, vpred.vpred_constraint), vecsize, pattern> { + bits<0> vp; let Inst{31-29} = 0b111; let Inst{27-26} = 0b11; } diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index d358913..e67db8e 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -639,6 +639,43 @@ static DecodeStatus DecodeCCOutOperand(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeVpredNOperand(MCInst &Inst, + const 
MCDisassembler *Decoder) { + const auto *D = static_cast<const ARMDisassembler *>(Decoder); + unsigned VCC = D->VPTBlock.getVPTPred(); + MCRegister CondReg = VCC == ARMVCC::None ? ARM::NoRegister : ARM::P0; + + Inst.addOperand(MCOperand::createImm(VCC)); // $cond + Inst.addOperand(MCOperand::createReg(CondReg)); // $cond_reg + Inst.addOperand(MCOperand::createReg(ARM::NoRegister)); // $tp_reg + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeVpredROperand(MCInst &Inst, + const MCDisassembler *Decoder) { + const auto *D = static_cast<const ARMDisassembler *>(Decoder); + unsigned VCC = D->VPTBlock.getVPTPred(); + MCRegister CondReg = VCC == ARMVCC::None ? ARM::NoRegister : ARM::P0; + + Inst.addOperand(MCOperand::createImm(VCC)); // $cond + Inst.addOperand(MCOperand::createReg(CondReg)); // $cond_reg + Inst.addOperand(MCOperand::createReg(ARM::NoRegister)); // $tp_reg + + // The last sub-operand ($inactive) is tied to an output operand. + // The output operand has already been decoded, so just copy it. + const MCInstrDesc &MCID = D->MCII->get(Inst.getOpcode()); + unsigned InactiveOpIdx = Inst.getNumOperands(); + int TiedOpIdx = MCID.getOperandConstraint(InactiveOpIdx, MCOI::TIED_TO); + assert(TiedOpIdx >= 0 && + "Inactive register in vpred_r is not tied to an output!"); + + // Make a copy of the operand to ensure it is not invalidated when MI grows. + Inst.addOperand(MCOperand(Inst.getOperand(TiedOpIdx))); // $inactive + + return MCDisassembler::Success; +} + static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -2777,6 +2814,7 @@ static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst, unsigned Insn, Inst.addOperand(MCOperand::createImm(imm)); + Check(S, DecodeVpredROperand(Inst, Decoder)); return S; } @@ -2802,6 +2840,7 @@ static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn, if (!fieldFromInstruction(Insn, 12, 1)) // I bit clear => need input FPSCR Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV)); + Check(S, DecodeVpredROperand(Inst, Decoder)); return S; } @@ -5466,30 +5505,6 @@ static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - // The vpred_r operand type includes an MQPR register field derived - // from the encoding. But we don't actually want to add an operand - // to the MCInst at this stage, because AddThumbPredicate will do it - // later, and will infer the register number from the TIED_TO - // constraint. So this is a deliberately empty decoder method that - // will inhibit the auto-generated disassembly code from adding an - // operand at all. - return MCDisassembler::Success; -} - -[[maybe_unused]] static DecodeStatus -DecodeVpredNOperand(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder) { - // Similar to above, we want to ensure that no operands are added for the - // vpred operands. (This is marked "maybe_unused" for the moment; because - // DecoderEmitter currently (wrongly) omits operands with no instruction bits, - // the decoder doesn't actually call it yet. That will be addressed in a - // future change.) 
- return MCDisassembler::Success; -} - static DecodeStatus DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -5668,6 +5683,7 @@ DecodeMVE_MEM_pre(MCInst &Inst, unsigned Val, uint64_t Address, if (!Check(S, AddrDecoder(Inst, addr, Address, Decoder))) return MCDisassembler::Fail; + Check(S, DecodeVpredNOperand(Inst, Decoder)); return S; } @@ -5871,7 +5887,7 @@ static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; if (!Check(S, DecodeVCVTImmOperand(Inst, imm6, Address, Decoder))) return MCDisassembler::Fail; - + Check(S, DecodeVpredROperand(Inst, Decoder)); return S; } @@ -5906,6 +5922,7 @@ static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address, if (!Check(S, predicate_decoder(Inst, fc, Address, Decoder))) return MCDisassembler::Fail; + Check(S, DecodeVpredNOperand(Inst, Decoder)); return S; } @@ -5916,6 +5933,7 @@ static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, unsigned Rn = fieldFromInstruction(Insn, 16, 4); if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder))) return MCDisassembler::Fail; + Check(S, DecodeVpredNOperand(Inst, Decoder)); return S; } @@ -5925,6 +5943,7 @@ static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; Inst.addOperand(MCOperand::createReg(ARM::VPR)); Inst.addOperand(MCOperand::createReg(ARM::VPR)); + Check(S, DecodeVpredNOperand(Inst, Decoder)); return S; } @@ -6199,15 +6218,13 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const { (isVectorPredicable(MI) && ITBlock.instrInITBlock())) S = SoftFail; - // If we're in an IT/VPT block, base the predicate on that. Otherwise, + // If we're in an IT block, base the predicate on that. Otherwise, // assume a predicate of AL. unsigned CC = ARMCC::AL; - unsigned VCC = ARMVCC::None; if (ITBlock.instrInITBlock()) { CC = ITBlock.getITCC(); ITBlock.advanceITState(); } else if (VPTBlock.instrInVPTBlock()) { - VCC = VPTBlock.getVPTPred(); VPTBlock.advanceVPTState(); } @@ -6230,34 +6247,6 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const { Check(S, SoftFail); } - MCInst::iterator VCCI = MI.begin(); - unsigned VCCPos; - for (VCCPos = 0; VCCPos < MCID.NumOperands; ++VCCPos, ++VCCI) { - if (ARM::isVpred(MCID.operands()[VCCPos].OperandType) || VCCI == MI.end()) - break; - } - - if (isVectorPredicable(MI)) { - VCCI = MI.insert(VCCI, MCOperand::createImm(VCC)); - ++VCCI; - if (VCC == ARMVCC::None) - VCCI = MI.insert(VCCI, MCOperand::createReg(0)); - else - VCCI = MI.insert(VCCI, MCOperand::createReg(ARM::P0)); - ++VCCI; - VCCI = MI.insert(VCCI, MCOperand::createReg(0)); - ++VCCI; - if (MCID.operands()[VCCPos].OperandType == ARM::OPERAND_VPRED_R) { - int TiedOp = MCID.getOperandConstraint(VCCPos + 3, MCOI::TIED_TO); - assert(TiedOp >= 0 && - "Inactive register in vpred_r is not tied to an output!"); - // Copy the operand to ensure it's not invalidated when MI grows. 
- MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp))); - } - } else if (VCC != ARMVCC::None) { - Check(S, SoftFail); - } - return S; } diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index dfd896f..8d8066a 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -49,7 +49,7 @@ public: DwarfUsesRelocationsAcrossSections = enable; } - MCSection *getNonexecutableStackSection(MCContext &Ctx) const override { + MCSection *getStackSection(MCContext &Ctx, bool Exec) const override { return nullptr; } }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 3df448d..8f60e50 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/OverflowInstAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" @@ -42,6 +43,7 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include <cassert> +#include <optional> #include <utility> #define DEBUG_TYPE "instcombine" @@ -1451,10 +1453,16 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, return nullptr; }; - if (Instruction *R = ReplaceOldOpWithNewOp(CmpLHS, CmpRHS)) - return R; - if (Instruction *R = ReplaceOldOpWithNewOp(CmpRHS, CmpLHS)) - return R; + bool CanReplaceCmpLHSWithRHS = canReplacePointersIfEqual(CmpLHS, CmpRHS, DL); + if (CanReplaceCmpLHSWithRHS) { + if (Instruction *R = ReplaceOldOpWithNewOp(CmpLHS, CmpRHS)) + return R; + } + bool CanReplaceCmpRHSWithLHS = canReplacePointersIfEqual(CmpRHS, CmpLHS, DL); + if (CanReplaceCmpRHSWithLHS) { + if (Instruction *R = ReplaceOldOpWithNewOp(CmpRHS, CmpLHS)) + return R; + } auto *FalseInst = dyn_cast<Instruction>(FalseVal); if (!FalseInst) @@ -1469,12 +1477,14 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, // Example: // (X == 42) ? 43 : (X + 1) --> (X == 42) ? 
(X + 1) : (X + 1) --> X + 1 SmallVector<Instruction *> DropFlags; - if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, SQ, - /* AllowRefinement */ false, - &DropFlags) == TrueVal || - simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, SQ, - /* AllowRefinement */ false, - &DropFlags) == TrueVal) { + if ((CanReplaceCmpLHSWithRHS && + simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, SQ, + /* AllowRefinement */ false, + &DropFlags) == TrueVal) || + (CanReplaceCmpRHSWithLHS && + simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, SQ, + /* AllowRefinement */ false, + &DropFlags) == TrueVal)) { for (Instruction *I : DropFlags) { I->dropPoisonGeneratingAnnotations(); Worklist.add(I); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll index ed68723..41f7ab8 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll @@ -1219,14 +1219,14 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; ; GISEL-LABEL: test_shl_i1024: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: sub sp, sp, #416 -; GISEL-NEXT: stp x28, x27, [sp, #320] ; 16-byte Folded Spill -; GISEL-NEXT: stp x26, x25, [sp, #336] ; 16-byte Folded Spill -; GISEL-NEXT: stp x24, x23, [sp, #352] ; 16-byte Folded Spill -; GISEL-NEXT: stp x22, x21, [sp, #368] ; 16-byte Folded Spill -; GISEL-NEXT: stp x20, x19, [sp, #384] ; 16-byte Folded Spill -; GISEL-NEXT: stp x29, x30, [sp, #400] ; 16-byte Folded Spill -; GISEL-NEXT: .cfi_def_cfa_offset 416 +; GISEL-NEXT: sub sp, sp, #432 +; GISEL-NEXT: stp x28, x27, [sp, #336] ; 16-byte Folded Spill +; GISEL-NEXT: stp x26, x25, [sp, #352] ; 16-byte Folded Spill +; GISEL-NEXT: stp x24, x23, [sp, #368] ; 16-byte Folded Spill +; GISEL-NEXT: stp x22, x21, [sp, #384] ; 16-byte Folded Spill +; GISEL-NEXT: stp x20, x19, [sp, #400] ; 16-byte Folded Spill +; GISEL-NEXT: stp x29, x30, [sp, #416] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 432 ; GISEL-NEXT: .cfi_offset w30, -8 ; GISEL-NEXT: .cfi_offset w29, -16 ; GISEL-NEXT: .cfi_offset w19, -24 @@ -1242,38 +1242,44 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: ldp x10, x11, [x1] ; GISEL-NEXT: mov w8, w2 ; GISEL-NEXT: lsr x9, x8, #6 -; GISEL-NEXT: and x16, x8, #0x3f +; GISEL-NEXT: and x12, x8, #0x3f +; GISEL-NEXT: str x0, [sp, #144] ; 8-byte Folded Spill +; GISEL-NEXT: and x14, x8, #0x3f ; GISEL-NEXT: mov w13, #64 ; =0x40 -; GISEL-NEXT: sub x21, x13, x16 -; GISEL-NEXT: str x0, [sp, #112] ; 8-byte Folded Spill -; GISEL-NEXT: mov x24, x16 -; GISEL-NEXT: lsl x25, x10, x16 +; GISEL-NEXT: and x16, x8, #0x3f +; GISEL-NEXT: lsl x0, x10, x12 ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: lsr x26, x10, x21 -; GISEL-NEXT: lsl x2, x11, x16 -; GISEL-NEXT: lsr x23, x11, x21 -; GISEL-NEXT: mov x22, x21 -; GISEL-NEXT: csel x12, x25, xzr, eq +; GISEL-NEXT: sub x2, x13, x14 +; GISEL-NEXT: lsr x3, x10, x2 +; GISEL-NEXT: lsl x6, x11, x14 +; GISEL-NEXT: and x14, x8, #0x3f +; GISEL-NEXT: csel x12, x0, xzr, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: str x1, [sp, #312] ; 8-byte Folded Spill +; GISEL-NEXT: lsr x20, x11, x2 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: str x23, [sp, #208] ; 8-byte Folded Spill +; GISEL-NEXT: mov x24, x0 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: stp x24, x22, [sp, #40] ; 16-byte Folded Spill +; GISEL-NEXT: mov x7, x3 ; GISEL-NEXT: csel x12, xzr, x12, eq ; 
GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: mov x28, x1 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: and x21, x8, #0x3f ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: str x6, [sp, #24] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: str x28, [sp, #304] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: str x7, [sp, #272] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: str x20, [sp, #112] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #10 ; GISEL-NEXT: csel x12, xzr, x12, eq @@ -1290,13 +1296,13 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x10, x10, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x10, [sp, #192] ; 8-byte Folded Spill -; GISEL-NEXT: csel x10, xzr, x26, eq +; GISEL-NEXT: str x10, [sp, #232] ; 8-byte Folded Spill +; GISEL-NEXT: csel x10, xzr, x3, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x10, x2, x10 +; GISEL-NEXT: orr x10, x6, x10 ; GISEL-NEXT: csel x10, x10, xzr, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: csel x10, x25, x10, eq +; GISEL-NEXT: csel x10, x0, x10, eq ; GISEL-NEXT: cmp x9, #2 ; GISEL-NEXT: csel x10, xzr, x10, eq ; GISEL-NEXT: cmp x9, #3 @@ -1327,25 +1333,24 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x9, #15 ; GISEL-NEXT: csel x13, xzr, x13, eq ; GISEL-NEXT: cmp x8, #0 -; GISEL-NEXT: lsl x20, x12, x16 +; GISEL-NEXT: lsl x26, x12, x14 ; GISEL-NEXT: csel x11, x11, x13, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill -; GISEL-NEXT: csel x11, xzr, x23, eq +; GISEL-NEXT: str x11, [sp, #224] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x20, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x11, x20, x11 -; GISEL-NEXT: lsr x15, x12, x21 -; GISEL-NEXT: lsl x14, x10, x16 +; GISEL-NEXT: orr x11, x26, x11 +; GISEL-NEXT: lsr x15, x12, x2 +; GISEL-NEXT: lsl x30, x10, x16 ; GISEL-NEXT: csel x11, x11, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: lsr x17, x10, x21 -; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: lsr x17, x10, x2 +; GISEL-NEXT: csel x13, xzr, x3, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: str x20, [sp, #8] ; 8-byte Folded Spill -; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: orr x13, x6, x13 ; GISEL-NEXT: csel x11, x13, x11, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: csel x11, x0, x11, eq ; GISEL-NEXT: cmp x9, #3 ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #4 @@ -1375,23 +1380,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x11, [sp, #176] ; 8-byte Folded Spill +; GISEL-NEXT: str x11, [sp, #216] ; 8-byte Folded Spill ; GISEL-NEXT: csel x11, xzr, x15, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x11, x14, x11 +; GISEL-NEXT: orr x11, x30, x11 ; GISEL-NEXT: csel x11, x11, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: csel x12, xzr, x20, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: orr x12, x26, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x26, eq +; GISEL-NEXT: csel x12, xzr, x3, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: orr x12, x2, x12 +; 
GISEL-NEXT: orr x12, x6, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x0, x11, eq
 ; GISEL-NEXT: cmp x9, #4
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #5
@@ -1421,33 +1426,33 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: lsl x0, x12, x16
 ; GISEL-NEXT: csel x10, x10, x13, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #168] ; 8-byte Folded Spill
+; GISEL-NEXT: str x10, [sp, #208] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x10, xzr, x17, eq
 ; GISEL-NEXT: cmp x9, #0
 ; GISEL-NEXT: orr x10, x0, x10
-; GISEL-NEXT: lsr x27, x12, x21
+; GISEL-NEXT: lsr x4, x12, x2
 ; GISEL-NEXT: lsl x19, x11, x16
 ; GISEL-NEXT: csel x10, x10, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsr x3, x11, x21
+; GISEL-NEXT: mov x16, x15
 ; GISEL-NEXT: csel x13, xzr, x15, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: stp x27, x0, [sp, #240] ; 16-byte Folded Spill
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: mov x7, x3
+; GISEL-NEXT: str x4, [sp, #248] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: str x0, [sp, #48] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: csel x10, x25, x10, eq
+; GISEL-NEXT: csel x10, x24, x10, eq
 ; GISEL-NEXT: cmp x9, #5
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x9, #6
@@ -1473,8 +1478,8 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x10, x12, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #160] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x10, xzr, x27, eq
+; GISEL-NEXT: str x10, [sp, #200] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x10, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #0
 ; GISEL-NEXT: orr x10, x19, x10
 ; GISEL-NEXT: csel x10, x10, xzr, eq
@@ -1486,20 +1491,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x12, xzr, x15, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: and x15, x8, #0x3f
+; GISEL-NEXT: orr x12, x30, x12
 ; GISEL-NEXT: csel x10, x12, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x12, x20, x12
+; GISEL-NEXT: orr x12, x26, x12
 ; GISEL-NEXT: csel x10, x12, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: lsr x3, x11, x2
+; GISEL-NEXT: orr x12, x6, x12
 ; GISEL-NEXT: csel x10, x12, x10, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: csel x10, x25, x10, eq
+; GISEL-NEXT: csel x10, x24, x10, eq
 ; GISEL-NEXT: cmp x9, #6
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x9, #7
@@ -1522,21 +1529,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x13, xzr, x13, eq
 ; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: lsl x4, x12, x16
+; GISEL-NEXT: lsl x22, x12, x15
 ; GISEL-NEXT: csel x11, x11, x13, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #192] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x11, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x4, x11
-; GISEL-NEXT: lsl x30, x10, x16
-; GISEL-NEXT: lsr x28, x10, x21
+; GISEL-NEXT: orr x11, x22, x11
+; GISEL-NEXT: lsl x5, x10, x15
+; GISEL-NEXT: lsr x27, x10, x2
 ; GISEL-NEXT: csel x11, x11, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x30, [sp, #200] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x25, x27
 ; GISEL-NEXT: orr x13, x19, x13
+; GISEL-NEXT: mov x14, x5
+; GISEL-NEXT: str x27, [sp, #328] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x11, x13, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x13, xzr, x17, eq
@@ -1544,30 +1553,29 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: orr x13, x0, x13
 ; GISEL-NEXT: csel x11, x13, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: orr x13, x30, x13
 ; GISEL-NEXT: csel x11, x13, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
 ; GISEL-NEXT: csel x11, x13, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
 ; GISEL-NEXT: csel x11, x13, x11, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: lsr x13, x12, x21
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: lsr x13, x12, x2
+; GISEL-NEXT: csel x11, x24, x11, eq
 ; GISEL-NEXT: cmp x9, #7
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: mov x6, x13
+; GISEL-NEXT: mov x15, x13
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: str x6, [sp, #256] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #10
 ; GISEL-NEXT: csel x11, xzr, x11, eq
@@ -1584,18 +1592,18 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #144] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x11, xzr, x13, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x30, x11
+; GISEL-NEXT: orr x11, x5, x11
 ; GISEL-NEXT: csel x11, x11, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x12, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x12, x4, x12
+; GISEL-NEXT: orr x12, x22, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x27, eq
+; GISEL-NEXT: csel x12, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #2
 ; GISEL-NEXT: orr x12, x19, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
@@ -1605,22 +1613,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: orr x12, x0, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: orr x12, x30, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x12, x20, x12
+; GISEL-NEXT: orr x12, x26, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: orr x12, x6, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x24, x11, eq
 ; GISEL-NEXT: cmp x9, #8
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #9
@@ -1635,39 +1643,34 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #14
 ; GISEL-NEXT: csel x12, xzr, x11, eq
-; GISEL-NEXT: ldp x11, x5, [x1, #64]
+; GISEL-NEXT: ldp x11, x1, [x1, #64]
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x12, x10, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsl x21, x11, x16
-; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x12, xzr, x28, eq
+; GISEL-NEXT: lsl x23, x11, x21
+; GISEL-NEXT: str x12, [sp, #176] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x27, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: lsr x10, x11, x22
-; GISEL-NEXT: mov x16, x19
+; GISEL-NEXT: orr x12, x23, x12
+; GISEL-NEXT: lsr x21, x11, x2
+; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, x12, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: mov x1, x16
 ; GISEL-NEXT: csel x13, xzr, x13, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x16, [sp, #304] ; 8-byte Folded Spill
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: orr x13, x5, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x13, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: lsl x3, x5, x24
-; GISEL-NEXT: orr x13, x4, x13
+; GISEL-NEXT: orr x13, x22, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: stp x21, x3, [sp, #216] ; 16-byte Folded Spill
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #3
 ; GISEL-NEXT: orr x13, x19, x13
-; GISEL-NEXT: mov x19, x28
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x13, xzr, x17, eq
@@ -1675,27 +1678,30 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: orr x13, x0, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: orr x13, x30, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: csel x12, x25, x12, eq
+; GISEL-NEXT: and x13, x8, #0x3f
+; GISEL-NEXT: csel x12, x24, x12, eq
 ; GISEL-NEXT: cmp x9, #9
+; GISEL-NEXT: lsl x10, x1, x13
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x9, #10
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x9, #11
+; GISEL-NEXT: stp x10, x15, [sp, #312] ; 16-byte Folded Spill
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x9, #12
 ; GISEL-NEXT: csel x12, xzr, x12, eq
@@ -1708,69 +1714,69 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x11, x11, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #128] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x11, xzr, x10, eq
+; GISEL-NEXT: str x11, [sp, #168] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x11, xzr, x21, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x10, x11
+; GISEL-NEXT: mov x10, x23
 ; GISEL-NEXT: csel x11, x11, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x28, eq
+; GISEL-NEXT: csel x12, xzr, x27, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: mov x28, x4
-; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: str x28, [sp, #32] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x27, x24
+; GISEL-NEXT: orr x12, x23, x12
+; GISEL-NEXT: mov x23, x15
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x6, eq
+; GISEL-NEXT: csel x12, xzr, x15, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x12, x30, x12
+; GISEL-NEXT: mov x15, x22
+; GISEL-NEXT: orr x12, x5, x12
+; GISEL-NEXT: mov x5, x3
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x7, eq
+; GISEL-NEXT: stp x14, x5, [sp, #256] ; 16-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x12, x4, x12
-; GISEL-NEXT: mov x4, x20
+; GISEL-NEXT: mov x5, x4
+; GISEL-NEXT: orr x12, x22, x12
+; GISEL-NEXT: lsr x22, x1, x2
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x27, eq
+; GISEL-NEXT: csel x12, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x27, x2
-; GISEL-NEXT: orr x12, x16, x12
-; GISEL-NEXT: mov x16, x17
+; GISEL-NEXT: str x22, [sp, #240] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x19, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x12, xzr, x17, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: mov x17, x15
 ; GISEL-NEXT: orr x12, x0, x12
-; GISEL-NEXT: lsr x0, x5, x22
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x12, x14, x12
-; GISEL-NEXT: str x0, [sp, #280] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x30, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: mov x23, x25
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x26, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x12, x2, x12
-; GISEL-NEXT: mov x2, x3
+; GISEL-NEXT: mov x7, x14
+; GISEL-NEXT: orr x12, x6, x12
+; GISEL-NEXT: mov x6, x28
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x24, x11, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: mov x25, x26
+; GISEL-NEXT: ldr x24, [x6, #88]
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #11
+; GISEL-NEXT: ldr x6, [sp, #272] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #12
 ; GISEL-NEXT: csel x11, xzr, x11, eq
@@ -1780,80 +1786,84 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x12, xzr, x11, eq
+; GISEL-NEXT: ldr x11, [x28, #80]
 ; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x12, x5, x12, eq
-; GISEL-NEXT: ldp x11, x5, [x15, #80]
+; GISEL-NEXT: csel x12, x1, x12, eq
+; GISEL-NEXT: mov x28, x2
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x12, [sp, #120] ; 8-byte Folded Spill
-; GISEL-NEXT: mov x15, x7
-; GISEL-NEXT: csel x12, xzr, x0, eq
+; GISEL-NEXT: lsl x2, x11, x13
+; GISEL-NEXT: str x12, [sp, #160] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x22, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: str x15, [sp, #24] ; 8-byte Folded Spill
-; GISEL-NEXT: lsl x20, x11, x24
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: str x20, [sp, #232] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: str x28, [sp, #16] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: str x2, [sp, #280] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, x12, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x10, eq
+; GISEL-NEXT: csel x13, xzr, x21, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x13, x3, x13
-; GISEL-NEXT: lsl x3, x5, x24
+; GISEL-NEXT: orr x13, x1, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x19, eq
+; GISEL-NEXT: csel x13, xzr, x25, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: stp x19, x3, [sp, #264] ; 16-byte Folded Spill
-; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: mov x25, x16
+; GISEL-NEXT: orr x13, x10, x13
+; GISEL-NEXT: mov x10, x30
+; GISEL-NEXT: str x25, [sp, #80] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x6, eq
+; GISEL-NEXT: csel x13, xzr, x23, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: mov x23, x3
+; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: mov x14, x17
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
-; GISEL-NEXT: ldp x7, x30, [sp, #240] ; 16-byte Folded Reload
+; GISEL-NEXT: stp x19, x14, [sp, #64] ; 16-byte Folded Spill
+; GISEL-NEXT: csel x13, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x13, x28, x13
+; GISEL-NEXT: mov x3, x21
+; GISEL-NEXT: orr x13, x15, x13
+; GISEL-NEXT: str x3, [sp, #32] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x1, x13
-; GISEL-NEXT: mov x1, x14
+; GISEL-NEXT: mov x4, x0
+; GISEL-NEXT: orr x13, x19, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x16, eq
+; GISEL-NEXT: csel x13, xzr, x17, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: mov x17, x27
+; GISEL-NEXT: orr x13, x0, x13
+; GISEL-NEXT: ldr x0, [sp, #24] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x17, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: ldr x14, [sp, #208] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: ldp x30, x16, [sp, #320] ; 16-byte Folded Reload
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x14, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x13, x4, x13
-; GISEL-NEXT: mov x4, x10
+; GISEL-NEXT: orr x13, x26, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x6, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: mov x26, x27
-; GISEL-NEXT: orr x13, x27, x13
-; GISEL-NEXT: lsr x27, x11, x22
+; GISEL-NEXT: orr x13, x0, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: mov x13, x23
-; GISEL-NEXT: csel x12, x23, x12, eq
+; GISEL-NEXT: lsr x13, x11, x28
+; GISEL-NEXT: csel x12, x27, x12, eq
 ; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: str x27, [sp, #64] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: mov x23, x20
+; GISEL-NEXT: str x13, [sp, #96] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x9, #13
 ; GISEL-NEXT: csel x12, xzr, x12, eq
@@ -1864,71 +1874,77 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x11, x11, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #104] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x11, xzr, x27, eq
+; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill
+; GISEL-NEXT: and x11, x8, #0x3f
+; GISEL-NEXT: lsl x27, x24, x11
+; GISEL-NEXT: csel x11, xzr, x13, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x27, x11
+; GISEL-NEXT: str x27, [sp, #56] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x11, x11, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x0, eq
+; GISEL-NEXT: csel x12, xzr, x22, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: mov x0, x7
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: mov x20, x16
+; GISEL-NEXT: mov x22, x2
+; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: mov x2, x14
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x10, eq
+; GISEL-NEXT: csel x12, xzr, x21, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: ldr x10, [sp, #312] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x12, x2, x12
-; GISEL-NEXT: ldr x2, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x21, [sp, #288] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x12, x1, x12
+; GISEL-NEXT: mov x1, x27
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x19, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #3
 ; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: ldr x21, [sp, #200] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x6, eq
+; GISEL-NEXT: csel x12, xzr, x30, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x21, x12
+; GISEL-NEXT: orr x12, x7, x12
+; GISEL-NEXT: mov x7, x15
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: str x7, [sp, #40] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x23, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x12, x28, x12
+; GISEL-NEXT: orr x12, x15, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x7, eq
+; GISEL-NEXT: csel x12, xzr, x5, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: mov x7, x17
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: mov x5, x19
+; GISEL-NEXT: orr x12, x19, x12
+; GISEL-NEXT: mov x19, x7
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x16, eq
+; GISEL-NEXT: csel x12, xzr, x14, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x12, x30, x12
+; GISEL-NEXT: lsr x14, x24, x28
+; GISEL-NEXT: orr x12, x4, x12
+; GISEL-NEXT: mov x4, x10
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x17, eq
+; GISEL-NEXT: csel x12, xzr, x25, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: mov x17, x24
-; GISEL-NEXT: orr x12, x1, x12
+; GISEL-NEXT: orr x12, x10, x12
+; GISEL-NEXT: ldr x10, [sp, #304] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x14, eq
-; GISEL-NEXT: ldr x14, [sp, #8] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x12, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: orr x12, x26, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x25, eq
+; GISEL-NEXT: csel x12, xzr, x6, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x12, x26, x12
+; GISEL-NEXT: orr x12, x0, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: csel x11, x13, x11, eq
+; GISEL-NEXT: csel x11, x17, x11, eq
 ; GISEL-NEXT: cmp x9, #12
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #13
@@ -1937,393 +1953,395 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x12, xzr, x11, eq
+; GISEL-NEXT: ldp x11, x6, [x10, #96]
 ; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: ldp x11, x10, [x10, #96]
-; GISEL-NEXT: csel x12, x5, x12, eq
-; GISEL-NEXT: str x12, [sp, #96] ; 8-byte Folded Spill
-; GISEL-NEXT: mov x12, x22
-; GISEL-NEXT: lsr x22, x5, x22
-; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: mov x5, x27
-; GISEL-NEXT: lsl x24, x11, x24
-; GISEL-NEXT: str x10, [sp, #296] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x10, xzr, x22, eq
+; GISEL-NEXT: and x10, x8, #0x3f
+; GISEL-NEXT: csel x12, x24, x12, eq
+; GISEL-NEXT: tst x8, #0x3f
+; GISEL-NEXT: ldr x24, [sp, #248] ; 8-byte Folded Reload
+; GISEL-NEXT: lsl x15, x11, x10
+; GISEL-NEXT: csel x10, xzr, x14, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: str x22, [sp, #16] ; 8-byte Folded Spill
-; GISEL-NEXT: orr x10, x24, x10
+; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x12, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x10, x15, x10
+; GISEL-NEXT: str x15, [sp, #296] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x15, x13
 ; GISEL-NEXT: csel x10, x10, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x13, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: ldr x27, [sp, #280] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x13, x3, x13
-; GISEL-NEXT: mov x3, x26
+; GISEL-NEXT: orr x13, x27, x13
+; GISEL-NEXT: ldr x27, [sp, #240] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x13, xzr, x27, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x13, x23, x13
-; GISEL-NEXT: mov x23, x4
+; GISEL-NEXT: orr x13, x22, x13
+; GISEL-NEXT: ldr x22, [sp, #272] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x4, eq
-; GISEL-NEXT: ldp x4, x16, [sp, #216] ; 16-byte Folded Reload
+; GISEL-NEXT: csel x13, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x16, x13
+; GISEL-NEXT: orr x13, x12, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x19, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x19, x1
-; GISEL-NEXT: orr x13, x4, x13
+; GISEL-NEXT: mov x16, x17
+; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: ldp x23, x21, [sp, #256] ; 16-byte Folded Reload
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x6, eq
+; GISEL-NEXT: csel x13, xzr, x30, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: mov x6, x14
-; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: mov x30, x0
+; GISEL-NEXT: orr x13, x23, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x21, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x28, x13
+; GISEL-NEXT: orr x13, x7, x13
+; GISEL-NEXT: mov x7, x14
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x0, eq
+; GISEL-NEXT: csel x13, xzr, x24, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: mov x0, x23
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x5, x13
+; GISEL-NEXT: ldr x5, [sp, #48] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x20, eq
+; GISEL-NEXT: csel x13, xzr, x2, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x13, x30, x13
-; GISEL-NEXT: ldr x30, [sp, #208] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x2, [sp, #296] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x5, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
+; GISEL-NEXT: csel x13, xzr, x25, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x13, x1, x13
+; GISEL-NEXT: mov x25, x6
+; GISEL-NEXT: orr x13, x4, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x30, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: ldp x14, x2, [sp, #264] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x13, x26, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x25, eq
+; GISEL-NEXT: csel x13, xzr, x22, eq
 ; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x13, x26, x13
-; GISEL-NEXT: ldr x26, [sp, #288] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x0, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: lsr x13, x11, x12
-; GISEL-NEXT: csel x10, x26, x10, eq
+; GISEL-NEXT: lsr x13, x11, x28
+; GISEL-NEXT: csel x10, x17, x10, eq
 ; GISEL-NEXT: cmp x9, #13
+; GISEL-NEXT: ldr x17, [sp, #80] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x9, #14
-; GISEL-NEXT: str x13, [sp, #72] ; 8-byte Folded Spill
+; GISEL-NEXT: str x13, [sp, #104] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #88] ; 8-byte Folded Spill
-; GISEL-NEXT: ldr x10, [sp, #296] ; 8-byte Folded Reload
-; GISEL-NEXT: lsl x11, x10, x17
+; GISEL-NEXT: str x10, [sp, #128] ; 8-byte Folded Spill
+; GISEL-NEXT: and x10, x8, #0x3f
+; GISEL-NEXT: lsl x11, x6, x10
 ; GISEL-NEXT: csel x10, xzr, x13, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: ldr x17, [sp, #232] ; 8-byte Folded Reload
-; GISEL-NEXT: ldr x13, [sp, #256] ; 8-byte Folded Reload
+; GISEL-NEXT: ldp x0, x13, [sp, #280] ; 16-byte Folded Reload
+; GISEL-NEXT: mov x6, x16
 ; GISEL-NEXT: orr x10, x11, x10
-; GISEL-NEXT: str x11, [sp, #56] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #88] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x10, x10, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x22, eq
+; GISEL-NEXT: csel x11, xzr, x14, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x11, x24, x11
+; GISEL-NEXT: orr x11, x2, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x5, eq
+; GISEL-NEXT: csel x11, xzr, x15, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x11, x2, x11
-; GISEL-NEXT: ldp x12, x5, [sp, #240] ; 16-byte Folded Reload
+; GISEL-NEXT: mov x15, x3
+; GISEL-NEXT: orr x11, x1, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x11, xzr, x27, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: mov x27, x30
-; GISEL-NEXT: orr x11, x17, x11
+; GISEL-NEXT: orr x11, x0, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x23, eq
+; GISEL-NEXT: csel x11, xzr, x3, eq
+; GISEL-NEXT: ldp x14, x3, [sp, #320] ; 16-byte Folded Reload
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x23, x20
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x12, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x14, eq
+; GISEL-NEXT: csel x11, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x11, x4, x11
+; GISEL-NEXT: orr x11, x13, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x13, eq
+; GISEL-NEXT: csel x11, xzr, x14, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x11, x21, x11
-; GISEL-NEXT: ldr x21, [sp, #296] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x23, x11
+; GISEL-NEXT: mov x23, x5
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x15, eq
+; GISEL-NEXT: csel x11, xzr, x21, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x11, x28, x11
+; GISEL-NEXT: mov x21, x4
+; GISEL-NEXT: orr x11, x19, x11
+; GISEL-NEXT: ldp x12, x19, [sp, #64] ; 16-byte Folded Reload
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x12, eq
+; GISEL-NEXT: csel x11, xzr, x24, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x11, x16, x11
+; GISEL-NEXT: orr x11, x12, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x20, eq
+; GISEL-NEXT: csel x11, xzr, x19, eq
 ; GISEL-NEXT: cmp x9, #9
 ; GISEL-NEXT: orr x11, x5, x11
+; GISEL-NEXT: mov x5, x30
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x7, eq
+; GISEL-NEXT: csel x11, xzr, x17, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x11, x1, x11
-; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x4, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x30, eq
+; GISEL-NEXT: csel x11, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x11, x6, x11
+; GISEL-NEXT: orr x11, x26, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x25, eq
+; GISEL-NEXT: csel x11, xzr, x22, eq
 ; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x30, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: cmp x9, #13
-; GISEL-NEXT: csel x10, x26, x10, eq
+; GISEL-NEXT: csel x10, x16, x10, eq
 ; GISEL-NEXT: cmp x9, #14
+; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x11, xzr, x10, eq
 ; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x11, x21, x11, eq
-; GISEL-NEXT: ldp x10, x20, [x1, #112]
-; GISEL-NEXT: str x11, [sp, #80] ; 8-byte Folded Spill
-; GISEL-NEXT: ldp x11, x4, [sp, #40] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x10, x4, [x16, #112]
+; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: str x11, [sp, #120] ; 8-byte Folded Spill
+; GISEL-NEXT: lsr x11, x25, x28
+; GISEL-NEXT: and x16, x8, #0x3f
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsr x21, x21, x4
-; GISEL-NEXT: lsl x28, x10, x11
-; GISEL-NEXT: csel x1, xzr, x21, eq
-; GISEL-NEXT: str x21, [sp, #296] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x25, [sp, #88] ; 8-byte Folded Reload
+; GISEL-NEXT: lsl x24, x10, x16
+; GISEL-NEXT: csel x1, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x1, x28, x1
-; GISEL-NEXT: ldr x21, [sp, #72] ; 8-byte Folded Reload
-; GISEL-NEXT: str x28, [sp, #312] ; 8-byte Folded Spill
+; GISEL-NEXT: ldp x16, x28, [sp, #96] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x1, x24, x1
 ; GISEL-NEXT: csel x1, x1, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: ldr x28, [sp, #56] ; 8-byte Folded Reload
-; GISEL-NEXT: csel x30, xzr, x21, eq
+; GISEL-NEXT: csel x30, xzr, x28, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x30, x28, x30
+; GISEL-NEXT: orr x30, x25, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x22, eq
+; GISEL-NEXT: csel x30, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: ldr x22, [sp, #64] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x30, x24, x30
+; GISEL-NEXT: orr x30, x2, x30
+; GISEL-NEXT: ldr x2, [sp, #56] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x22, eq
+; GISEL-NEXT: csel x30, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #3
 ; GISEL-NEXT: orr x30, x2, x30
-; GISEL-NEXT: ldr x2, [sp, #280] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x2, eq
+; GISEL-NEXT: csel x30, xzr, x27, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x30, x17, x30
-; GISEL-NEXT: ldr x17, [sp, #224] ; 8-byte Folded Reload
+; GISEL-NEXT: mov x27, x13
+; GISEL-NEXT: orr x30, x0, x30
+; GISEL-NEXT: ldr x0, [sp, #248] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x0, eq
+; GISEL-NEXT: csel x30, xzr, x15, eq
+; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x30, x17, x30
+; GISEL-NEXT: orr x30, x15, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x14, eq
-; GISEL-NEXT: ldr x14, [sp, #216] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x30, x14, x30
+; GISEL-NEXT: ldr x3, [sp, #40] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x30, x13, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x13, eq
-; GISEL-NEXT: ldr x13, [sp, #200] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x14, eq
+; GISEL-NEXT: ldp x13, x14, [sp, #256] ; 16-byte Folded Reload
 ; GISEL-NEXT: cmp x9, #7
 ; GISEL-NEXT: orr x30, x13, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x15, eq
-; GISEL-NEXT: ldr x15, [sp, #32] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x14, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x30, x15, x30
+; GISEL-NEXT: orr x30, x3, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x12, eq
+; GISEL-NEXT: csel x30, xzr, x0, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x30, x16, x30
+; GISEL-NEXT: orr x30, x12, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x23, eq
+; GISEL-NEXT: csel x30, xzr, x19, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x30, x5, x30
+; GISEL-NEXT: orr x30, x23, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x7, eq
+; GISEL-NEXT: csel x30, xzr, x17, eq
 ; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x30, x19, x30
+; GISEL-NEXT: orr x30, x21, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x27, eq
+; GISEL-NEXT: csel x30, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: orr x30, x6, x30
+; GISEL-NEXT: mov x20, x26
+; GISEL-NEXT: orr x30, x26, x30
+; GISEL-NEXT: mov x26, x5
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x25, eq
+; GISEL-NEXT: csel x30, xzr, x22, eq
 ; GISEL-NEXT: cmp x9, #13
-; GISEL-NEXT: orr x30, x3, x30
+; GISEL-NEXT: orr x30, x5, x30
+; GISEL-NEXT: ldr x5, [sp, #16] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: cmp x9, #14
-; GISEL-NEXT: lsr x30, x10, x4
-; GISEL-NEXT: csel x1, x26, x1, eq
+; GISEL-NEXT: csel x1, x6, x1, eq
 ; GISEL-NEXT: cmp x9, #15
+; GISEL-NEXT: lsr x30, x10, x5
 ; GISEL-NEXT: csel x1, xzr, x1, eq
 ; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x26, x10, x1, eq
-; GISEL-NEXT: lsl x10, x20, x11
+; GISEL-NEXT: csel x5, x10, x1, eq
+; GISEL-NEXT: and x10, x8, #0x3f
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x16, xzr, x30, eq
+; GISEL-NEXT: lsl x10, x4, x10
+; GISEL-NEXT: csel x1, xzr, x30, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: ldr x11, [sp, #296] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x10, x10, x16
-; GISEL-NEXT: ldr x16, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: ldp x29, x30, [sp, #416] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x10, x10, x1
+; GISEL-NEXT: ldr x1, [sp, #296] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, x10, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #272] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x24, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x21, eq
+; GISEL-NEXT: csel x11, xzr, x28, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x11, x28, x11
-; GISEL-NEXT: ldp x29, x30, [sp, #400] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x11, x25, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x11, eq
+; GISEL-NEXT: csel x11, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x11, x24, x11
+; GISEL-NEXT: orr x11, x1, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x22, eq
+; GISEL-NEXT: csel x11, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #232] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x16, [sp, #280] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x2, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
+; GISEL-NEXT: ldr x11, [sp, #240] ; 8-byte Folded Reload
 ; GISEL-NEXT: tst x8, #0x3f
-; CHECK-NEXT: ldp q3, q2, [x17, #-32]
-; CHECK-NEXT: add x16, x16, #32
-; CHECK-NEXT: ldp q6, q5, [x17]
+; CHECK-NEXT: ldp q1, q4, [x15, #-16]
+; CHECK-NEXT: subs x17, x17, #16
+; CHECK-NEXT: ldp q3, q2, [x16, #-32]
+; CHECK-NEXT: add x15, x15, #32
+; CHECK-NEXT: ldp q6, q5, [x16]
 ; CHECK-NEXT: smlal2 v2.4s, v0.8h, v1.8h
 ; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h
 ; CHECK-NEXT: smlal2 v5.4s, v0.8h, v4.8h
 ; CHECK-NEXT: smlal v6.4s, v0.4h, v4.4h
-; CHECK-NEXT: stp q3, q2, [x17, #-32]
-; CHECK-NEXT: stp q6, q5, [x17], #64
+; CHECK-NEXT: stp q3, q2, [x16, #-32]
+; CHECK-NEXT: stp q6, q5, [x16], #64
 ; CHECK-NEXT: b.ne .LBB0_6
 ; CHECK-NEXT: // %bb.7: // %middle.block
 ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT: cmp x12, x10
-; CHECK-NEXT: mov x18, x12
+; CHECK-NEXT: and x17, x10, #0xfffffff0
 ; CHECK-NEXT: b.eq .LBB0_2
 ; CHECK-NEXT: .LBB0_8: // %for.body4.us.preheader
 ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: add x16, x18, x8
-; CHECK-NEXT: add x17, x2, x18, lsl #1
-; CHECK-NEXT: sub x18, x10, x18
-; CHECK-NEXT: add x16, x1, x16, lsl #2
+; CHECK-NEXT: add x15, x17, x8
+; CHECK-NEXT: add x16, x2, x17, lsl #1
+; CHECK-NEXT: sub x17, x10, x17
+; CHECK-NEXT: add x15, x1, x15, lsl #2
 ; CHECK-NEXT: .LBB0_9: // %for.body4.us
 ; CHECK-NEXT: // Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrsh w3, [x17], #2
-; CHECK-NEXT: ldr w4, [x16]
-; CHECK-NEXT: subs x18, x18, #1
-; CHECK-NEXT: madd w3, w3, w15, w4
-; CHECK-NEXT: str w3, [x16], #4
+; CHECK-NEXT: ldrsh w18, [x16], #2
+; CHECK-NEXT: ldr w3, [x15]
+; CHECK-NEXT: subs x17, x17, #1
+; CHECK-NEXT: madd w18, w18, w14, w3
+; CHECK-NEXT: str w18, [x15], #4
 ; CHECK-NEXT: b.ne .LBB0_9
 ; CHECK-NEXT: b .LBB0_2
 ; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
index 3caac1d..74b0e69 100644
--- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -278,9 +278,9 @@ define i64 @test_and_4(i64 %x, i64 %y) {
 ; CHECK-GI-NEXT: .cfi_offset w19, -8
 ; CHECK-GI-NEXT: .cfi_offset w20, -16
 ; CHECK-GI-NEXT: .cfi_offset w30, -32
-; CHECK-GI-NEXT: and x20, x0, #0x3
 ; CHECK-GI-NEXT: mov x19, x0
-; CHECK-GI-NEXT: mov x0, x20
+; CHECK-GI-NEXT: and x20, x0, #0x3
+; CHECK-GI-NEXT: and x0, x0, #0x3
 ; CHECK-GI-NEXT: bl callee
 ; CHECK-GI-NEXT: tst x19, #0x3
 ; CHECK-GI-NEXT: csel x0, x20, x0, eq
diff --git a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
index e0f2155..58c01db 100644
--- a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
+++ b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
@@ -7,20 +7,16 @@
 define void @foo(i64 %v1, i64 %v2, ptr %ptr) {
 ; CHECK-LABEL: foo:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: add x3, x0, x1
-; CHECK-NEXT: str x3, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT: str x3, [x2, #8]
 ; CHECK-NEXT: ldr x3, [x2, #16]
 ; CHECK-NEXT: add x3, x0, x3
 ; CHECK-NEXT: sub x3, x3, x1
 ; CHECK-NEXT: str x3, [x2, #16]
-; CHECK-NEXT: ldr x3, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: add x3, x0, x1
 ; CHECK-NEXT: str x3, [x2, #24]
 ; CHECK-NEXT: str x0, [x2, #32]
 ; CHECK-NEXT: str x1, [x2, #40]
-; CHECK-NEXT: add sp, sp, #16
 ; CHECK-NEXT: ret
   %v3 = add i64 %v1, %v2
   %p1 = getelementptr i64, ptr %ptr, i64 1
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 5fc996a..0f62997 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -23,7 +23,7 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT: add x13, x1, #16
 ; CHECK-NEXT: add x8, x1, x10, lsl #2
 ; CHECK-NEXT: add x9, x0, x10
-; CHECK-NEXT: mov x14, x10
+; CHECK-NEXT: and x14, x11, #0x1fffffff8
 ; CHECK-NEXT: .LBB0_4: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ldp q1, q2, [x13, #-16]
@@ -194,9 +194,9 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT: mov w8, #1132396544 // =0x437f0000
 ; CHECK-NEXT: and x10, x11, #0x1fffffffc
 ; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
 ; CHECK-NEXT: add x8, x1, x10, lsl #3
 ; CHECK-NEXT: add x9, x0, x10, lsl #1
-; CHECK-NEXT: mov x12, x10
 ; CHECK-NEXT: .LBB1_9: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ld2 { v1.4s, v2.4s }, [x1], #32
@@ -341,7 +341,7 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT: dup v0.4s, w8
 ; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI2_0]
 ; CHECK-NEXT: add x9, x10, x10, lsl #1
-; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
 ; CHECK-NEXT: add x8, x1, x9, lsl #2
 ; CHECK-NEXT: add x9, x0, x9
 ; CHECK-NEXT: .LBB2_4: // %vector.body
@@ -597,7 +597,7 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI3_0]
 ; CHECK-NEXT: add x8, x1, x10, lsl #4
 ; CHECK-NEXT: add x9, x0, x10, lsl #2
-; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
 ; CHECK-NEXT: .LBB3_9: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64
diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll
index 0e68270..3e933fa 100644
--- a/llvm/test/CodeGen/AArch64/trampoline.ll
+++ b/llvm/test/CodeGen/AArch64/trampoline.ll
@@ -263,3 +263,9 @@ define i64 @func2() {
   %fp = call ptr @llvm.adjust.trampoline(ptr @trampg)
   ret i64 0
 }
+
+; Check for the explicitly emitted .note.GNU-stack section (ELF only) in the
+; presence of trampolines.
+; UTC_ARGS: --disable
+; CHECK-LINUX: .section ".note.GNU-stack","x",@progbits
+; UTC_ARGS: --enable
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
index c82b341..5bc9cdb 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -256,7 +256,7 @@ endif: ; preds = %else, %if
 define amdgpu_kernel void @copy1(ptr addrspace(1) %out, ptr addrspace(1) %in0) {
 entry:
   %tmp = load float, ptr addrspace(1) %in0
-  %tmp1 = fcmp oeq float %tmp, 0.000000e+00
+  %tmp1 = fcmp one float %tmp, 0.000000e+00
   br i1 %tmp1, label %if0, label %endif
 
 if0: ; preds = %entry
diff --git a/llvm/test/CodeGen/ARM/combine-movc-sub.ll b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
index ca5d089..8ca4c43 100644
--- a/llvm/test/CodeGen/ARM/combine-movc-sub.ll
+++ b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
@@ -27,11 +27,11 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: sub.w r7, r2, #32
-; CHECK-NEXT: mov r8, r0
+; CHECK-NEXT: sub.w r8, r2, #32
+; CHECK-NEXT: mov r6, r0
 ; CHECK-NEXT: movs r0, #1
 ; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: add.w r6, r0, r7, lsr #5
+; CHECK-NEXT: add.w r7, r0, r8, lsr #5
 ; CHECK-NEXT: mov r5, r1
 ; CHECK-NEXT: mov.w r9, #0
 ; CHECK-NEXT: b .LBB0_2
@@ -44,16 +44,16 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
 ; CHECK-NEXT: mov r2, r4
 ; CHECK-NEXT: cmp r4, #31
 ; CHECK-NEXT: ldr r0, [r1, #16]
-; CHECK-NEXT: add.w r0, r0, r6, lsl #2
+; CHECK-NEXT: add.w r0, r0, r7, lsl #2
 ; CHECK-NEXT: ldr r0, [r0, #40]
 ; CHECK-NEXT: it hi
-; CHECK-NEXT: andhi r2, r7, #31
+; CHECK-NEXT: andhi r2, r8, #31
 ; CHECK-NEXT: lsrs r0, r2
 ; CHECK-NEXT: lsls r0, r0, #31
 ; CHECK-NEXT: beq .LBB0_1
 ; CHECK-NEXT: @ %bb.3: @ %if.then
 ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT: mov r0, r8
+; CHECK-NEXT: mov r0, r6
 ; CHECK-NEXT: bl foo
 ; CHECK-NEXT: str.w r9, [r5, #4]
 ; CHECK-NEXT: b .LBB0_1
diff --git a/llvm/test/CodeGen/ARM/extract-bits.ll b/llvm/test/CodeGen/ARM/extract-bits.ll
index 77deaa5..d717806 100644
--- a/llvm/test/CodeGen/ARM/extract-bits.ll
+++ b/llvm/test/CodeGen/ARM/extract-bits.ll
@@ -316,28 +316,28 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ;
 ; V7A-LABEL: bextr64_a0:
 ; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
 ; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r4
 ; V7A-NEXT: lsr r1, r1, r2
 ; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: and r0, r4, r0
-; V7A-NEXT: and r1, r12, r1
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: and r0, r5, r0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r4, r5, r11, pc}
 ;
 ; V7A-T-LABEL: bextr64_a0:
 ; V7A-T: @ %bb.0:
@@ -434,28 +434,28 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n
 ;
 ; V7A-LABEL: bextr64_a0_arithmetic:
 ; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
 ; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
 ; V7A-NEXT: asr r2, r1, r2
-; V7A-NEXT: asrpl r0, r1, r3
 ; V7A-NEXT: asrpl r2, r1, #31
-; V7A-NEXT: and r0, r4, r0
-; V7A-NEXT: and r1, r12, r2
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: asrpl r0, r1, r4
+; V7A-NEXT: and r1, r3, r2
+; V7A-NEXT: and r0, r5, r0
+; V7A-NEXT: pop {r4, r5, r11, pc}
 ;
 ; V7A-T-LABEL: bextr64_a0_arithmetic:
 ; V7A-T: @ %bb.0:
@@ -911,28 +911,28 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ;
 ; V7A-LABEL: bextr64_a4_commutative:
 ; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
 ; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r4
 ; V7A-NEXT: lsr r1, r1, r2
 ; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: and r0, r0, r4
-; V7A-NEXT: and r1, r1, r12
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: and r0, r0, r5
+; V7A-NEXT: and r1, r1, r3
+; V7A-NEXT: pop {r4, r5, r11, pc}
 ;
 ; V7A-T-LABEL: bextr64_a4_commutative:
 ; V7A-T: @ %bb.0:
@@ -3456,22 +3456,22 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; V7M-NEXT: uxtb r2, r2
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r1, #0
-; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: rsb.w r3, r2, #32
 ; V7M-NEXT: lsls r1, r2
-; V7M-NEXT: sub.w r3, r2, #32
-; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: sub.w r12, r2, #32
+; V7M-NEXT: lsr.w r4, r0, r3
 ; V7M-NEXT: orrs r1, r4
-; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: cmp.w r12, #0
 ; V7M-NEXT: it pl
-; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lslpl.w r1, r0, r12
 ; V7M-NEXT: lsl.w r0, r0, r2
-; V7M-NEXT: lsl.w r4, r1, r12
+; V7M-NEXT: lsl.w r3, r1, r3
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r0, #0
 ; V7M-NEXT: lsr.w r0, r0, r2
-; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: orr.w r0, r0, r3
 ; V7M-NEXT: it pl
-; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsrpl.w r0, r1, r12
 ; V7M-NEXT: lsr.w r1, r1, r2
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r1, #0
@@ -3715,26 +3715,26 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
 ; V7M-NEXT: uxtb r2, r2
 ; V7M-NEXT: lsl.w r0, lr, r0
 ; V7M-NEXT: orr.w r0, r0, r12
-; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: sub.w r12, r2, #32
 ; V7M-NEXT: it pl
 ; V7M-NEXT: lsrpl.w r0, lr, r3
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsb.w r3, r2, #32
 ; V7M-NEXT: lsls r1, r2
-; V7M-NEXT: sub.w r3, r2, #32
-; V7M-NEXT: lsr.w r4, r0, r12
-; V7M-NEXT: orrs r1, r4
-; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: cmp.w r12, #0
+; V7M-NEXT: lsr.w r4, r0, r3
+; V7M-NEXT: orr.w r1, r1, r4
 ; V7M-NEXT: it pl
-; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lslpl.w r1, r0, r12
 ; V7M-NEXT: lsl.w r0, r0, r2
-; V7M-NEXT: lsl.w r4, r1, r12
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r3, r1, r3
 ; V7M-NEXT: lsr.w r0, r0, r2
-; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: orr.w r0, r0, r3
 ; V7M-NEXT: it pl
-; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsrpl.w r0, r1, r12
 ; V7M-NEXT: lsr.w r1, r1, r2
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r1, #0
diff --git a/llvm/test/CodeGen/ARM/extract-lowbits.ll b/llvm/test/CodeGen/ARM/extract-lowbits.ll
index b483793..373d998 100644
--- a/llvm/test/CodeGen/ARM/extract-lowbits.ll
+++ b/llvm/test/CodeGen/ARM/extract-lowbits.ll
@@ -243,15 +243,15 @@ define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind {
 ; V7A: @ %bb.0:
 ; V7A-NEXT: .save {r11, lr}
 ; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
 ; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
 ; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
 ; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
 ; V7A-NEXT: and r0, r2, r0
 ; V7A-NEXT: and r1, r3, r1
 ; V7A-NEXT: pop {r11, pc}
@@ -323,15 +323,15 @@ define i64 @bzhi64_a0_masked(i64 %val, i64 %numlowbits) nounwind {
 ; V7A-NEXT: .save {r11, lr}
 ; V7A-NEXT: push {r11, lr}
 ; V7A-NEXT: and r2, r2, #63
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: rsb r12, r2, #32
 ; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
 ; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
 ; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
 ; V7A-NEXT: and r0, r2, r0
 ; V7A-NEXT: and r1, r3, r1
 ; V7A-NEXT: pop {r11, pc}
@@ -404,15 +404,15 @@ define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 ; V7A: @ %bb.0:
 ; V7A-NEXT: .save {r11, lr}
 ; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
 ; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
 ; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
 ; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
 ; V7A-NEXT: and r0, r2, r0
 ; V7A-NEXT: and r1, r3, r1
 ; V7A-NEXT: pop {r11, pc}
@@ -644,15 +644,15 @@ define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; V7A: @ %bb.0:
 ; V7A-NEXT: .save {r11, lr}
 ; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
 ; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
 ; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
 ; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
 ; V7A-NEXT: and r0, r0, r2
 ; V7A-NEXT: and r1, r1, r3
 ; V7A-NEXT: pop {r11, pc}
@@ -2144,23 +2144,23 @@ define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind {
 ;
 ; V7A-LABEL: bzhi64_d2_load:
 ; V7A: @ %bb.0:
-; V7A-NEXT: .save {r5, r7, r11, lr}
-; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: .save {r5, lr}
+; V7A-NEXT: push {r5, lr}
 ; V7A-NEXT: rsb r3, r2, #64
-; V7A-NEXT: ldm r0, {r0, r7}
-; V7A-NEXT: rsb r1, r3, #32
+; V7A-NEXT: ldm r0, {r0, r5}
+; V7A-NEXT: rsb r12, r3, #32
 ; V7A-NEXT: rsbs r2, r2, #32
-; V7A-NEXT: lsr r5, r0, r1
-; V7A-NEXT: orr r7, r5, r7, lsl r3
-; V7A-NEXT: lslpl r7, r0, r2
+; V7A-NEXT: lsr r1, r0, r12
+; V7A-NEXT: orr r1, r1, r5, lsl r3
+; V7A-NEXT: lslpl r1, r0, r2
 ; V7A-NEXT: lsl r0, r0, r3
 ; V7A-NEXT: movwpl r0, #0
 ; V7A-NEXT: lsr r0, r0, r3
-; V7A-NEXT: orr r0, r0, r7, lsl r1
-; V7A-NEXT: lsr r1, r7, r3
-; V7A-NEXT: lsrpl r0, r7, r2
+; V7A-NEXT: orr r0, r0, r1, lsl r12
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: lsr r1, r1, r3
 ; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: pop {r5, r7, r11, pc}
+; V7A-NEXT: pop {r5, pc}
 ;
 ; V7A-T-LABEL: bzhi64_d2_load:
 ; V7A-T: @ %bb.0:
@@ -2237,26 +2237,26 @@ define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ;
 ; V7A-LABEL: bzhi64_d3_load_indexzext:
 ; V7A: @ %bb.0:
-; V7A-NEXT: .save {r5, r7, r11, lr}
-; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: .save {r5, lr}
+; V7A-NEXT: push {r5, lr}
 ; V7A-NEXT: rsb r1, r1, #64
-; V7A-NEXT: ldm r0, {r0, r7}
+; V7A-NEXT: ldm r0, {r0, r5}
 ; V7A-NEXT: uxtb r2, r1
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: lsr r5, r0, r3
-; V7A-NEXT: orr r7, r5, r7, lsl r2
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: lsr r3, r0, r12
+; V7A-NEXT: orr r3, r3, r5, lsl r2
 ; V7A-NEXT: mvn r5, #31
 ; V7A-NEXT: uxtab r1, r5, r1
 ; V7A-NEXT: cmp r1, #0
-; V7A-NEXT: lslpl r7, r0, r1
+; V7A-NEXT: lslpl r3, r0, r1
 ; V7A-NEXT: lsl r0, r0, r2
 ; V7A-NEXT: movwpl r0, #0
 ; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: orr r0, r0, r7, lsl r3
-; V7A-NEXT: lsrpl r0, r7, r1
-; V7A-NEXT: lsr r1, r7, r2
+; V7A-NEXT: orr r0, r0, r3, lsl r12
+; V7A-NEXT: lsrpl r0, r3, r1
+; V7A-NEXT: lsr r1, r3, r2
 ; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: pop {r5, r7, r11, pc}
+; V7A-NEXT: pop {r5, pc}
 ;
 ; V7A-T-LABEL: bzhi64_d3_load_indexzext:
 ; V7A-T: @ %bb.0:
diff --git a/llvm/test/CodeGen/ARM/llround-conv.ll b/llvm/test/CodeGen/ARM/llround-conv.ll
index 0f57e4a..f734db8 100644
--- a/llvm/test/CodeGen/ARM/llround-conv.ll
+++ b/llvm/test/CodeGen/ARM/llround-conv.ll
@@ -1,25 +1,71 @@
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
+; RUN: llc < %s
-mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16 +; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16 + +define i64 @testmsxh_builtin(half %x) { +; CHECK-SOFT-LABEL: testmsxh_builtin: +; CHECK-SOFT: @ %bb.0: @ %entry +; CHECK-SOFT-NEXT: .save {r11, lr} +; CHECK-SOFT-NEXT: push {r11, lr} +; CHECK-SOFT-NEXT: bl __aeabi_h2f +; CHECK-SOFT-NEXT: bl llroundf +; CHECK-SOFT-NEXT: pop {r11, pc} +; +; CHECK-NOFP16-LABEL: testmsxh_builtin: +; CHECK-NOFP16: @ %bb.0: @ %entry +; CHECK-NOFP16-NEXT: .save {r11, lr} +; CHECK-NOFP16-NEXT: push {r11, lr} +; CHECK-NOFP16-NEXT: vmov r0, s0 +; CHECK-NOFP16-NEXT: bl __aeabi_h2f +; CHECK-NOFP16-NEXT: vmov s0, r0 +; CHECK-NOFP16-NEXT: bl llroundf +; CHECK-NOFP16-NEXT: pop {r11, pc} +; +; CHECK-FP16-LABEL: testmsxh_builtin: +; CHECK-FP16: @ %bb.0: @ %entry +; CHECK-FP16-NEXT: .save {r11, lr} +; CHECK-FP16-NEXT: push {r11, lr} +; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-FP16-NEXT: bl llroundf +; CHECK-FP16-NEXT: pop {r11, pc} +entry: + %0 = tail call i64 @llvm.llround.i64.f16(half %x) + ret i64 %0 +} -; SOFTFP-LABEL: testmsxs_builtin: -; SOFTFP: bl llroundf -; HARDFP-LABEL: testmsxs_builtin: -; HARDFP: bl llroundf define i64 @testmsxs_builtin(float %x) { +; CHECK-LABEL: testmsxs_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl llroundf +; CHECK-NEXT: pop {r11, pc} entry: - %0 = tail call i64 @llvm.llround.f32(float %x) + %0 = tail call i64 @llvm.llround.i64.f32(float %x) ret i64 %0 } -; SOFTFP-LABEL: testmsxd_builtin: -; SOFTFP: bl llround -; HARDFP-LABEL: testmsxd_builtin: -; HARDFP: bl llround define i64 @testmsxd_builtin(double %x) { +; CHECK-LABEL: testmsxd_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl llround +; CHECK-NEXT: pop {r11, pc} entry: - %0 = tail call i64 @llvm.llround.f64(double %x) + %0 = tail call i64 @llvm.llround.i64.f64(double %x) ret i64 %0 } -declare i64 @llvm.llround.f32(float) nounwind readnone -declare i64 @llvm.llround.f64(double) nounwind readnone +define i64 @testmsxq_builtin(fp128 %x) { +; CHECK-LABEL: testmsxq_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl llroundl +; CHECK-NEXT: pop {r11, pc} +entry: + %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x) + ret i64 %0 +} diff --git a/llvm/test/CodeGen/ARM/lround-conv.ll b/llvm/test/CodeGen/ARM/lround-conv.ll index 3aaed74..03f7a0d 100644 --- a/llvm/test/CodeGen/ARM/lround-conv.ll +++ b/llvm/test/CodeGen/ARM/lround-conv.ll @@ -1,25 +1,47 @@ -; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP -; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT +; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16 +; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FPv8 +; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16 + +;define i32 @testmswh_builtin(half %x) { 
+;entry: +; %0 = tail call i32 @llvm.lround.i32.f16(half %x) +; ret i32 %0 +;} -; SOFTFP-LABEL: testmsws_builtin: -; SOFTFP: bl lroundf -; HARDFP-LABEL: testmsws_builtin: -; HARDFP: bl lroundf define i32 @testmsws_builtin(float %x) { +; CHECK-LABEL: testmsws_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: b lroundf entry: %0 = tail call i32 @llvm.lround.i32.f32(float %x) ret i32 %0 } -; SOFTFP-LABEL: testmswd_builtin: -; SOFTFP: bl lround -; HARDFP-LABEL: testmswd_builtin: -; HARDFP: bl lround define i32 @testmswd_builtin(double %x) { +; CHECK-LABEL: testmswd_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: b lround entry: %0 = tail call i32 @llvm.lround.i32.f64(double %x) ret i32 %0 } -declare i32 @llvm.lround.i32.f32(float) nounwind readnone -declare i32 @llvm.lround.i32.f64(double) nounwind readnone +define i32 @testmswq_builtin(fp128 %x) { +; CHECK-LABEL: testmswq_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl lroundl +; CHECK-NEXT: pop {r11, pc} +entry: + %0 = tail call i32 @llvm.lround.i32.f128(fp128 %x) + ret i32 %0 +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-FP16: {{.*}} +; CHECK-FPv8: {{.*}} +; CHECK-NOFP16: {{.*}} +; CHECK-SOFT: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll index e761d3a..33b89a4 100644 --- a/llvm/test/CodeGen/RISCV/pr69586.ll +++ b/llvm/test/CodeGen/RISCV/pr69586.ll @@ -39,119 +39,118 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a2, a2, 1 ; NOREMAT-NEXT: sub sp, sp, a2 ; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb -; NOREMAT-NEXT: mv a7, a0 -; NOREMAT-NEXT: li a0, 32 -; NOREMAT-NEXT: addi a5, a7, 512 -; NOREMAT-NEXT: addi a4, a7, 1024 -; NOREMAT-NEXT: addi a6, a7, 1536 -; NOREMAT-NEXT: li t4, 1 -; NOREMAT-NEXT: li a2, 5 +; NOREMAT-NEXT: li a7, 32 +; NOREMAT-NEXT: addi s10, a0, 512 +; NOREMAT-NEXT: addi a4, a0, 1024 +; NOREMAT-NEXT: addi a6, a0, 1536 +; NOREMAT-NEXT: li t0, 1 +; NOREMAT-NEXT: li a3, 5 ; NOREMAT-NEXT: li t1, 3 -; NOREMAT-NEXT: li t0, 7 -; NOREMAT-NEXT: lui t5, 1 +; NOREMAT-NEXT: li a2, 7 +; NOREMAT-NEXT: lui t2, 1 ; NOREMAT-NEXT: li s4, 9 ; NOREMAT-NEXT: li s6, 11 ; NOREMAT-NEXT: li s9, 13 ; NOREMAT-NEXT: li ra, 15 -; NOREMAT-NEXT: lui t2, 2 +; NOREMAT-NEXT: lui a5, 2 ; NOREMAT-NEXT: lui s1, 3 ; NOREMAT-NEXT: lui t3, 4 ; NOREMAT-NEXT: lui s0, 5 ; NOREMAT-NEXT: lui s3, 6 ; NOREMAT-NEXT: lui s7, 7 -; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; NOREMAT-NEXT: slli t4, t4, 11 -; NOREMAT-NEXT: sd t4, 512(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli a3, a2, 9 -; NOREMAT-NEXT: sd a3, 504(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; NOREMAT-NEXT: slli t0, t0, 11 +; NOREMAT-NEXT: sd t0, 512(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli t4, a3, 9 +; NOREMAT-NEXT: sd t4, 504(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: slli t6, t1, 10 -; NOREMAT-NEXT: slli s2, t0, 9 -; NOREMAT-NEXT: add a0, a7, t5 +; NOREMAT-NEXT: slli s2, a2, 9 +; NOREMAT-NEXT: add a7, a0, t2 ; NOREMAT-NEXT: lui s11, 1 ; NOREMAT-NEXT: slli s4, s4, 9 -; NOREMAT-NEXT: slli s5, a2, 10 +; NOREMAT-NEXT: slli s5, a3, 10 ; NOREMAT-NEXT: slli s6, s6, 9 ; NOREMAT-NEXT: slli s8, t1, 11 -; NOREMAT-NEXT: vle32.v v8, (a5) +; NOREMAT-NEXT: vle32.v v8, (s10) ; NOREMAT-NEXT: slli s9, s9, 9 ; NOREMAT-NEXT: li t5, 13 ; NOREMAT-NEXT: vle32.v v10, (a4) ; 
NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: slli s10, t0, 10 +; NOREMAT-NEXT: slli s10, a2, 10 ; NOREMAT-NEXT: vle32.v v0, (a6) ; NOREMAT-NEXT: vle32.v v12, (a6) ; NOREMAT-NEXT: slli ra, ra, 9 -; NOREMAT-NEXT: vle32.v v4, (a0) -; NOREMAT-NEXT: vle32.v v20, (a0) -; NOREMAT-NEXT: add a4, a7, t2 +; NOREMAT-NEXT: vle32.v v4, (a7) +; NOREMAT-NEXT: vle32.v v20, (a7) +; NOREMAT-NEXT: add a4, a0, a5 ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: add a4, a7, s1 +; NOREMAT-NEXT: add a4, a0, s1 ; NOREMAT-NEXT: vle32.v v28, (a4) ; NOREMAT-NEXT: vle32.v v26, (a4) -; NOREMAT-NEXT: add a4, a7, t3 +; NOREMAT-NEXT: add a4, a0, t3 ; NOREMAT-NEXT: vle32.v v24, (a4) ; NOREMAT-NEXT: vle32.v v22, (a4) -; NOREMAT-NEXT: add a4, a7, s0 -; NOREMAT-NEXT: vle32.v v14, (a7) +; NOREMAT-NEXT: add a4, a0, s0 +; NOREMAT-NEXT: vle32.v v14, (a0) ; NOREMAT-NEXT: vle32.v v18, (a4) ; NOREMAT-NEXT: vle32.v v16, (a4) -; NOREMAT-NEXT: add a4, a7, s3 +; NOREMAT-NEXT: add a4, a0, s3 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8 ; NOREMAT-NEXT: vle32.v v14, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: addi a0, sp, 640 -; NOREMAT-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, t4 +; NOREMAT-NEXT: addi a4, sp, 640 +; NOREMAT-NEXT: vs2r.v v8, (a4) # vscale x 16-byte Folded Spill +; NOREMAT-NEXT: add a4, a0, t0 ; NOREMAT-NEXT: vle32.v v10, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, a3 +; NOREMAT-NEXT: add a4, a0, t4 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, t6 +; NOREMAT-NEXT: add a4, a0, t6 ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, s2 +; NOREMAT-NEXT: add a4, a0, s2 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s7 +; NOREMAT-NEXT: add a4, a0, s7 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, s4 +; NOREMAT-NEXT: add a4, a0, s4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s5 +; NOREMAT-NEXT: add a4, a0, s5 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, s6 +; NOREMAT-NEXT: add a4, a0, s6 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s8 +; NOREMAT-NEXT: add a4, a0, s8 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, s9 +; NOREMAT-NEXT: add a4, a0, s9 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s10 +; NOREMAT-NEXT: add a4, a0, s10 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, ra +; NOREMAT-NEXT: add a4, a0, ra ; NOREMAT-NEXT: vle32.v v2, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: lui t4, 8 -; NOREMAT-NEXT: add a5, a7, t4 +; NOREMAT-NEXT: add a5, a0, t4 ; NOREMAT-NEXT: vle32.v v20, (a5) ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2 @@ -159,14 
+158,14 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a4, a4, 9 ; NOREMAT-NEXT: li s1, 17 ; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6 ; NOREMAT-NEXT: li a5, 9 ; NOREMAT-NEXT: slli a4, a5, 10 ; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 @@ -174,256 +173,257 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a4, a4, 9 ; NOREMAT-NEXT: li t2, 19 ; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: slli a3, a2, 11 +; NOREMAT-NEXT: slli a3, a3, 11 ; NOREMAT-NEXT: sd a3, 600(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li s7, 21 ; NOREMAT-NEXT: slli a3, s7, 9 ; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 ; NOREMAT-NEXT: li a6, 11 ; NOREMAT-NEXT: slli a3, a6, 10 ; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 ; NOREMAT-NEXT: li s3, 23 ; NOREMAT-NEXT: slli a3, s3, 9 ; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 ; NOREMAT-NEXT: li s0, 25 ; NOREMAT-NEXT: slli a3, s0, 9 ; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 ; NOREMAT-NEXT: slli a3, t5, 10 ; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28 ; NOREMAT-NEXT: li t3, 27 ; NOREMAT-NEXT: slli a3, t3, 9 ; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v28, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) -; NOREMAT-NEXT: slli a2, t0, 11 +; NOREMAT-NEXT: slli a2, a2, 11 ; NOREMAT-NEXT: sd a2, 544(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li t0, 29 ; NOREMAT-NEXT: slli a2, t0, 9 ; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28 -; NOREMAT-NEXT: li a3, 15 -; NOREMAT-NEXT: slli a2, a3, 10 +; NOREMAT-NEXT: li 
a7, 15 +; NOREMAT-NEXT: slli a2, a7, 10 ; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 ; NOREMAT-NEXT: li t1, 31 ; NOREMAT-NEXT: slli a2, t1, 9 ; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8 -; NOREMAT-NEXT: lui a4, 4 -; NOREMAT-NEXT: addi a0, a4, 512 -; NOREMAT-NEXT: sd a0, 496(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a7, a0 -; NOREMAT-NEXT: vle32.v v8, (a0) -; NOREMAT-NEXT: vle32.v v26, (a0) +; NOREMAT-NEXT: lui a3, 4 +; NOREMAT-NEXT: addi a2, a3, 512 +; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28 ; NOREMAT-NEXT: slli a2, s1, 10 ; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: addi a2, a4, 1536 +; NOREMAT-NEXT: addi a2, a3, 1536 ; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: lui a4, 4 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: slli a2, a5, 11 ; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8 ; NOREMAT-NEXT: lui a5, 5 ; NOREMAT-NEXT: addi a2, a5, -1536 ; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28 ; NOREMAT-NEXT: slli a2, t2, 10 ; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li t2, 19 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: li a3, 19 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 ; NOREMAT-NEXT: addi a2, a5, -512 ; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24 ; NOREMAT-NEXT: addi a2, a5, 512 ; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: slli a2, s7, 10 ; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26 ; NOREMAT-NEXT: addi a2, a5, 1536 ; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: slli a2, a6, 11 ; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12 
-; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18 ; NOREMAT-NEXT: lui a6, 6 ; NOREMAT-NEXT: addi a2, a6, -1536 ; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: slli a2, s3, 10 ; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 ; NOREMAT-NEXT: addi a2, a6, -512 ; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22 ; NOREMAT-NEXT: addi a2, a6, 512 ; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: slli a2, s0, 10 ; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18 ; NOREMAT-NEXT: addi a2, a6, 1536 ; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: slli a2, t5, 11 ; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8 ; NOREMAT-NEXT: lui s0, 7 ; NOREMAT-NEXT: addi a2, s0, -1536 ; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: slli a2, t3, 10 ; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addi a0, sp, 640 -; NOREMAT-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload +; NOREMAT-NEXT: addi a2, sp, 640 +; NOREMAT-NEXT: vl2r.v v12, (a2) # vscale x 16-byte Folded Reload ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22 ; NOREMAT-NEXT: addi a2, s0, -512 ; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26 ; NOREMAT-NEXT: addi a2, s0, 512 ; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui t3, 7 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: slli a2, t0, 10 ; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16 ; NOREMAT-NEXT: 
addi a2, t3, 1536 ; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: slli a2, a3, 11 +; NOREMAT-NEXT: slli a2, a7, 11 ; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14 ; NOREMAT-NEXT: addi a2, t4, -1536 ; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: slli a2, t1, 10 ; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addi a0, t4, -512 -; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a7, a0 +; NOREMAT-NEXT: addi a2, t4, -512 +; NOREMAT-NEXT: sd a2, 280(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a0, a2 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 ; NOREMAT-NEXT: vle32.v v12, (a0) ; NOREMAT-NEXT: vle32.v v0, (a0) @@ -476,7 +476,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: addi s11, a0, 512 ; NOREMAT-NEXT: addi s7, a0, 1024 ; NOREMAT-NEXT: addi s3, a0, 1536 -; NOREMAT-NEXT: slli s1, t2, 11 +; NOREMAT-NEXT: slli s1, a3, 11 ; NOREMAT-NEXT: lui a0, 10 ; NOREMAT-NEXT: addi t2, a0, -1536 ; NOREMAT-NEXT: addi a7, a0, -1024 diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll index 34d4657..c68fa59 100644 --- a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll +++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll @@ -78,3 +78,10 @@ define i64 @test0(i64 %n, ptr %p) nounwind { ret i64 %ret } + +; Check for the explicitly emitted .note.GNU-stack section (ELF only) in the +; presence of trampolines. 
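+; The "x" section flag requests SHF_EXECINSTR, i.e. an executable stack, which
+; the backend must advertise here because trampolines are materialized as code
+; written into a stack allocation at run time. The UTC_ARGS markers below fence
+; these hand-written checks off from update_llc_test_checks.py. As a minimal
+; sketch, the IR pattern that triggers this looks like the following
+; (hypothetical @f and @make_tramp, not part of this test):
+;   declare i64 @f(ptr nest, i64)
+;   define ptr @make_tramp(ptr %env) {
+;     ; space for the target's trampoline code, filled in at run time
+;     %tramp = alloca [32 x i8], align 8
+;     call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %env)
+;     %p = call ptr @llvm.adjust.trampoline(ptr %tramp)
+;     ret ptr %p
+;   }
+;   declare void @llvm.init.trampoline(ptr, ptr, ptr)
+;   declare ptr @llvm.adjust.trampoline(ptr)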
+; UTC_ARGS: --disable +; RV64-LINUX: .section ".note.GNU-stack","x",@progbits +; RV64: .section ".note.GNU-stack","x",@progbits +; UTC_ARGS: --enable diff --git a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll index 9798077..e3ed31f 100644 --- a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll +++ b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll @@ -163,9 +163,9 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 { ; LINUX-NEXT: ld %f10, 8(%r3) ; LINUX-NEXT: ld %f0, 16(%r3) ; LINUX-NEXT: ld %f2, 24(%r3) -; LINUX-NEXT: la %r3, 16(%r2) -; LINUX-NEXT: la %r4, 48(%r2) ; LINUX-NEXT: la %r2, 176(%r15) +; LINUX-NEXT: la %r3, 16(%r13) +; LINUX-NEXT: la %r4, 48(%r13) ; LINUX-NEXT: std %f0, 176(%r15) ; LINUX-NEXT: std %f2, 184(%r15) ; LINUX-NEXT: brasl %r14, sincosl@PLT diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index 6f986ce..c418038 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -541,11 +541,11 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(ptr nocaptur ; CHECK-NEXT: cbz r2, .LBB7_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: add.w r12, r4, r3, lsr #3 ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov r4, r1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll index 4020709..fe06601 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll @@ -16,39 +16,40 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture ; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: ldrsh.w r7, [r2] ; CHECK-NEXT: cmp r7, #1 -; CHECK-NEXT: blt.w .LBB0_6 +; CHECK-NEXT: blt .LBB0_6 ; CHECK-NEXT: @ %bb.2: @ %for.cond3.preheader.us.preheader -; CHECK-NEXT: movs r2, #252 ; CHECK-NEXT: ldr r4, [sp, #152] +; CHECK-NEXT: movs r2, #252 ; CHECK-NEXT: and.w r6, r2, r3, lsr #3 ; CHECK-NEXT: movs r2, #120 ; CHECK-NEXT: and.w r5, r2, r3, lsr #9 ; CHECK-NEXT: lsls r3, r3, #3 -; CHECK-NEXT: uxtb r3, r3 ; CHECK-NEXT: muls r6, r4, r6 +; CHECK-NEXT: uxtb r3, r3 ; CHECK-NEXT: rsb.w r2, r4, #256 -; CHECK-NEXT: vmov.i16 q2, #0xfc +; CHECK-NEXT: vmov.i16 q1, #0xfc +; CHECK-NEXT: vdup.16 q0, r6 ; CHECK-NEXT: mul lr, r5, r4 -; CHECK-NEXT: vdup.16 q4, r6 ; CHECK-NEXT: mov.w r6, #2016 -; CHECK-NEXT: vmov.i16 q6, #0xf8 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: mul r5, r3, r4 ; CHECK-NEXT: adds r3, r7, #7 +; CHECK-NEXT: vdup.16 q0, r6 ; CHECK-NEXT: bic r3, r3, #7 -; CHECK-NEXT: vdup.16 q3, lr +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vdup.16 q0, r5 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vdup.16 q0, lr ; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: vdup.16 q0, r5 -; CHECK-NEXT: lsls r1, r1, #1 +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: add.w r3, r4, r3, lsr #3 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q0, #0xf800 +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload 
+; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: lsls r1, r1, #1 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vdup.16 q5, r6 -; CHECK-NEXT: vmov.i16 q7, #0x78 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.i16 q4, #0xf8 ; CHECK-NEXT: .LBB0_3: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB0_4 Depth 2 @@ -59,37 +60,31 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture ; CHECK-NEXT: @ Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vctp.16 r6 -; CHECK-NEXT: subs r6, #8 +; CHECK-NEXT: vmov.i16 q5, #0xf800 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrht.u16 q0, [r5] -; CHECK-NEXT: vshr.u16 q1, q0, #3 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmla.i16 q2, q1, r2 -; CHECK-NEXT: vshr.u16 q1, q2, #5 -; CHECK-NEXT: vshl.i16 q2, q0, #3 -; CHECK-NEXT: vand q3, q1, q5 -; CHECK-NEXT: vmov q1, q7 -; CHECK-NEXT: vand q2, q2, q6 -; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: subs r6, #8 +; CHECK-NEXT: vshr.u16 q3, q0, #3 +; CHECK-NEXT: vand q3, q3, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmla.i16 q1, q3, r2 +; CHECK-NEXT: vshl.i16 q3, q0, #3 +; CHECK-NEXT: vand q3, q3, q4 +; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vshr.u16 q1, q1, #5 +; CHECK-NEXT: vmla.i16 q4, q3, r2 +; CHECK-NEXT: vshr.u16 q3, q4, #11 +; CHECK-NEXT: vand q1, q1, q7 +; CHECK-NEXT: vorr q1, q1, q3 ; CHECK-NEXT: vshr.u16 q0, q0, #9 -; CHECK-NEXT: vmla.i16 q4, q2, r2 -; CHECK-NEXT: vshr.u16 q2, q4, #11 -; CHECK-NEXT: vmov q4, q5 -; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vmov q6, q7 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vorr q1, q3, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmla.i16 q2, q0, r2 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.i16 q3, #0x78 +; CHECK-NEXT: vmov.i16 q4, #0xf8 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmla.i16 q3, q0, r2 +; CHECK-NEXT: vand q0, q3, q5 ; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: vmov.i16 q1, #0xfc ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrht.16 q0, [r5], #16 ; CHECK-NEXT: le lr, .LBB0_4 @@ -190,7 +185,7 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrsh.w r12, [r2, #2] ; CHECK-NEXT: cmp.w r12, #1 -; CHECK-NEXT: blt.w .LBB1_7 +; CHECK-NEXT: blt .LBB1_7 ; CHECK-NEXT: @ %bb.1: @ %for.cond3.preheader.lr.ph ; CHECK-NEXT: ldrsh.w r2, [r2] ; CHECK-NEXT: cmp r2, #1 @@ -200,71 +195,70 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: sub sp, #80 -; CHECK-NEXT: ldr r7, [sp, #168] +; CHECK-NEXT: ldr r7, [sp, #88] ; CHECK-NEXT: movs r5, #120 ; CHECK-NEXT: lsls r6, r3, #3 ; CHECK-NEXT: movs r4, #252 ; CHECK-NEXT: and.w r5, r5, r3, lsr #9 ; CHECK-NEXT: uxtb r6, r6 ; CHECK-NEXT: and.w r3, r4, r3, lsr #3 +; CHECK-NEXT: adds r4, r2, #7 ; CHECK-NEXT: muls r6, r7, r6 +; CHECK-NEXT: bic r4, r4, #7 ; CHECK-NEXT: mul lr, r3, r7 -; CHECK-NEXT: vdup.16 q0, r6 -; 
CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vdup.16 q0, lr ; CHECK-NEXT: muls r5, r7, r5 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q0, #0xfc -; CHECK-NEXT: mov.w r6, #2016 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vdup.16 q0, r5 ; CHECK-NEXT: rsb.w r3, r7, #256 ; CHECK-NEXT: lsls r7, r1, #1 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vdup.16 q0, r6 +; CHECK-NEXT: sub.w r1, r4, #8 +; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: vmov.i16 q2, #0xf8 -; CHECK-NEXT: vmov.i16 q5, #0x78 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q6, #0xf800 +; CHECK-NEXT: add.w r1, r4, r1, lsr #3 +; CHECK-NEXT: vdup.16 q6, r6 +; CHECK-NEXT: mov.w r6, #2016 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vdup.16 q3, lr +; CHECK-NEXT: vdup.16 q5, r5 +; CHECK-NEXT: vdup.16 q7, r6 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB1_3: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_4 Depth 2 ; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: dlstp.16 lr, r2 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrh.u16 q0, [r5] +; CHECK-NEXT: vctp.16 r6 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r5] ; CHECK-NEXT: vshl.i16 q1, q0, #3 -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: subs r6, #8 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmla.i16 q3, q1, r3 -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.f64 d9, d5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vshr.u16 q2, q0, #9 +; CHECK-NEXT: vmov.i16 q2, #0x78 +; CHECK-NEXT: vshr.u16 q4, q0, #9 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov q2, q6 +; CHECK-NEXT: vmla.i16 q2, q1, r3 ; CHECK-NEXT: vshr.u16 q0, q0, #3 +; CHECK-NEXT: vmov.i16 q1, #0xfc ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: vmla.i16 q1, q0, r3 -; CHECK-NEXT: vand q2, q2, q5 -; CHECK-NEXT: vshr.u16 q0, q3, #11 -; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vshr.u16 q0, q2, #11 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmla.i16 q2, q4, r3 ; CHECK-NEXT: vshr.u16 q1, q1, #5 -; CHECK-NEXT: vmla.i16 q3, q2, r3 +; CHECK-NEXT: vmov.i16 q4, #0xf800 ; CHECK-NEXT: vand q1, q1, q7 ; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vand q1, q3, q6 +; CHECK-NEXT: vand q1, q2, q4 +; CHECK-NEXT: vmov.i16 q2, #0xf8 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vstrh.16 q0, [r5], #16 -; CHECK-NEXT: vmov.f64 d4, d8 -; CHECK-NEXT: vmov.f64 d5, d9 -; CHECK-NEXT: letp lr, .LBB1_4 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r5], #16 +; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1 ; CHECK-NEXT: adds r4, #1 @@ -272,7 +266,6 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK-NEXT: cmp r4, r12 ; CHECK-NEXT: bne .LBB1_3 ; CHECK-NEXT: @ %bb.6: -; CHECK-NEXT: add sp, #80 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, lr} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 
07c06e1..1769c5d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -17,17 +17,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr { ; ENABLED-LABEL: varying_outer_2d_reduction: ; ENABLED: @ %bb.0: @ %entry -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; ENABLED-NEXT: sub sp, #4 ; ENABLED-NEXT: cmp r3, #1 -; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill -; ENABLED-NEXT: blt .LBB0_8 -; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph -; ENABLED-NEXT: ldr r0, [sp, #36] -; ENABLED-NEXT: add.w r12, r2, #3 -; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; ENABLED-NEXT: mov.w r8, #0 -; ENABLED-NEXT: mov r9, r12 +; ENABLED-NEXT: it lt +; ENABLED-NEXT: bxlt lr +; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: mov r11, r0 +; ENABLED-NEXT: ldr r0, [sp, #32] +; ENABLED-NEXT: add.w r9, r2, #3 +; ENABLED-NEXT: mov.w r12, #0 +; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 ; ENABLED-NEXT: b .LBB0_4 @@ -37,31 +36,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: lsrs r0, r0, #16 ; ENABLED-NEXT: sub.w r9, r9, #1 -; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1] -; ENABLED-NEXT: add.w r8, r8, #1 +; ENABLED-NEXT: strh.w r0, [r1, r12, lsl #1] +; ENABLED-NEXT: add.w r12, r12, #1 ; ENABLED-NEXT: add.w r10, r10, #2 -; ENABLED-NEXT: cmp r8, r3 +; ENABLED-NEXT: cmp r12, r3 ; ENABLED-NEXT: beq .LBB0_8 ; ENABLED-NEXT: .LBB0_4: @ %for.body ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 ; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2 -; ENABLED-NEXT: cmp r2, r8 +; ENABLED-NEXT: cmp r2, r12 ; ENABLED-NEXT: ble .LBB0_2 ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: sub.w r4, r2, r8 +; ENABLED-NEXT: sub.w r4, r2, r12 ; ENABLED-NEXT: vmov.i32 q1, #0x0 ; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 -; ENABLED-NEXT: sub.w r0, r12, r8 +; ENABLED-NEXT: adds r0, r2, #3 +; ENABLED-NEXT: sub.w r0, r0, r12 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 ; ENABLED-NEXT: dls lr, r0 -; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload +; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 @@ -82,23 +82,22 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: vpsel q0, q1, q0 ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 -; ENABLED-NEXT: .LBB0_8: @ %for.end17 -; ENABLED-NEXT: add sp, #4 -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; ENABLED-NEXT: .LBB0_8: +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: bx lr ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: ; NOREDUCTIONS: @ %bb.0: @ %entry -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; NOREDUCTIONS-NEXT: sub sp, #4 ; NOREDUCTIONS-NEXT: cmp r3, #1 -; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill -; NOREDUCTIONS-NEXT: blt .LBB0_8 -; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: ldr r0, 
[sp, #36] -; NOREDUCTIONS-NEXT: add.w r12, r2, #3 -; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; NOREDUCTIONS-NEXT: mov.w r8, #0 -; NOREDUCTIONS-NEXT: mov r9, r12 +; NOREDUCTIONS-NEXT: it lt +; NOREDUCTIONS-NEXT: bxlt lr +; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: mov r11, r0 +; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] +; NOREDUCTIONS-NEXT: add.w r9, r2, #3 +; NOREDUCTIONS-NEXT: mov.w r12, #0 +; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 ; NOREDUCTIONS-NEXT: b .LBB0_4 @@ -108,31 +107,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: lsrs r0, r0, #16 ; NOREDUCTIONS-NEXT: sub.w r9, r9, #1 -; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1] -; NOREDUCTIONS-NEXT: add.w r8, r8, #1 +; NOREDUCTIONS-NEXT: strh.w r0, [r1, r12, lsl #1] +; NOREDUCTIONS-NEXT: add.w r12, r12, #1 ; NOREDUCTIONS-NEXT: add.w r10, r10, #2 -; NOREDUCTIONS-NEXT: cmp r8, r3 +; NOREDUCTIONS-NEXT: cmp r12, r3 ; NOREDUCTIONS-NEXT: beq .LBB0_8 ; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 -; NOREDUCTIONS-NEXT: cmp r2, r8 +; NOREDUCTIONS-NEXT: cmp r2, r12 ; NOREDUCTIONS-NEXT: ble .LBB0_2 ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: sub.w r4, r2, r8 +; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 ; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: sub.w r0, r12, r8 +; NOREDUCTIONS-NEXT: adds r0, r2, #3 +; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 ; NOREDUCTIONS-NEXT: dls lr, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload +; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 @@ -153,9 +153,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: vpsel q0, q1, q0 ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 -; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 -; NOREDUCTIONS-NEXT: add sp, #4 -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; NOREDUCTIONS-NEXT: .LBB0_8: +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: bx lr entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll index e0a61b1..78dc35b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll @@ -49,18 +49,17 @@ define i32 @vcmp_new_vpst_combination(i32 %len, ptr nocapture readonly %arr) { ; CHECK-NEXT: cmp r0, #1 ; CHECK-NEXT: blt .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vmov.i32 q1, #0x1 +; CHECK-NEXT: vmov.i32 q0, #0x1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.32 lr, r0 ; CHECK-NEXT: .LBB1_2: @ %vector.body 
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q2, q1 -; CHECK-NEXT: vaddva.u32 r2, q2 +; CHECK-NEXT: vmovt q1, q0 +; CHECK-NEXT: vaddva.u32 r2, q1 ; CHECK-NEXT: letp lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index c8dd949..a904347 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -993,10 +993,10 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, #20 ; CHECK-NEXT: cmp r3, #8 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: blo.w .LBB16_12 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: lsrs.w r12, r3, #2 @@ -1016,50 +1016,48 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r1, r7, #2 ; CHECK-NEXT: rsbs r7, r4, #0 -; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: add.w r7, r3, #16 -; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: b .LBB16_6 ; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r5, r0, lsl #1 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add.w r6, r6, r0, lsl #1 ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_4: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: wls lr, r0, .LBB16_5 ; CHECK-NEXT: b .LBB16_10 ; CHECK-NEXT: .LBB16_5: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #8 -; CHECK-NEXT: add.w r0, r5, r0, lsl #1 +; CHECK-NEXT: add.w r0, r6, r0, lsl #1 ; CHECK-NEXT: add.w r5, r0, #8 ; CHECK-NEXT: beq.w .LBB16_12 ; CHECK-NEXT: .LBB16_6: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_8 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_11 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldrh.w lr, [r3, #14] ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 -; CHECK-NEXT: ldrh.w r8, [r3, #12] +; CHECK-NEXT: ldrh.w r10, [r3, #12] ; CHECK-NEXT: ldrh r7, [r3, #10] ; CHECK-NEXT: ldrh r4, [r3, #8] ; CHECK-NEXT: ldrh r6, [r3, #6] ; CHECK-NEXT: ldrh.w r9, [r3, #4] ; CHECK-NEXT: ldrh.w r11, [r3, #2] -; CHECK-NEXT: ldrh.w r10, [r3] +; CHECK-NEXT: ldrh.w r8, [r3] ; CHECK-NEXT: vstrb.8 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; 
CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT: adds r0, r5, #2
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmul.f16 q0, q0, r10
+; CHECK-NEXT: vmul.f16 q0, q0, r8
 ; CHECK-NEXT: adds r0, r5, #6
 ; CHECK-NEXT: vfma.f16 q0, q1, r11
 ; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
@@ -1068,73 +1066,73 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT: add.w r0, r5, #10
 ; CHECK-NEXT: vfma.f16 q0, q1, r6
 ; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT: add.w r6, r5, #16
 ; CHECK-NEXT: vfma.f16 q0, q1, r4
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
 ; CHECK-NEXT: add.w r0, r5, #14
 ; CHECK-NEXT: vfma.f16 q0, q1, r7
 ; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
-; CHECK-NEXT: adds r5, #16
-; CHECK-NEXT: vfma.f16 q0, q1, r8
+; CHECK-NEXT: vfma.f16 q0, q1, r10
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: vfma.f16 q0, q1, lr
 ; CHECK-NEXT: cmp r0, #16
 ; CHECK-NEXT: blo .LBB16_9
 ; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r5, r3, #16
 ; CHECK-NEXT: dls lr, r0
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT: .LBB16_8: @ %for.body
 ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r0, [r6], #16
-; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: adds r4, r5, #2
+; CHECK-NEXT: ldrh r0, [r5], #16
+; CHECK-NEXT: vldrw.u32 q1, [r6]
+; CHECK-NEXT: adds r4, r6, #2
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
 ; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-14]
-; CHECK-NEXT: adds r4, r5, #6
+; CHECK-NEXT: ldrh r0, [r5, #-14]
+; CHECK-NEXT: adds r4, r6, #6
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-12]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
+; CHECK-NEXT: ldrh r0, [r5, #-12]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #4]
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
 ; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-10]
-; CHECK-NEXT: add.w r4, r5, #10
+; CHECK-NEXT: ldrh r0, [r5, #-10]
+; CHECK-NEXT: add.w r4, r6, #10
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-8]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT: ldrh r0, [r5, #-8]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #8]
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
 ; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-6]
-; CHECK-NEXT: ldrh r4, [r6, #-2]
+; CHECK-NEXT: ldrh r0, [r5, #-6]
+; CHECK-NEXT: ldrh r4, [r5, #-2]
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-4]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
+; CHECK-NEXT: ldrh r0, [r5, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #12]
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: add.w r0, r5, #14
+; CHECK-NEXT: add.w r0, r6, #14
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: adds r5, #16
+; CHECK-NEXT: adds r6, #16
 ; CHECK-NEXT: vfma.f16 q0, q1, r4
 ; CHECK-NEXT: le lr, .LBB16_8
 ; CHECK-NEXT: b .LBB16_4
 ; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add.w r5, r3, #16
 ; CHECK-NEXT: b .LBB16_4
 ; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r0, r6
 ; CHECK-NEXT: .LBB16_11: @ %while.body76
 ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r4, [r6], #2
+; CHECK-NEXT: ldrh r4, [r5], #2
 ; CHECK-NEXT: vldrh.u16 q1, [r0], #2
 ; CHECK-NEXT: vfma.f16 q0, q1, r4
 ; CHECK-NEXT: le lr, .LBB16_11
 ; CHECK-NEXT: b .LBB16_3
 ; CHECK-NEXT: .LBB16_12: @ %if.end
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #20
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
 %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 28166e4..f7b4548 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -995,46 +995,44 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
 ; CHECK-NEXT: ldrh r6, [r0]
-; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: ldrd r4, r10, [r0, #4]
+; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: ldrd r7, r10, [r0, #4]
 ; CHECK-NEXT: sub.w r0, r6, #8
 ; CHECK-NEXT: add.w r3, r0, r0, lsr #29
 ; CHECK-NEXT: and r0, r0, #7
-; CHECK-NEXT: asrs r7, r3, #3
-; CHECK-NEXT: cmp r7, #1
+; CHECK-NEXT: asrs r5, r3, #3
+; CHECK-NEXT: cmp r5, #1
 ; CHECK-NEXT: it gt
-; CHECK-NEXT: asrgt r5, r3, #3
-; CHECK-NEXT: add.w r3, r4, r6, lsl #2
+; CHECK-NEXT: asrgt r4, r3, #3
+; CHECK-NEXT: add.w r3, r7, r6, lsl #2
 ; CHECK-NEXT: sub.w r9, r3, #4
 ; CHECK-NEXT: rsbs r3, r6, #0
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: add.w r3, r10, #32
-; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r4, [sp] @ 4-byte Spill
+; CHECK-NEXT: str r6, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: b .LBB16_6
 ; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add.w r4, r4, r0, lsl #2
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add.w r7, r7, r0, lsl #2
 ; CHECK-NEXT: b .LBB16_5
 ; CHECK-NEXT: .LBB16_4: @ %for.end
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldrd r0, r9, [sp, #12] @ 8-byte Folded Reload
 ; CHECK-NEXT: wls lr, r0, .LBB16_5
 ; CHECK-NEXT: b .LBB16_10
 ; CHECK-NEXT: .LBB16_5: @ %while.end
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT: subs.w r12, r12, #1
 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
-; CHECK-NEXT: add.w r0, r4, r0, lsl #2
-; CHECK-NEXT: add.w r4, r0, #16
+; CHECK-NEXT: add.w r0, r7, r0, lsl #2
+; CHECK-NEXT: add.w r7, r0, #16
 ; CHECK-NEXT: beq .LBB16_12
 ; CHECK-NEXT: .LBB16_6: @ %while.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -1042,76 +1040,76 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
 ; CHECK-NEXT: add.w lr, r10, #8
 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
-; CHECK-NEXT: ldrd r3, r7, [r10]
+; CHECK-NEXT: ldrd r3, r4, [r10]
 ; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr}
 ; CHECK-NEXT: ldrd r11, r8, [r10, #24]
 ; CHECK-NEXT: vstrb.8 q0, [r9], #16
-; CHECK-NEXT: vldrw.u32 q0, [r4], #32
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: str.w r9, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
+; CHECK-NEXT: vldrw.u32 q0, [r7], #32
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str.w r9, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-28]
 ; CHECK-NEXT: vmul.f32 q0, q0, r3
-; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
-; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
-; CHECK-NEXT: vfma.f32 q0, q1, r7
-; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
+; CHECK-NEXT: vldrw.u32 q6, [r7, #-24]
+; CHECK-NEXT: vldrw.u32 q4, [r7, #-20]
+; CHECK-NEXT: vfma.f32 q0, q1, r4
+; CHECK-NEXT: vldrw.u32 q5, [r7, #-16]
 ; CHECK-NEXT: vfma.f32 q0, q6, r0
-; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT: vldrw.u32 q2, [r7, #-12]
 ; CHECK-NEXT: vfma.f32 q0, q4, r5
-; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
+; CHECK-NEXT: vldrw.u32 q3, [r7, #-8]
 ; CHECK-NEXT: vfma.f32 q0, q5, r6
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-4]
 ; CHECK-NEXT: vfma.f32 q0, q2, lr
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: vfma.f32 q0, q3, r11
 ; CHECK-NEXT: vfma.f32 q0, q1, r8
 ; CHECK-NEXT: cmp r0, #16
 ; CHECK-NEXT: blo .LBB16_9
 ; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r10, #32
 ; CHECK-NEXT: dls lr, r0
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: .LBB16_8: @ %for.body
 ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11}
-; CHECK-NEXT: vldrw.u32 q1, [r4], #32
-; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
-; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
+; CHECK-NEXT: ldm.w r4, {r0, r3, r5, r6, r8, r11}
+; CHECK-NEXT: vldrw.u32 q1, [r7], #32
+; CHECK-NEXT: vldrw.u32 q6, [r7, #-24]
+; CHECK-NEXT: vldrw.u32 q4, [r7, #-20]
 ; CHECK-NEXT: vfma.f32 q0, q1, r0
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
-; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
-; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-28]
+; CHECK-NEXT: vldrw.u32 q5, [r7, #-16]
+; CHECK-NEXT: vldrw.u32 q2, [r7, #-12]
 ; CHECK-NEXT: vfma.f32 q0, q1, r3
-; CHECK-NEXT: ldrd r9, r1, [r7, #24]
+; CHECK-NEXT: ldrd r9, r1, [r4, #24]
 ; CHECK-NEXT: vfma.f32 q0, q6, r5
-; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
+; CHECK-NEXT: vldrw.u32 q3, [r7, #-8]
 ; CHECK-NEXT: vfma.f32 q0, q4, r6
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-4]
 ; CHECK-NEXT: vfma.f32 q0, q5, r8
-; CHECK-NEXT: adds r7, #32
+; CHECK-NEXT: adds r4, #32
 ; CHECK-NEXT: vfma.f32 q0, q2, r11
 ; CHECK-NEXT: vfma.f32 q0, q3, r9
 ; CHECK-NEXT: vfma.f32 q0, q1, r1
 ; CHECK-NEXT: le lr, .LBB16_8
 ; CHECK-NEXT: b .LBB16_4
 ; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r10, #32
 ; CHECK-NEXT: b .LBB16_4
 ; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: mov r3, r4
+; CHECK-NEXT: mov r3, r7
 ; CHECK-NEXT: .LBB16_11: @ %while.body76
 ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldr r0, [r7], #4
+; CHECK-NEXT: ldr r0, [r4], #4
 ; CHECK-NEXT: vldrw.u32 q1, [r3], #4
 ; CHECK-NEXT: vfma.f32 q0, q1, r0
 ; CHECK-NEXT: le lr, .LBB16_11
 ; CHECK-NEXT: b .LBB16_3
 ; CHECK-NEXT: .LBB16_12:
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #24
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index e8b49c1..0d86f22 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -711,8 +711,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #136
-; CHECK-NEXT: sub sp, #136
+; CHECK-NEXT: .pad #120
+; CHECK-NEXT: sub sp, #120
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill
 ; CHECK-NEXT: blt.w .LBB14_5
@@ -725,22 +725,20 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT: subs r1, #8
 ; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q2, #0x18
 ; CHECK-NEXT: add.w r1, r2, r1, lsr #3
 ; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill
 ; CHECK-NEXT: adr r1, .LCPI14_0
 ; CHECK-NEXT: adr r2, .LCPI14_1
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
 ; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
-; CHECK-NEXT: add r2, sp, #120
+; CHECK-NEXT: add r2, sp, #104
 ; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill
 ; CHECK-NEXT: .LBB14_2: @ %vector.ph
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB14_3 Depth 2
 ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, sp, #104
+; CHECK-NEXT: add.w r10, sp, #88
 ; CHECK-NEXT: dls lr, r1
 ; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload
 ; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload
@@ -762,7 +760,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT: vmov r6, r2, d4
 ; CHECK-NEXT: ldrh r1, [r1]
 ; CHECK-NEXT: ldrh.w r12, [r4]
-; CHECK-NEXT: add r4, sp, #88
+; CHECK-NEXT: add r4, sp, #72
 ; CHECK-NEXT: ldrh.w r11, [r5]
 ; CHECK-NEXT: ldrh r3, [r3]
 ; CHECK-NEXT: ldrh r5, [r6]
@@ -807,7 +805,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT: vmov.16 q3[0], r2
 ; CHECK-NEXT: vmov.16 q3[1], r5
 ; CHECK-NEXT: vmov r2, r5, d5
-; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vmov.i16 q2, #0x18
 ; CHECK-NEXT: vadd.i16 q6, q6, q2
 ; CHECK-NEXT: vadd.i16 q5, q5, q2
 ; CHECK-NEXT: vadd.i16 q4, q4, q2
@@ -849,7 +847,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT: cmp r1, r3
 ; CHECK-NEXT: bne.w .LBB14_2
 ; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #136
+; CHECK-NEXT: add sp, #120
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -950,7 +948,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT: vldrw.u32 q0, [r6]
 ; CHECK-NEXT: adr r6, .LCPI15_9
-; CHECK-NEXT: vmov.i32 q2, #0x30
 ; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT: vldrw.u32 q0, [r7]
 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
@@ -963,212 +960,213 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT: .LBB15_2: @ %vector.ph
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB15_3 Depth 2
+; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT: adr r1, .LCPI15_3
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: adr r1, .LCPI15_4
 ; CHECK-NEXT: vldrw.u32 q5, [r1]
+; CHECK-NEXT: adr r1, .LCPI15_4
+; CHECK-NEXT: vstrw.32 q2, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [r1]
 ; CHECK-NEXT: adr r1, .LCPI15_2
-; CHECK-NEXT: vldrw.u32 q3, [r1]
+; CHECK-NEXT: vstrw.32 q2, [sp, #280] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q1, [r1]
 ; CHECK-NEXT: adr r1, .LCPI15_10
-; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT: vldrw.u32 q3, [r1]
 ; CHECK-NEXT: adr r1, .LCPI15_11
 ; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT: vldrw.u32 q7, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
 ; CHECK-NEXT: mov r11, r10
-; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q0, [sp, #216] @ 16-byte Spill
 ; CHECK-NEXT: .LBB15_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vadd.i32 q4, q1, r0
-; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill
-; CHECK-NEXT: vmov r1, lr, d8
-; CHECK-NEXT: vadd.i32 q7, q7, r0
-; CHECK-NEXT: vmov r5, r4, d15
-; CHECK-NEXT: vadd.i32 q6, q0, r0
-; CHECK-NEXT: vmov r6, r7, d13
+; CHECK-NEXT: vmov q0, q7
+; CHECK-NEXT: vstrw.32 q7, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q7, q5, r0
+; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q5, q0, r0
+; CHECK-NEXT: vmov q0, q6
+; CHECK-NEXT: vadd.i32 q6, q4, r0
+; CHECK-NEXT: vmov r5, r4, d11
+; CHECK-NEXT: vmov r1, lr, d12
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vmov r6, r7, d15
 ; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vmov q1, q3
+; CHECK-NEXT: vstrw.32 q4, [sp, #168] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q4, [sp, #248] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #120] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q3, [sp, #136] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q3, [sp, #184] @ 16-byte Reload
 ; CHECK-NEXT: subs.w r11, r11, #16
-; CHECK-NEXT: ldrb.w r9, [r1]
-; CHECK-NEXT: vmov r1, r3, d14
 ; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: ldrb.w r9, [r1]
+; CHECK-NEXT: vmov r1, r3, d10
 ; CHECK-NEXT: ldrb r7, [r7]
 ; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[0], r1
+; CHECK-NEXT: vmov.8 q5[0], r1
 ; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[1], r1
-; CHECK-NEXT: vmov r1, r3, d12
-; CHECK-NEXT: vmov.8 q7[2], r5
+; CHECK-NEXT: vmov.8 q5[1], r1
+; CHECK-NEXT: vmov r1, r3, d14
+; CHECK-NEXT: vmov.8 q5[2], r5
 ; CHECK-NEXT: ldrb r5, [r6]
 ; CHECK-NEXT: ldrb r6, [r4]
-; CHECK-NEXT: vmov.8 q7[3], r6
+; CHECK-NEXT: vmov.8 q5[3], r6
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[0], r1
-; CHECK-NEXT: vmov r6, r1, d2
-; CHECK-NEXT: vmov.8 q6[1], r3
-; CHECK-NEXT: vmov.8 q6[2], r5
-; CHECK-NEXT: vmov.8 q6[3], r7
+; CHECK-NEXT: vmov.8 q7[0], r1
+; CHECK-NEXT: vmov r6, r1, d4
+; CHECK-NEXT: vmov.8 q7[1], r3
+; CHECK-NEXT: vmov.8 q7[2], r5
+; CHECK-NEXT: vmov.8 q7[3], r7
 ; CHECK-NEXT: ldrb.w r7, [lr]
-; CHECK-NEXT: vmov.8 q6[4], r9
-; CHECK-NEXT: vmov.8 q6[5], r7
+; CHECK-NEXT: vmov.8 q7[4], r9
+; CHECK-NEXT: vmov.8 q7[5], r7
 ; CHECK-NEXT: ldrb r4, [r1]
-; CHECK-NEXT: vmov r1, r5, d3
-; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload
+; CHECK-NEXT: vmov r1, r5, d5
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vldrw.u32 q1, [sp, #280] @ 16-byte Reload
 ; CHECK-NEXT: ldrb.w r12, [r1]
-; CHECK-NEXT: vmov r1, r3, d9
+; CHECK-NEXT: vmov r1, r3, d13
 ; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #232] @ 16-byte Reload
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[6], r1
-; CHECK-NEXT: vmov r1, r7, d0
-; CHECK-NEXT: vmov.8 q6[7], r3
+; CHECK-NEXT: vmov.8 q7[6], r1
+; CHECK-NEXT: vmov r1, r7, d4
+; CHECK-NEXT: vmov.8 q7[7], r3
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: vmov.8 q7[4], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[5], r7
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vmov.8 q5[4], r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vmov.8 q5[5], r7
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q7[6], r1
+; CHECK-NEXT: vmov.8 q5[6], r1
 ; CHECK-NEXT: ldrb r1, [r6]
-; CHECK-NEXT: vmov r7, r6, d0
-; CHECK-NEXT: vmov.8 q7[7], r3
-; CHECK-NEXT: vmov r3, lr, d1
-; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[8], r1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov.8 q7[9], r4
-; CHECK-NEXT: vmov r4, r1, d0
-; CHECK-NEXT: vmov.8 q7[10], r12
-; CHECK-NEXT: vmov.8 q7[11], r5
+; CHECK-NEXT: vmov.8 q5[7], r3
+; CHECK-NEXT: vmov r7, r6, d4
+; CHECK-NEXT: vmov r3, lr, d5
+; CHECK-NEXT: vmov.8 q5[8], r1
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vmov.8 q5[9], r4
+; CHECK-NEXT: vmov r4, r1, d4
+; CHECK-NEXT: vmov.8 q5[10], r12
+; CHECK-NEXT: vmov.8 q5[11], r5
+; CHECK-NEXT: vldrw.u32 q1, [sp, #264] @ 16-byte Reload
 ; CHECK-NEXT: ldrb r7, [r7]
 ; CHECK-NEXT: ldrb r6, [r6]
 ; CHECK-NEXT: ldrb r3, [r3]
 ; CHECK-NEXT: ldrb r4, [r4]
 ; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q6[8], r4
-; CHECK-NEXT: vmov r5, r4, d1
-; CHECK-NEXT: vmov.8 q6[9], r1
-; CHECK-NEXT: vadd.i32 q0, q5, r0
-; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
+; CHECK-NEXT: vmov.8 q7[8], r4
+; CHECK-NEXT: vmov r5, r4, d5
+; CHECK-NEXT: vmov.8 q7[9], r1
+; CHECK-NEXT: vadd.i32 q2, q0, r0
+; CHECK-NEXT: vldrw.u32 q0, [sp, #216] @ 16-byte Reload
 ; CHECK-NEXT: ldrb r5, [r5]
 ; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: vmov.8 q6[10], r5
-; CHECK-NEXT: vmov.8 q6[11], r4
-; CHECK-NEXT: vmov.8 q6[12], r7
-; CHECK-NEXT: vmov.8 q6[13], r6
-; CHECK-NEXT: vmov.8 q6[14], r3
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov.8 q7[10], r5
+; CHECK-NEXT: vmov.8 q7[11], r4
+; CHECK-NEXT: vmov.8 q7[12], r7
+; CHECK-NEXT: vmov.8 q7[13], r6
+; CHECK-NEXT: vmov.8 q7[14], r3
+; CHECK-NEXT: vmov r1, r3, d4
 ; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[12], r1
+; CHECK-NEXT: vmov.8 q5[12], r1
 ; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q1, r0
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vmov.8 q5[13], r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q1, r0
 ; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[14], r1
+; CHECK-NEXT: vmov.8 q5[14], r1
 ; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[15], r1
+; CHECK-NEXT: vmov.8 q5[15], r1
 ; CHECK-NEXT: ldrb.w r1, [lr]
-; CHECK-NEXT: vmov.8 q6[15], r1
-; CHECK-NEXT: vmov r1, r3, d0
-; CHECK-NEXT: vadd.i8 q6, q6, q7
+; CHECK-NEXT: vmov.8 q7[15], r1
+; CHECK-NEXT: vmov r1, r3, d4
+; CHECK-NEXT: vadd.i8 q5, q7, q5
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: ldrb r3, [r3]
 ; CHECK-NEXT: vmov.8 q7[0], r1
 ; CHECK-NEXT: vmov.8 q7[1], r3
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q3, r0
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q4, r0
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[2], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[3], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[4], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[5], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q5, r0
-; CHECK-NEXT: vadd.i32 q5, q5, q2
-; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q5, q5, q2
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q6, r0
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[6], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[7], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[8], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[9], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q4, r0
-; CHECK-NEXT: vadd.i32 q4, q4, q2
-; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q0, r0
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[10], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[11], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[12], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d1
+; CHECK-NEXT: vmov r1, r3, d5
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[14], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[15], r1
-; CHECK-NEXT: vadd.i8 q0, q6, q7
-; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload
-; CHECK-NEXT: vstrb.8 q0, [r8], #16
-; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q7, q7, q2
+; CHECK-NEXT: vadd.i8 q2, q5, q7
+; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
+; CHECK-NEXT: vstrb.8 q2, [r8], #16
+; CHECK-NEXT: vmov.i32 q2, #0x30
+; CHECK-NEXT: vadd.i32 q6, q6, q2
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [sp, #296] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vadd.i32 q4, q4, q2
+; CHECK-NEXT: vadd.i32 q6, q6, q2
 ; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vmov q7, q3
+; CHECK-NEXT: vldrw.u32 q3, [sp, #136] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q1, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q4, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q4, [sp, #168] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q6, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [sp, #120] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #216] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q5, q5, q2
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vadd.i32 q4, q4, q2
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vadd.i32 q6, q6, q2
+; CHECK-NEXT: vstrw.32 q0, [sp, #280] @ 16-byte Spill
 ; CHECK-NEXT: bne.w .LBB15_3
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1
@@ -1501,14 +1499,14 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: blt .LBB18_3
 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: adr.w lr, .LCPI18_0
+; CHECK-NEXT: adr r3, .LCPI18_0
 ; CHECK-NEXT: adr r4, .LCPI18_1
 ; CHECK-NEXT: adr r5, .LCPI18_2
 ; CHECK-NEXT: adr r6, .LCPI18_3
 ; CHECK-NEXT: vldrw.u32 q0, [r6]
 ; CHECK-NEXT: vldrw.u32 q1, [r5]
 ; CHECK-NEXT: vldrw.u32 q2, [r4]
-; CHECK-NEXT: vldrw.u32 q3, [lr]
+; CHECK-NEXT: vldrw.u32 q3, [r3]
 ; CHECK-NEXT: vadd.i32 q0, q0, r1
 ; CHECK-NEXT: vadd.i32 q1, q1, r1
 ; CHECK-NEXT: vadd.i32 q2, q2, r1
diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index dad856c..00a998c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -38,7 +38,7 @@ define arm_aapcs_vfpcc void @k() {
 ; CHECK-NEXT: vmov.i32 q5, #0x0
 ; CHECK-NEXT: vpsel q6, q4, q3
 ; CHECK-NEXT: vstrh.16 q6, [r0]
-; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vmov.i32 q6, #0x0
 ; CHECK-NEXT: cbz r1, .LBB0_2
 ; CHECK-NEXT: le .LBB0_1
 ; CHECK-NEXT: .LBB0_2: @ %for.cond4.preheader
@@ -135,12 +135,12 @@ vector.body115: ; preds = %vector.body115, %ve
 define dso_local i32 @e() #0 {
 ; CHECK-LABEL: e:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #408
-; CHECK-NEXT: sub sp, #408
+; CHECK-NEXT: .pad #392
+; CHECK-NEXT: sub sp, #392
 ; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals
 ; CHECK-NEXT: vldr s15, .LCPI1_1
 ; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals
@@ -148,18 +148,16 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT: mov r4, r7
 ; CHECK-NEXT: mov r3, r7
 ; CHECK-NEXT: ldr r6, [r4, #8]!
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: ldr r0, [r3, #4]!
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT: movt r2, :upper16:e
+; CHECK-NEXT: ldr r0, [r3, #4]!
 ; CHECK-NEXT: vmov r5, s15
 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r4
-; CHECK-NEXT: vmov s13, r3
 ; CHECK-NEXT: vldr s12, .LCPI1_0
+; CHECK-NEXT: vmov s13, r3
 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r2
 ; CHECK-NEXT: vdup.32 q7, r3
 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r5
-; CHECK-NEXT: vstrw.32 q0, [sp, #92]
+; CHECK-NEXT: vstrw.32 q0, [sp, #76]
 ; CHECK-NEXT: vmov q0, q7
 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT: vmov q4, q7
@@ -168,7 +166,7 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT: vmov s21, r2
 ; CHECK-NEXT: movs r1, #64
 ; CHECK-NEXT: vmov.f32 s20, s12
-; CHECK-NEXT: str r0, [sp, #40]
+; CHECK-NEXT: str r0, [sp, #24]
 ; CHECK-NEXT: vmov.f32 s22, s13
 ; CHECK-NEXT: str r6, [r0]
 ; CHECK-NEXT: vmov.f32 s23, s15
@@ -186,12 +184,12 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r5
 ; CHECK-NEXT: vmov.32 q4[0], r8
 ; CHECK-NEXT: @ implicit-def: $r2
-; CHECK-NEXT: str.w r8, [sp, #44]
-; CHECK-NEXT: vstrw.32 q3, [sp, #60]
-; CHECK-NEXT: strh.w r12, [sp, #406]
+; CHECK-NEXT: str.w r8, [sp, #28]
+; CHECK-NEXT: vstrw.32 q3, [sp, #44]
+; CHECK-NEXT: strh.w r12, [sp, #390]
 ; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2
 ; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
 ; CHECK-NEXT: letp lr, .LBB1_1
 ; CHECK-NEXT: .LBB1_2: @ %entry
@@ -199,7 +197,7 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT: str.w r8, [r7]
 ; CHECK-NEXT: vstrw.32 q4, [r0]
 ; CHECK-NEXT: vstrw.32 q2, [r0]
-; CHECK-NEXT: str.w r12, [sp, #324]
+; CHECK-NEXT: str.w r12, [sp, #308]
 ; CHECK-NEXT: .LBB1_3: @ %for.cond
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: b .LBB1_3
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index f90af3c..2587a0bb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -115,17 +115,17 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: movs r4, #1
 ; CHECK-NEXT: ldr r3, [r0]
 ; CHECK-NEXT: add.w r11, r3, r12, lsl #2
-; CHECK-NEXT: add.w r7, r3, r12, lsl #3
-; CHECK-NEXT: lsl.w r9, r12, #3
+; CHECK-NEXT: add.w r6, r3, r12, lsl #3
+; CHECK-NEXT: lsl.w r10, r12, #3
 ; CHECK-NEXT: .LBB1_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
 ; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r9, r4, #1
 ; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: add.w r10, r4, #1
 ; CHECK-NEXT: mov r3, r11
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: dlstp.32 lr, r12
 ; CHECK-NEXT: .LBB1_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1
@@ -139,11 +139,11 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT: vadd.f32 s2, s2, s3
-; CHECK-NEXT: add.w r0, r2, r10, lsl #2
+; CHECK-NEXT: add.w r0, r2, r9, lsl #2
 ; CHECK-NEXT: vadd.f32 s0, s0, s1
-; CHECK-NEXT: add r11, r9
+; CHECK-NEXT: add r11, r10
 ; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: add r7, r9
+; CHECK-NEXT: add r6, r10
 ; CHECK-NEXT: vadd.f32 s4, s4, s5
 ; CHECK-NEXT: vadd.f32 s0, s0, s2
 ; CHECK-NEXT: vadd.f32 s2, s4, s6
@@ -228,46 +228,40 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9}
 ; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: .pad #24
-; CHECK-NEXT: sub sp, #24
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
-; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: subs r1, #3
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
 ; CHECK-NEXT: blo .LBB2_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r9, [r0, #8]
 ; CHECK-NEXT: movs r5, #1
 ; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r0, r3, r3, lsl #1
-; CHECK-NEXT: add.w r9, r1, r3, lsl #2
-; CHECK-NEXT: add.w r12, r1, r3, lsl #3
-; CHECK-NEXT: adds r3, #3
+; CHECK-NEXT: add.w r3, r9, #3
 ; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, r1, r0, lsl #2
+; CHECK-NEXT: add.w r0, r9, r9, lsl #1
 ; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: add.w r10, r1, r9, lsl #2
+; CHECK-NEXT: add.w r12, r1, r9, lsl #3
+; CHECK-NEXT: add.w r1, r1, r0, lsl #2
+; CHECK-NEXT: add.w r3, r5, r3, lsr #2
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT: lsl.w r11, r0, #2
-; CHECK-NEXT: add.w r1, r5, r3, lsr #2
-; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB2_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
-; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: adds r0, r5, #2
-; CHECK-NEXT: adds r2, r5, #1
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: mov r3, r9
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: mov r3, r10
 ; CHECK-NEXT: mov r0, r12
-; CHECK-NEXT: mov r4, r10
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: mov r4, r1
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: dlstp.32 lr, r9
 ; CHECK-NEXT: .LBB2_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
@@ -282,31 +276,31 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
 ; CHECK-NEXT: vadd.f32 s10, s10, s11
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r5, #1
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
-; CHECK-NEXT: add r9, r11
+; CHECK-NEXT: add r10, r11
 ; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: add.w r0, r1, r2, lsl #2
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s4, s4, s5
 ; CHECK-NEXT: add r12, r11
 ; CHECK-NEXT: vadd.f32 s2, s2, s3
-; CHECK-NEXT: add r10, r11
+; CHECK-NEXT: add r1, r11
 ; CHECK-NEXT: vadd.f32 s0, s0, s1
 ; CHECK-NEXT: vadd.f32 s8, s8, s10
 ; CHECK-NEXT: vadd.f32 s4, s4, s6
 ; CHECK-NEXT: vadd.f32 s0, s0, s2
 ; CHECK-NEXT: vstr s8, [r0]
-; CHECK-NEXT: add.w r0, r1, r5, lsl #2
-; CHECK-NEXT: adds r5, #3
+; CHECK-NEXT: add.w r0, r2, r5, lsl #2
 ; CHECK-NEXT: vstr s4, [r0]
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add.w r0, r1, r0, lsl #2
+; CHECK-NEXT: adds r0, r5, #2
+; CHECK-NEXT: adds r5, #3
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
 ; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: cmp r5, r0
 ; CHECK-NEXT: blo .LBB2_2
 ; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -394,15 +388,15 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: .pad #40
-; CHECK-NEXT: sub sp, #40
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
-; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: subs r1, #4
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
-; CHECK-NEXT: blo.w .LBB3_5
+; CHECK-NEXT: blo .LBB3_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT: ldr r2, [r0, #8]
 ; CHECK-NEXT: movs r6, #1
@@ -410,34 +404,28 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: add.w r0, r2, r2, lsl #1
 ; CHECK-NEXT: add.w r12, r1, r2, lsl #2
 ; CHECK-NEXT: add.w r8, r1, r2, lsl #3
-; CHECK-NEXT: add.w r9, r1, r2, lsl #4
-; CHECK-NEXT: add.w r11, r1, r0, lsl #2
+; CHECK-NEXT: add.w r10, r1, r2, lsl #4
+; CHECK-NEXT: add.w r9, r1, r0, lsl #2
 ; CHECK-NEXT: adds r0, r2, #3
 ; CHECK-NEXT: bic r0, r0, #3
 ; CHECK-NEXT: subs r0, #4
 ; CHECK-NEXT: add.w r0, r6, r0, lsr #2
-; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r0, r2, [sp, #4] @ 8-byte Folded Spill
 ; CHECK-NEXT: lsls r0, r2, #4
-; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldrd r2, r7, [sp, #4] @ 8-byte Folded Reload
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB3_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
-; CHECK-NEXT: adds r0, r6, #3
-; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r0, r6, #2
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r0, r6, #1
-; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT: mov r3, r12
 ; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: mov r5, r11
-; CHECK-NEXT: mov r4, r9
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: mov r5, r9
+; CHECK-NEXT: mov r4, r10
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
 ; CHECK-NEXT: dlstp.32 lr, r7
 ; CHECK-NEXT: .LBB3_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
@@ -455,9 +443,9 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
 ; CHECK-NEXT: vadd.f32 s14, s14, s15
-; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: vadd.f32 s12, s12, s13
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #1
 ; CHECK-NEXT: vadd.f32 s10, s10, s11
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2
@@ -471,24 +459,24 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vadd.f32 s0, s0, s2
 ; CHECK-NEXT: vstr s12, [r0]
 ; CHECK-NEXT: add.w r0, r1, r6, lsl #2
-; CHECK-NEXT: adds r6, #4
 ; CHECK-NEXT: vstr s8, [r0]
-; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #2
 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2
 ; CHECK-NEXT: vstr s4, [r0]
-; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #3
+; CHECK-NEXT: adds r6, #4
 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2
 ; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT: add r12, r0
 ; CHECK-NEXT: add r8, r0
-; CHECK-NEXT: add r11, r0
 ; CHECK-NEXT: add r9, r0
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add r10, r0
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT: cmp r6, r0
 ; CHECK-NEXT: blo .LBB3_2
 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #40
+; CHECK-NEXT: add sp, #24
 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -588,60 +576,53 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
 ; CHECK-NEXT: subs r1, #5
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
 ; CHECK-NEXT: blo.w .LBB4_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r12, [r0, #8]
 ; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r12, #3
 ; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r8, r1, r3, lsl #2
+; CHECK-NEXT: add.w r8, r1, r12, lsl #2
 ; CHECK-NEXT: subs r1, r0, #4
 ; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r12, #2
 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r3, r3, lsl #2
-; CHECK-NEXT: lsls r1, r1, #2
 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: add.w r1, r12, r12, lsl #2
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB4_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: add.w r10, r0, #2
+; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: add.w r11, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: dlstp.32 lr, r12
 ; CHECK-NEXT: .LBB4_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT: add.w r9, r3, r5
 ; CHECK-NEXT: vldrw.u32 q5, [r4], #16
 ; CHECK-NEXT: vldrw.u32 q6, [r3], #16
-; CHECK-NEXT: add.w r12, r9, r5
+; CHECK-NEXT: add.w r10, r9, r5
 ; CHECK-NEXT: vfma.f32 q3, q6, q5
 ; CHECK-NEXT: vldrw.u32 q6, [r9]
-; CHECK-NEXT: add.w r6, r12, r5
+; CHECK-NEXT: add.w r6, r10, r5
 ; CHECK-NEXT: vfma.f32 q4, q6, q5
-; CHECK-NEXT: vldrw.u32 q6, [r12]
+; CHECK-NEXT: vldrw.u32 q6, [r10]
 ; CHECK-NEXT: adds r7, r6, r5
 ; CHECK-NEXT: vfma.f32 q2, q6, q5
 ; CHECK-NEXT: vldrw.u32 q6, [r6]
@@ -662,30 +643,31 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
 ; CHECK-NEXT: vadd.f32 s0, s0, s1
 ; CHECK-NEXT: vadd.f32 s1, s16, s18
-; CHECK-NEXT: vadd.f32 s2, s2, s3
 ; CHECK-NEXT: vadd.f32 s12, s12, s14
+; CHECK-NEXT: vadd.f32 s2, s2, s3
 ; CHECK-NEXT: vadd.f32 s4, s4, s6
 ; CHECK-NEXT: vadd.f32 s6, s8, s10
 ; CHECK-NEXT: vstr s1, [r1]
 ; CHECK-NEXT: add.w r1, r2, r0, lsl #2
-; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: adds r0, #5
 ; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: add.w r1, r2, r10, lsl #2
+; CHECK-NEXT: adds r1, r0, #2
+; CHECK-NEXT: vadd.f32 s0, s0, s2
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #3
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: adds r0, #5
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT: add r8, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: blo.w .LBB4_2
+; CHECK-NEXT: blo .LBB4_2
 ; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -797,63 +779,54 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
 ; CHECK-NEXT: subs r1, #6
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
 ; CHECK-NEXT: blo.w .LBB5_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r12, [r0, #8]
 ; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r12, #3
 ; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r8, r1, r3, lsl #2
+; CHECK-NEXT: add.w r8, r1, r12, lsl #2
 ; CHECK-NEXT: subs r1, r0, #4
 ; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r12, #2
 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r3, r3, lsl #1
+; CHECK-NEXT: add.w r1, r12, r12, lsl #1
 ; CHECK-NEXT: lsls r1, r1, #3
 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB5_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: add.w r11, r0, #2
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: adds r4, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmov q5, q1
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: dlstp.32 lr, r12
 ; CHECK-NEXT: .LBB5_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: add.w r12, r3, r5
+; CHECK-NEXT: add.w r10, r3, r5
 ; CHECK-NEXT: vldrw.u32 q6, [r1], #16
 ; CHECK-NEXT: vldrw.u32 q7, [r3], #16
-; CHECK-NEXT: add.w r10, r12, r5
+; CHECK-NEXT: add.w r11, r10, r5
 ; CHECK-NEXT: vfma.f32 q4, q7, q6
-; CHECK-NEXT: vldrw.u32 q7, [r12]
-; CHECK-NEXT: add.w r6, r10, r5
-; CHECK-NEXT: vfma.f32 q5, q7, q6
 ; CHECK-NEXT: vldrw.u32 q7, [r10]
+; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vfma.f32 q5, q7, q6
+; CHECK-NEXT: vldrw.u32 q7, [r11]
 ; CHECK-NEXT: adds r7, r6, r5
 ; CHECK-NEXT: vfma.f32 q2, q7, q6
 ; CHECK-NEXT: vldrw.u32 q7, [r6]
@@ -885,28 +858,29 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vstr s1, [r1]
 ; CHECK-NEXT: add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: adds r0, #6
 ; CHECK-NEXT: vstr s3, [r1]
-; CHECK-NEXT: add.w r1, r2, r11, lsl #2
+; CHECK-NEXT: adds r1, r0, #2
 ; CHECK-NEXT: vadd.f32 s4, s4, s6
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #3
 ; CHECK-NEXT: vadd.f32 s6, s12, s14
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: adds r0, #6
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s4, [r1]
 ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT: add r8, r1
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: cmp r0, r1
 ; CHECK-NEXT: blo.w .LBB5_2
 ; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1030,73 +1004,64 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #72
-; CHECK-NEXT: sub sp, #72
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #48
+; CHECK-NEXT: sub sp, #48
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
 ; CHECK-NEXT: subs r1, #7
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
 ; CHECK-NEXT: blo.w .LBB6_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r10, [r0, #8]
 ; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r10, #3
 ; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r9, r1, r3, lsl #2
+; CHECK-NEXT: add.w r9, r1, r10, lsl #2
 ; CHECK-NEXT: subs r1, r0, #4
 ; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r10, #2
 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: rsb r1, r3, r3, lsl #3
-; CHECK-NEXT: lsls r1, r1, #2
 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: rsb r1, r10, r10, lsl #3
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB6_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #6
-; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vmov.i32 q2, #0x0
-; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: adds r4, r0, #2
 ; CHECK-NEXT: add.w r8, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
 ; CHECK-NEXT: mov r3, r9
-; CHECK-NEXT: vmov q4, q2
-; CHECK-NEXT: vmov q5, q2
-; CHECK-NEXT: vmov q3, q2
-; CHECK-NEXT: vmov q6, q2
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: mov r12, r7
-; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r6
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q6, #0x0
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: mov r12, r10
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r7
 ; CHECK-NEXT: .LBB6_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: add.w r10, r3, r5
+; CHECK-NEXT: add.w r11, r3, r5
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
-; CHECK-NEXT: add.w r11, r10, r5
+; CHECK-NEXT: add.w r6, r11, r5
 ; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q5, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r10]
-; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vldrwt.u32 q0, [r11]
+; CHECK-NEXT: adds r7, r6, r5
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q6, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r11]
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT: vmov q6, q5
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q1, q0, q7
@@ -1104,26 +1069,26 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vmov q4, q3
 ; CHECK-NEXT: vmov q3, q1
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q0, [r6]
-; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: adds r7, r6, r5
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q1, q0, q7
 ; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT: adds r6, r7, r5
-; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q1, q0, q7
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT: vmov q1, q3
 ; CHECK-NEXT: vmov q3, q4
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q3, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vldrwt.u32 q0, [r7]
 ; CHECK-NEXT: vmov q4, q5
-; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: adds r6, r7, r5
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q4, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
 ; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q2, q0, q7
 ; CHECK-NEXT: le lr, .LBB6_3
@@ -1138,45 +1103,45 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vadd.f32 s4, s4, s5
 ; CHECK-NEXT: vadd.f32 s10, s10, s11
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
-; CHECK-NEXT: vadd.f32 s0, s2, s0
 ; CHECK-NEXT: vadd.f32 s9, s18, s19
 ; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: vadd.f32 s2, s3, s1
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vadd.f32 s0, s2, s0
 ; CHECK-NEXT: vadd.f32 s5, s18, s19
 ; CHECK-NEXT: vadd.f32 s7, s16, s17
+; CHECK-NEXT: vadd.f32 s2, s3, s1
 ; CHECK-NEXT: vadd.f32 s4, s4, s6
-; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s14, s14, s15
-; CHECK-NEXT: adds r0, #7
 ; CHECK-NEXT: vadd.f32 s12, s12, s13
-; CHECK-NEXT: vstr s2, [r1]
-; CHECK-NEXT: add.w r1, r2, r4, lsl #2
+; CHECK-NEXT: vstr s0, [r1]
+; CHECK-NEXT: add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s8, s8, s10
 ; CHECK-NEXT: vadd.f32 s6, s7, s5
-; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: vstr s2, [r1]
+; CHECK-NEXT: add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT: vadd.f32 s10, s11, s9
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: adds r1, r0, #3
 ; CHECK-NEXT: vadd.f32 s12, s12, s14
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s10, [r1]
-; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #6
+; CHECK-NEXT: adds r0, #7
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT: add r9, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: cmp r0, r1
 ; CHECK-NEXT: blo.w .LBB6_2
 ; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #72
+; CHECK-NEXT: add sp, #48
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1312,107 +1277,99 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #88
-; CHECK-NEXT: sub sp, #88
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #64
+; CHECK-NEXT: sub sp, #64
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
 ; CHECK-NEXT: subs r1, #8
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
 ; CHECK-NEXT: blo.w .LBB7_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r11, [r0, #8]
 ; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r11, #3
 ; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r12, r1, r3, lsl #2
+; CHECK-NEXT: add.w r12, r1, r11, lsl #2
 ; CHECK-NEXT: subs r1, r0, #4
 ; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r6, r3, #2
+; CHECK-NEXT: lsl.w r6, r11, #2
 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: lsls r1, r3, #5
 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: lsl.w r1, r11, #5
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB7_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #7
-; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #6
-; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: vmov.i32 q3, #0x0
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: adds r4, r0, #3
 ; CHECK-NEXT: add.w r8, r0, #2
 ; CHECK-NEXT: adds r1, r0, #1
-; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
 ; CHECK-NEXT: mov r3, r12
-; CHECK-NEXT: vmov q5, q3
-; CHECK-NEXT: vmov q6, q3
-; CHECK-NEXT: vmov q4, q3
-; CHECK-NEXT: vmov q7, q3
-; CHECK-NEXT: vmov q2, q3
-; CHECK-NEXT: mov r10, r7
-; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r5
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q6, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q7, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: mov r10, r11
+; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r7
 ; CHECK-NEXT: .LBB7_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT: vctp.32 r10
-; CHECK-NEXT: add.w r11, r3, r6
+; CHECK-NEXT: adds r5, r3, r6
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
-; CHECK-NEXT: add.w r5, r11, r6
+; CHECK-NEXT: adds r7, r5, r6
 ; CHECK-NEXT: sub.w r10, r10, #4
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q6, q1, q0
-; CHECK-NEXT: vldrwt.u32 q1, [r11]
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT: vmov q6, q5
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q7, q1, q0
-; CHECK-NEXT: vmov q5, q3
-; CHECK-NEXT: vmov q3, q4
-; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: vmov q3, q2
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: adds r7, r5, r6
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q2, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r7]
-; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT: adds r5, r7, r6
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT: adds r7, r5, r6
-; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT: vmov q2, q4
-; CHECK-NEXT: vmov q4, q3
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r7]
 ; CHECK-NEXT: adds r5, r7, r6
-; CHECK-NEXT: vmov q3, q5
+; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vmov q2, q3
+; CHECK-NEXT: vmov q3, q4
 ; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vfmat.f32 q2, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vmov q4, q5
+; CHECK-NEXT: adds r7, r5, r6
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vldrwt.u32 q1, [r7]
 ; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: add r5, r6
+; CHECK-NEXT: adds r5, r7, r6
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q5, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q3, q1, q0
 ; CHECK-NEXT: le lr, .LBB7_3
@@ -1425,12 +1382,12 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vadd.f32 s6, s24, s25
 ; CHECK-NEXT: vadd.f32 s5, s18, s19
 ; CHECK-NEXT: vadd.f32 s7, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT: vadd.f32 s10, s10, s11
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
 ; CHECK-NEXT: vadd.f32 s9, s18, s19
 ; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT: vadd.f32 s14, s14, s15
 ; CHECK-NEXT: vadd.f32 s12, s12, s13
 ; CHECK-NEXT: vadd.f32 s13, s18, s19
@@ -1445,33 +1402,33 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vstr s0, [r1]
 ; CHECK-NEXT: add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s3, s20, s21
-; CHECK-NEXT: adds r0, #8
 ; CHECK-NEXT: vstr s2, [r1]
 ; CHECK-NEXT: add.w r1, r2, r8, lsl #2
 ; CHECK-NEXT: vadd.f32 s12, s7, s5
 ; CHECK-NEXT: vstr s10, [r1]
 ; CHECK-NEXT: add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT: vstr s14, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: vadd.f32 s4, s3, s1
+; CHECK-NEXT: adds r1, r0, #4
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vadd.f32 s4, s3, s1
 ; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #6
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #7
+; CHECK-NEXT: adds r0, #8
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT: add r12, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: cmp r0, r1
 ; CHECK-NEXT: blo.w .LBB7_2
 ; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #88
+; CHECK-NEXT: add sp, #64
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
index 29c4fb9..413c4a1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
@@ -1496,15 +1496,14 @@ define void @vfmasq(ptr %x, ptr %y, i32 %n) {
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
 ; CHECK-NEXT: .LBB34_1: @ %for.body.preheader
-; CHECK-NEXT: vmov.f32 q0, #1.000000e+01
 ; CHECK-NEXT: dlstp.32 lr, r2
 ; CHECK-NEXT: .LBB34_2: @ %for.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r1], #16
+; CHECK-NEXT: vmov.f32 q2, #1.000000e+01
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vstrw.32 q2, [r1], #16
 ; CHECK-NEXT: letp lr, .LBB34_2
 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}
@@ -1542,15 +1541,14 @@ define void @vfmas(ptr %s1, ptr %s2, i32 %N) {
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
 ; CHECK-NEXT: .LBB35_1: @ %while.body.lr.ph
-; CHECK-NEXT: vmov.f32 q0, #1.000000e+01
 ; CHECK-NEXT: dlstp.32 lr, r2
 ; CHECK-NEXT: .LBB35_2: @ %while.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r0], #16
+; CHECK-NEXT: vmov.f32 q2, #1.000000e+01
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vstrw.32 q2, [r0], #16
 ; CHECK-NEXT: letp lr, .LBB35_2
 ; CHECK-NEXT: @ %bb.3: @ %while.end
 ; CHECK-NEXT: pop {r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
index e845070..62482c1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -287,17 +287,17 @@ define void @shlor(ptr nocapture readonly %x, ptr noalias nocapture %y, i32 %n)
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: blt .LBB5_3
 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: adr.w lr, .LCPI5_0
-; CHECK-NEXT: adr r4, .LCPI5_1
+; CHECK-NEXT: adr r4, .LCPI5_0
+; CHECK-NEXT: adr r3, .LCPI5_1
 ; CHECK-NEXT: adr r5, .LCPI5_2
 ; CHECK-NEXT: adr r6, .LCPI5_3
-; CHECK-NEXT: vldrw.u32 q2, [r4]
+; CHECK-NEXT: vldrw.u32 q2, [r3]
+; CHECK-NEXT: vldrw.u32 q3, [r4]
 ; CHECK-NEXT: vldrw.u32 q0, [r6]
 ; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q3, [lr]
+; CHECK-NEXT: vadd.i32 q2, q2, r1
 ; CHECK-NEXT: vadd.i32 q0, q0, r1
 ; CHECK-NEXT: vadd.i32 q1, q1, r1
-; CHECK-NEXT: vadd.i32 q2, q2, r1
 ; CHECK-NEXT: vadd.i32 q3, q3, r1
 ; CHECK-NEXT: mov.w r12, #1
 ; CHECK-NEXT: movs r4, #3
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index f9948db..c92c2be 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -656,14 +656,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
 ; CHECK-NEXT: vmov q2, q0
 ; CHECK-NEXT: vcmp.i8 eq, q1, zr
 ; CHECK-NEXT: vmov.i8 q0, #0x0
 ; CHECK-NEXT: vmov.i8 q1, #0xff
 ; CHECK-NEXT: vpsel q5, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q0, #0x0
 ; CHECK-NEXT: vmov.u8 r0, q5[0]
 ; CHECK-NEXT: vmov.16 q3[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[1]
@@ -706,7 +704,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT: orrs r1, r3
 ; CHECK-NEXT: add r0, r2
 ; CHECK-NEXT: vmov r2, r3, d15
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0x0
 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
 ; CHECK-NEXT: vmov.u8 r2, q2[3]
@@ -785,6 +783,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
 ; CHECK-NEXT: vpsel q6, q1, q7
+; CHECK-NEXT: vmov.i8 q7, #0x0
 ; CHECK-NEXT: vmov r2, r3, d12
 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -853,7 +852,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT: vmov r2, r3, d1
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
@@ -2065,14 +2063,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT: push {r7, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
 ; CHECK-NEXT: vmov q2, q0
 ; CHECK-NEXT: vcmp.i8 eq, q1, zr
 ; CHECK-NEXT: vmov.i8 q0, #0x0
 ; CHECK-NEXT: vmov.i8 q1, #0xff
 ; CHECK-NEXT: vpsel q5, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q0, #0x0
 ; CHECK-NEXT: vmov.u8 r2, q5[0]
 ; CHECK-NEXT: vmov.16 q3[0], r2
 ; CHECK-NEXT: vmov.u8 r2, q5[1]
@@ -2115,7 +2111,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT: orr.w lr, lr, r3
 ; CHECK-NEXT: add
r12, r2 ; CHECK-NEXT: vmov r3, r2, d15 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vmov.u8 r2, q2[3] @@ -2194,6 +2190,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpsel q6, q1, q7 +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -2264,7 +2261,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: adc.w r3, r3, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll index 63b1431..9f55183 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -817,16 +817,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q6, q2, q0 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vmov.u8 r0, q6[0] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov.16 q0[0], r0 ; CHECK-NEXT: vmov.u8 r0, q6[1] ; CHECK-NEXT: vmov.16 q0[1], r0 @@ -842,9 +840,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q0[6], r0 ; CHECK-NEXT: vmov.u8 r0, q6[7] ; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vmov.u8 r2, q3[0] +; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vpsel q7, q2, q4 ; CHECK-NEXT: vmov.u16 r0, q7[2] ; CHECK-NEXT: vmov.u16 r1, q7[0] @@ -895,7 +892,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 @@ -916,8 +913,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.u16 r3, q7[5] ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q4, q0 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 @@ -932,7 +929,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r0, s30 ; CHECK-NEXT: vmov r1, s28 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0xff ; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, 
r3 @@ -960,7 +957,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 @@ -1041,7 +1038,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 @@ -1062,7 +1059,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.u16 r3, q6[5] ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q0, #0xff ; CHECK-NEXT: vpsel q0, q0, q4 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 @@ -1117,7 +1114,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: @@ -1137,16 +1133,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q5, q2, q0 -; CHECK-NEXT: vmov.s8 r2, q1[0] +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.u8 r0, q5[0] -; CHECK-NEXT: vmov.s8 r3, q3[0] +; CHECK-NEXT: vmov.s8 r2, q1[0] ; CHECK-NEXT: vmov.16 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q5[1] ; CHECK-NEXT: vmov.16 q4[1], r0 @@ -1162,9 +1156,9 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q4[6], r0 ; CHECK-NEXT: vmov.u8 r0, q5[7] ; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.s8 r3, q3[0] ; CHECK-NEXT: vcmp.i16 ne, q4, zr -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vpsel q6, q2, q0 ; CHECK-NEXT: vmov.u16 r0, q6[2] ; CHECK-NEXT: vmov.u16 r1, q6[0] @@ -1198,7 +1192,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.s8 r3, q3[3] ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 @@ -1219,7 +1213,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q6, q2, q0 ; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -1273,17 +1268,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> 
%y, <1 ; CHECK-NEXT: vmov.16 q6[7], r2 ; CHECK-NEXT: vmov.s8 r0, q1[8] ; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vmov.i8 q6, #0x0 +; CHECK-NEXT: vpsel q5, q2, q6 ; CHECK-NEXT: vmov.s8 r1, q3[8] -; CHECK-NEXT: vpsel q5, q2, q7 -; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov.u16 r2, q5[2] ; CHECK-NEXT: vmov.u16 r3, q5[0] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q5[3] ; CHECK-NEXT: vmov.u16 r3, q5[1] +; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vpsel q6, q2, q6 ; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -1365,7 +1361,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: @@ -2296,16 +2291,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q6, q2, q0 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vmov.u8 r2, q6[0] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: vmov.u8 r2, q6[1] ; CHECK-NEXT: vmov.16 q0[1], r2 @@ -2321,9 +2314,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.16 q0[6], r2 ; CHECK-NEXT: vmov.u8 r2, q6[7] ; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vmov.u8 r4, q3[2] +; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vpsel q7, q2, q4 ; CHECK-NEXT: vmov.u16 r2, q7[2] ; CHECK-NEXT: vmov.u16 r3, q7[0] @@ -2374,7 +2366,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: umull r2, r3, r3, r2 ; CHECK-NEXT: umull r4, r5, r5, r4 @@ -2395,8 +2387,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.u16 r4, q7[5] ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q4, q0 ; CHECK-NEXT: vmov r5, r4, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 ; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 @@ -2411,7 +2403,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r5, s18 ; CHECK-NEXT: vmov r2, s30 ; CHECK-NEXT: vmov r3, s28 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0xff ; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 @@ -2439,7 +2431,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x 
i8> %y ; CHECK-NEXT: vmov r5, s2 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 @@ -2520,7 +2512,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r5, s2 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 @@ -2541,7 +2533,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.u16 r4, q6[5] ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q0, #0xff ; CHECK-NEXT: vpsel q0, q0, q4 ; CHECK-NEXT: vmov r5, r4, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 @@ -2598,7 +2590,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -2619,14 +2610,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q5, q2, q0 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.u8 r2, q5[0] ; CHECK-NEXT: vmov.s8 r4, q1[2] ; CHECK-NEXT: vmov.16 q4[0], r2 @@ -2676,7 +2665,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov r2, r3, d15 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov.s8 r2, q1[3] ; CHECK-NEXT: vmov.s8 r3, q3[3] @@ -2701,7 +2690,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q6, q2, q0 ; CHECK-NEXT: vmov r5, r4, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 @@ -2755,17 +2745,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.16 q6[7], r5 ; CHECK-NEXT: vmov.s8 r2, q1[8] ; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vmov.i8 q6, #0x0 +; CHECK-NEXT: vpsel q5, q2, q6 ; CHECK-NEXT: vmov.s8 r3, q3[8] -; CHECK-NEXT: vpsel q5, q2, q7 -; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov.u16 r5, q5[2] ; CHECK-NEXT: vmov.u16 r4, q5[0] ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 ; CHECK-NEXT: vmov.u16 r5, q5[3] ; CHECK-NEXT: vmov.u16 r4, q5[1] +; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vpsel q6, q2, q6 ; CHECK-NEXT: vmov r5, r4, d12 ; CHECK-NEXT: 
vmov q0[2], q0[0], r5, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 @@ -2849,7 +2840,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll index 0c349c3..cba394f 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll @@ -59,18 +59,18 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: [[MOVSX64rm32_:%[0-9]+]]:gr64_nosp = MOVSX64rm32 %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[MOV32rm2]].sub_32bit ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16) - ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_1]] :: (store (s64) into %stack.5) ; CHECK-NEXT: [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[MOV32rm3]] - ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[COPY6]] + ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm3]] ; CHECK-NEXT: [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]] ; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8) + ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[COPY1]] :: (store (s64) into %stack.5) ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY7]], [[MOVSX64rr32_2]], implicit-def dead $eflags - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY7]], [[MOVSX64rm32_]], implicit-def dead $eflags - ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY7]], 0, $noreg + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY6]], [[MOVSX64rr32_2]], implicit-def dead $eflags + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY6]], [[MOVSX64rm32_]], implicit-def dead $eflags + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY6]], 0, $noreg ; CHECK-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[MOV32rm2]] ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = IMUL64rr [[COPY8]], [[MOVSX64rr32_2]], implicit-def dead $eflags @@ -87,8 +87,11 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) ; CHECK-NEXT: CMP32rm [[MOV32rm4]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16) - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load 
(s64) from %stack.3) - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2) + ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64_nosp = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) + ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm6]] ; CHECK-NEXT: JCC_1 %bb.5, 13, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.3 ; CHECK-NEXT: {{ $}} @@ -98,9 +101,8 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm2]], [[MOVSX64rr32_]], implicit-def dead $eflags ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm2]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1) - ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[MOV64rm2]] :: (store (s64) into %stack.13) - ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12) - ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm5]] + ; CHECK-NEXT: [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12) + ; CHECK-NEXT: undef [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm7]] ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) ; CHECK-NEXT: JMP_1 %bb.6 @@ -123,40 +125,30 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: bb.6.for.body17: ; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit - ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13) - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm7]], 1, [[MOVSX64rr32_]], 0, $noreg - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOV64rm1]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64 = COPY [[MOV32rm2]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64 = COPY [[COPY1]] - ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY18]], 1, [[COPY9]], 0, $noreg - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]] - ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2) - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, 
[[COPY10]].sub_16bit, [[LEA64r2]], 1, [[COPY19]], 0, $noreg - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = COPY [[COPY19]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY18]] - ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr64_nosp = COPY [[COPY17]] - ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY16]] - ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY15]] - ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY14]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[COPY13]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[COPY12]] - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = COPY [[COPY11]] - ; CHECK-NEXT: [[MOV64rm8:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) - ; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = COPY [[COPY10]] - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY6]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]] - ; CHECK-NEXT: PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]] + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY9]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm2]], 1, [[MOVSX64rr32_]], 0, $noreg + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY10]].sub_32bit + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[COPY7]] + ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[MOV64rm7]], 1, [[COPY10]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm]], 0, $noreg + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[COPY15]] + ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY14]] + ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY13]] + ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY12]] + ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = COPY [[COPY11]] + ; CHECK-NEXT: [[MOV32rm8:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm8]] + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY9]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]] + ; CHECK-NEXT: PTILESTOREDV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]] ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm4]], [[MOVSX64rr32_3]], implicit-def dead $eflags - ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm8]], implicit-def dead $eflags - ; CHECK-NEXT: [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY9]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags - ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOV64rm1]], implicit-def $eflags + ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm1]], implicit-def dead $eflags + ; CHECK-NEXT: [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr 
[[COPY10]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags + ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOVSX64rm32_1]], implicit-def $eflags ; CHECK-NEXT: JCC_1 %bb.6, 12, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.5 entry: diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll index bf6b096..b428ce4 100644 --- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll +++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll @@ -136,8 +136,6 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: movl (%r8), %edx ; CHECK-NEXT: leal 8(,%rbx,8), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: leaq 8(%rsi), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: leaq 32(%rsi), %r11 ; CHECK-NEXT: leaq 8(,%rbx,8), %rbx ; CHECK-NEXT: xorl %r14d, %r14d @@ -189,7 +187,8 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: jae .LBB1_7 ; CHECK-NEXT: # %bb.6: # %vector.memcheck ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; CHECK-NEXT: leaq 8(%rsi), %r9 +; CHECK-NEXT: addq %r9, %rax ; CHECK-NEXT: leaq (%rax,%r10,8), %rax ; CHECK-NEXT: cmpq %r15, %rax ; CHECK-NEXT: ja .LBB1_14 diff --git a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir index 10ee445..d355374 100644 --- a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir +++ b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir @@ -7,8 +7,8 @@ # CHECK: jne # CHECK: andl $-16, %edx # CHECK: xorl %ebx, %ebx -# CHECK: movl -16(%ebp), %esi -# CHECK: xorl %eax, %eax +# CHECK: xorl %esi, %esi +# CHECK: movl %eax, %ecx name: test tracksRegLiveness: true diff --git a/llvm/test/CodeGen/X86/inalloca-invoke.ll b/llvm/test/CodeGen/X86/inalloca-invoke.ll index c2728f7..68cb24d 100644 --- a/llvm/test/CodeGen/X86/inalloca-invoke.ll +++ b/llvm/test/CodeGen/X86/inalloca-invoke.ll @@ -23,7 +23,6 @@ blah: ; CHECK: pushl %eax ; CHECK: subl $20, %esp ; CHECK: movl %esp, %[[beg:[^ ]*]] -; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]] call void @begin(ptr sret(%Iter) %temp.lvalue) ; CHECK: calll _begin @@ -32,6 +31,7 @@ blah: to label %invoke.cont unwind label %lpad ; Uses end as sret param. +; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]] ; CHECK: pushl %[[end]] ; CHECK: calll _plus diff --git a/llvm/test/CodeGen/X86/licm-regpressure.ll b/llvm/test/CodeGen/X86/licm-regpressure.ll index 72a4832..26ed2a3 100644 --- a/llvm/test/CodeGen/X86/licm-regpressure.ll +++ b/llvm/test/CodeGen/X86/licm-regpressure.ll @@ -1,14 +1,64 @@ -; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -; This tests currently fails as MachineLICM does not compute register pressure -; correctly. More details: llvm.org/PR23143 -; XFAIL: * +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-linux -stop-after=early-machinelicm -o - | FileCheck %s -; MachineLICM should take register pressure into account. -; CHECK-NOT: Spill +; FIXME: MachineLICM does not compute register pressure correctly and we end up +; emitting too many ADD64ri32s. 
More details: llvm.org/PR23143 %struct.A = type { i32, i32, i32, i32, i32, i32, i32 } define void @test(i1 %b, ptr %a) nounwind { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $edi, $rsi + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $edi + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr8 = COPY [[COPY1]].sub_8bit + ; CHECK-NEXT: [[ADD64ri32_:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 4, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_1:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 8, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_2:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 12, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_3:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 16, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_4:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 20, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_5:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 24, implicit-def dead $eflags + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop-body: + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[COPY]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_1]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_2]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_3]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def 
dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_4]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_5]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: TEST8ri [[COPY2]], 1, implicit-def $eflags + ; CHECK-NEXT: JCC_1 %bb.1, 5, implicit $eflags + ; CHECK-NEXT: JMP_1 %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.loop-exit: + ; CHECK-NEXT: RET 0 entry: br label %loop-header diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll index 7f32766..0ccaa9c 100644 --- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll +++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll @@ -172,10 +172,8 @@ define float @clamp_negative_wrong_const(float %x) { ; Like @clamp_test_1 but both are min define float @clamp_negative_same_op(float %x) { ; CHECK-LABEL: @clamp_negative_same_op( -; CHECK-NEXT: [[INNER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[INNER_SEL:%.*]] = select nnan ninf i1 [[INNER_CMP_INV]], float 2.550000e+02, float [[X]] -; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ult float [[X]], 1.000000e+00 -; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 1.000000e+00 +; CHECK-NEXT: [[OUTER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 1.000000e+00 +; CHECK-NEXT: [[R:%.*]] = select nnan ninf i1 [[OUTER_CMP_INV]], float 1.000000e+00, float [[X]] ; CHECK-NEXT: ret float [[R]] ; %inner_cmp = fcmp fast ult float %x, 255.0 diff --git a/llvm/test/Transforms/InstCombine/select-gep.ll b/llvm/test/Transforms/InstCombine/select-gep.ll index dd8dffb..7181336 100644 --- a/llvm/test/Transforms/InstCombine/select-gep.ll +++ b/llvm/test/Transforms/InstCombine/select-gep.ll @@ -286,3 +286,35 @@ define <2 x ptr> @test7(<2 x ptr> %p1, i64 %idx, <2 x i1> %cc) { %select = select <2 x i1> %cc, <2 x ptr> %p1, <2 x ptr> %gep ret <2 x ptr> %select } + +define ptr @ptr_eq_replace_freeze1(ptr %p, ptr %q) { +; CHECK-LABEL: @ptr_eq_replace_freeze1( +; CHECK-NEXT: [[Q_FR:%.*]] = freeze ptr [[Q:%.*]] +; CHECK-NEXT: [[Q_FR1:%.*]] = freeze ptr [[Q1:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[Q_FR]], [[Q_FR1]] +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], ptr [[Q_FR]], ptr [[Q_FR1]] +; CHECK-NEXT: ret ptr [[SELECT]] +; + %p.fr = freeze ptr %p + %q.fr = freeze ptr %q + %cmp = icmp eq ptr %p.fr, %q.fr + %select = select i1 %cmp, ptr %p.fr, ptr %q.fr + ret ptr %select +} + +define ptr @ptr_eq_replace_freeze2(ptr %p, ptr %q) { +; CHECK-LABEL: @ptr_eq_replace_freeze2( +; CHECK-NEXT: [[P_FR:%.*]] = freeze ptr [[P:%.*]] +; CHECK-NEXT: [[P_FR1:%.*]] = freeze ptr [[P1:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr 
[[P_FR1]], [[P_FR]] +; CHECK-NEXT: [[SELECT_V:%.*]] = select i1 [[CMP]], ptr [[P_FR1]], ptr [[P_FR]] +; CHECK-NEXT: [[SELECT:%.*]] = getelementptr i8, ptr [[SELECT_V]], i64 16 +; CHECK-NEXT: ret ptr [[SELECT]] +; + %gep1 = getelementptr i32, ptr %p, i64 4 + %gep2 = getelementptr i32, ptr %q, i64 4 + %cmp = icmp eq ptr %p, %q + %cmp.fr = freeze i1 %cmp + %select = select i1 %cmp.fr, ptr %gep1, ptr %gep2 + ret ptr %select +} diff --git a/llvm/test/Transforms/InstSimplify/domcondition.ll b/llvm/test/Transforms/InstSimplify/domcondition.ll index 43be5de..2893bb1 100644 --- a/llvm/test/Transforms/InstSimplify/domcondition.ll +++ b/llvm/test/Transforms/InstSimplify/domcondition.ll @@ -278,3 +278,210 @@ end: } declare void @foo(i32) + + +define i1 @simplify_fcmp_implied_by_dom_cond_range_true(float %x) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_range_true( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 true +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, 0.0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp olt float %x, 1.0 + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_in_else_implied_by_dom_cond_range_true(float %x) { +; CHECK-LABEL: @simplify_fcmp_in_else_implied_by_dom_cond_range_true( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 1.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 true +; CHECK: if.else: +; CHECK-NEXT: ret i1 true +; + %cmp = fcmp olt float %x, 1.0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + ret i1 true + +if.else: + %cmp2 = fcmp uge float %x, 0.5 + ret i1 %cmp2 +} + +define i1 @simplify_fcmp_implied_by_dom_cond_range_false(float %x) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_range_false( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 false +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, 0.0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp ogt float %x, 1.0 + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_pred_true(float %x, float %y) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_true( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 true +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp ole float %x, %y + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_pred_false(float %x, float %y) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_false( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 false +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp ogt float %x, %y + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_pred_commuted(float %x, float %y) { +; 
CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_commuted( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 true +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp oge float %y, %x + ret i1 %cmp2 + +if.else: + ret i1 false +} + +; Negative tests + +define i1 @simplify_fcmp_implied_by_dom_cond_wrong_range(float %x) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_wrong_range( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[X]], -1.000000e+00 +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, 0.0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp olt float %x, -1.0 + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_range_mismatched_operand(float %x, float %y) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_range_mismatched_operand( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[Y:%.*]], 1.000000e+00 +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, 0.0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp olt float %y, 1.0 + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_wrong_pred(float %x, float %y) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_wrong_pred( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ole float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp ole float %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp olt float %x, %y + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_pred_mismatched_operand(float %x, float %y, float %z) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_mismatched_operand( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP2:%.*]] = fcmp ole float [[X]], [[Z:%.*]] +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp ole float %x, %z + ret i1 %cmp2 + +if.else: + ret i1 false +} diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c index 0f09c74..e376d82 100644 --- a/llvm/tools/llvm-c-test/debuginfo.c +++ b/llvm/tools/llvm-c-test/debuginfo.c @@ -325,6 +325,13 @@ int llvm_test_dibuilder(void) { LLVMValueRef Phi2 = LLVMBuildPhi(Builder, I64, "p2"); LLVMAddIncoming(Phi2, &Zero, &FooEntryBlock, 1); + // Test that LLVMGetFirstDbgRecord and LLVMGetLastDbgRecord return NULL for + // instructions without debug info. 
+  LLVMDbgRecordRef Phi1FirstDbgRecord = LLVMGetFirstDbgRecord(Phi1);
+  assert(Phi1FirstDbgRecord == NULL);
+  LLVMDbgRecordRef Phi1LastDbgRecord = LLVMGetLastDbgRecord(Phi1);
+  assert(Phi1LastDbgRecord == NULL);
+
   // Insert a non-phi before the `ret` but not before the debug records to
   // test that works as expected.
   LLVMPositionBuilder(Builder, FooVarBlock, Ret);
diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp
index ba55079..2a89961 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -642,7 +642,8 @@ int main(int argc, char **argv) {
                     : MAB->createObjectWriter(*OS),
                 std::unique_ptr<MCCodeEmitter>(CE), *STI));
     if (NoExecStack)
-      Str->switchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx));
+      Str->switchSection(
+          Ctx.getAsmInfo()->getStackSection(Ctx, /*Exec=*/false));
     Str->emitVersionForTarget(TheTriple, VersionTuple(), nullptr,
                               VersionTuple());
   }
diff --git a/llvm/unittests/ADT/TypeSwitchTest.cpp b/llvm/unittests/ADT/TypeSwitchTest.cpp
index c54b798..a7d9342 100644
--- a/llvm/unittests/ADT/TypeSwitchTest.cpp
+++ b/llvm/unittests/ADT/TypeSwitchTest.cpp
@@ -114,3 +114,31 @@ TEST(TypeSwitchTest, CasesOptional) {
   EXPECT_EQ(std::nullopt, translate(DerivedC()));
   EXPECT_EQ(-1, translate(DerivedD()));
 }
+
+TEST(TypeSwitchTest, DefaultUnreachableWithValue) {
+  auto translate = [](auto value) {
+    return TypeSwitch<Base *, int>(&value)
+        .Case([](DerivedA *) { return 0; })
+        .DefaultUnreachable("Unhandled type");
+  };
+  EXPECT_EQ(0, translate(DerivedA()));
+
+#if defined(GTEST_HAS_DEATH_TEST) && !defined(NDEBUG)
+  EXPECT_DEATH((void)translate(DerivedD()), "Unhandled type");
+#endif
+}
+
+TEST(TypeSwitchTest, DefaultUnreachableWithVoid) {
+  auto translate = [](auto value) {
+    int result = -1;
+    TypeSwitch<Base *>(&value)
+        .Case([&result](DerivedA *) { result = 0; })
+        .DefaultUnreachable("Unhandled type");
+    return result;
+  };
+  EXPECT_EQ(0, translate(DerivedA()));
+
+#if defined(GTEST_HAS_DEATH_TEST) && !defined(NDEBUG)
+  EXPECT_DEATH((void)translate(DerivedD()), "Unhandled type");
+#endif
+}
diff --git a/mlir/docs/Tutorials/CreatingADialect.md b/mlir/docs/Tutorials/CreatingADialect.md
index af709fc..e30b80bc 100644
--- a/mlir/docs/Tutorials/CreatingADialect.md
+++ b/mlir/docs/Tutorials/CreatingADialect.md
@@ -22,7 +22,7 @@ typically defined in FooDialect.cpp, which includes FooOps.cpp.inc and
 FooOpsInterfaces.h.inc.
 
 The 'Transforms' directory contains rewrite rules for the dialect,
-typically described in TableGen file using the [DDR
+typically described in TableGen file using the [DRR
 format](../DeclarativeRewrites.md).
 
 Note that dialect names should not generally be suffixed with “Ops”,
diff --git a/mlir/include/mlir-c/Rewrite.h b/mlir/include/mlir-c/Rewrite.h
index 77be1f4..5dd285e 100644
--- a/mlir/include/mlir-c/Rewrite.h
+++ b/mlir/include/mlir-c/Rewrite.h
@@ -101,6 +101,12 @@ mlirRewriterBaseGetInsertionBlock(MlirRewriterBase rewriter);
 
 MLIR_CAPI_EXPORTED MlirBlock
 mlirRewriterBaseGetBlock(MlirRewriterBase rewriter);
 
+/// Returns the operation right after the current insertion point
+/// of the rewriter. A null MlirOperation will be returned
+/// if the current insertion point is at the end of the block.
+MLIR_CAPI_EXPORTED MlirOperation
+mlirRewriterBaseGetOperationAfterInsertion(MlirRewriterBase rewriter);
+
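For orientation, here is a minimal sketch of how a client might consume the two insertion-point queries declared above. The helper below is invented for illustration and is not part of this patch; it assumes a valid `MlirRewriterBase` obtained elsewhere:

```cpp
#include "mlir-c/IR.h"
#include "mlir-c/Rewrite.h"

// Illustrative only: report where the rewriter will place new operations.
// mlirRewriterBaseGetOperationAfterInsertion returns a null MlirOperation
// when the insertion point sits at the end of its block.
static void dumpOperationAfterInsertion(MlirRewriterBase rewriter) {
  MlirOperation next = mlirRewriterBaseGetOperationAfterInsertion(rewriter);
  if (mlirOperationIsNull(next)) {
    // At block end: new operations are appended to the insertion block.
    MlirBlock block = mlirRewriterBaseGetInsertionBlock(rewriter);
    (void)block;
    return;
  }
  // Otherwise, new operations are created immediately before `next`.
  mlirOperationDump(next);
}
```

This mirrors what `PyPatternRewriter::getInsertionPoint` in the Python bindings below does: a null result maps to a block-anchored insertion point, a non-null result to an operation-anchored one.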
 //===----------------------------------------------------------------------===//
 /// Block and operation creation/insertion/cloning
 //===----------------------------------------------------------------------===//
@@ -311,6 +317,14 @@ MLIR_CAPI_EXPORTED MlirLogicalResult mlirApplyPatternsAndFoldGreedily(
     MlirGreedyRewriteDriverConfig);
 
 //===----------------------------------------------------------------------===//
+/// PatternRewriter API
+//===----------------------------------------------------------------------===//
+
+/// Cast the PatternRewriter to a RewriterBase
+MLIR_CAPI_EXPORTED MlirRewriterBase
+mlirPatternRewriterAsBase(MlirPatternRewriter rewriter);
+
+//===----------------------------------------------------------------------===//
 /// PDLPatternModule API
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 32b2b0c..7b17106 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -2046,6 +2046,9 @@ PyInsertionPoint::PyInsertionPoint(PyOperationBase &beforeOperationBase)
     : refOperation(beforeOperationBase.getOperation().getRef()),
       block((*refOperation)->getBlock()) {}
 
+PyInsertionPoint::PyInsertionPoint(PyOperationRef beforeOperationRef)
+    : refOperation(beforeOperationRef), block((*refOperation)->getBlock()) {}
+
 void PyInsertionPoint::insert(PyOperationBase &operationBase) {
   PyOperation &operation = operationBase.getOperation();
   if (operation.isAttached())
diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h
index edbd73e..e706be3b 100644
--- a/mlir/lib/Bindings/Python/IRModule.h
+++ b/mlir/lib/Bindings/Python/IRModule.h
@@ -841,6 +841,8 @@ public:
   PyInsertionPoint(const PyBlock &block);
   /// Creates an insertion point positioned before a reference operation.
   PyInsertionPoint(PyOperationBase &beforeOperationBase);
+  /// Creates an insertion point positioned before a reference operation.
+  PyInsertionPoint(PyOperationRef beforeOperationRef);
   /// Shortcut to create an insertion point at the beginning of the block.
static PyInsertionPoint atBlockBegin(PyBlock &block); diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp index 836f44fd..9e3d970 100644 --- a/mlir/lib/Bindings/Python/Rewrite.cpp +++ b/mlir/lib/Bindings/Python/Rewrite.cpp @@ -26,6 +26,30 @@ using namespace mlir::python; namespace { +class PyPatternRewriter { +public: + PyPatternRewriter(MlirPatternRewriter rewriter) + : base(mlirPatternRewriterAsBase(rewriter)), + ctx(PyMlirContext::forContext(mlirRewriterBaseGetContext(base))) {} + + PyInsertionPoint getInsertionPoint() const { + MlirBlock block = mlirRewriterBaseGetInsertionBlock(base); + MlirOperation op = mlirRewriterBaseGetOperationAfterInsertion(base); + + if (mlirOperationIsNull(op)) { + MlirOperation owner = mlirBlockGetParentOperation(block); + auto parent = PyOperation::forOperation(ctx, owner); + return PyInsertionPoint(PyBlock(parent, block)); + } + + return PyInsertionPoint(PyOperation::forOperation(ctx, op)); + } + +private: + MlirRewriterBase base; + PyMlirContextRef ctx; +}; + #if MLIR_ENABLE_PDL_IN_PATTERNMATCH static nb::object objectFromPDLValue(MlirPDLValue value) { if (MlirValue v = mlirPDLValueAsValue(value); !mlirValueIsNull(v)) @@ -84,7 +108,8 @@ public: void *userData) -> MlirLogicalResult { nb::handle f = nb::handle(static_cast<PyObject *>(userData)); return logicalResultFromObject( - f(rewriter, results, objectsFromPDLValues(nValues, values))); + f(PyPatternRewriter(rewriter), results, + objectsFromPDLValues(nValues, values))); }, fn.ptr()); } @@ -98,7 +123,8 @@ public: void *userData) -> MlirLogicalResult { nb::handle f = nb::handle(static_cast<PyObject *>(userData)); return logicalResultFromObject( - f(rewriter, results, objectsFromPDLValues(nValues, values))); + f(PyPatternRewriter(rewriter), results, + objectsFromPDLValues(nValues, values))); }, fn.ptr()); } @@ -143,7 +169,9 @@ private: /// Create the `mlir.rewrite` here. 
void mlir::python::populateRewriteSubmodule(nb::module_ &m) { - nb::class_<MlirPatternRewriter>(m, "PatternRewriter"); + nb::class_<PyPatternRewriter>(m, "PatternRewriter") + .def_prop_ro("ip", &PyPatternRewriter::getInsertionPoint, + "The current insertion point of the PatternRewriter."); //---------------------------------------------------------------------------- // Mapping of the PDLResultList and PDLModule //---------------------------------------------------------------------------- diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp index 0d56259..c15a73b 100644 --- a/mlir/lib/CAPI/Transforms/Rewrite.cpp +++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp @@ -70,6 +70,17 @@ MlirBlock mlirRewriterBaseGetBlock(MlirRewriterBase rewriter) { return wrap(unwrap(rewriter)->getBlock()); } +MlirOperation +mlirRewriterBaseGetOperationAfterInsertion(MlirRewriterBase rewriter) { + mlir::RewriterBase *base = unwrap(rewriter); + mlir::Block *block = base->getInsertionBlock(); + mlir::Block::iterator it = base->getInsertionPoint(); + if (it == block->end()) + return {nullptr}; + + return wrap(std::addressof(*it)); +} + //===----------------------------------------------------------------------===// /// Block and operation creation/insertion/cloning //===----------------------------------------------------------------------===// @@ -317,6 +328,10 @@ inline MlirPatternRewriter wrap(mlir::PatternRewriter *rewriter) { return {rewriter}; } +MlirRewriterBase mlirPatternRewriterAsBase(MlirPatternRewriter rewriter) { + return wrap(static_cast<mlir::RewriterBase *>(unwrap(rewriter))); +} + //===----------------------------------------------------------------------===// /// PDLPatternModule API //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp b/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp index 7b9ed1d..79c2f23 100644 --- a/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp +++ b/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp @@ -363,7 +363,7 @@ static TypedValue<VectorType> storeTile(PatternRewriter &rewriter, } struct ContractionToAMX : public OpRewritePattern<vector::ContractionOp> { - using OpRewritePattern::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ContractionOp contractOp, PatternRewriter &rewriter) const override { diff --git a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp index 4e1da39..363685a 100644 --- a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp +++ b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp @@ -44,7 +44,7 @@ namespace { /// arm_sme.tile_load ... layout<vertical> struct TransferReadToArmSMELowering : public OpRewritePattern<vector::TransferReadOp> { - using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransferReadOp transferReadOp, PatternRewriter &rewriter) const final { @@ -120,7 +120,7 @@ struct TransferReadToArmSMELowering /// : memref<?x?xi8>, vector<[16]x[16]xi8> struct TransferWriteToArmSMELowering : public OpRewritePattern<vector::TransferWriteOp> { - using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp, PatternRewriter &rewriter) const final { @@ -157,7 +157,7 @@ struct TransferWriteToArmSMELowering /// Conversion pattern for vector.load. 
struct VectorLoadToArmSMELowering : public OpRewritePattern<vector::LoadOp> { - using OpRewritePattern<vector::LoadOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::LoadOp load, PatternRewriter &rewriter) const override { @@ -173,7 +173,7 @@ struct VectorLoadToArmSMELowering : public OpRewritePattern<vector::LoadOp> { /// Conversion pattern for vector.store. struct VectorStoreToArmSMELowering : public OpRewritePattern<vector::StoreOp> { - using OpRewritePattern<vector::StoreOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::StoreOp store, PatternRewriter &rewriter) const override { @@ -208,7 +208,7 @@ struct VectorStoreToArmSMELowering : public OpRewritePattern<vector::StoreOp> { /// Supports scalar, 0-d vector, and 1-d vector broadcasts. struct BroadcastOpToArmSMELowering : public OpRewritePattern<vector::BroadcastOp> { - using OpRewritePattern<vector::BroadcastOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::BroadcastOp broadcastOp, PatternRewriter &rewriter) const final { @@ -279,7 +279,7 @@ struct BroadcastOpToArmSMELowering /// implementation, perhaps with tile <-> vector (MOVA) ops. struct TransposeOpToArmSMELowering : public OpRewritePattern<vector::TransposeOp> { - using OpRewritePattern<vector::TransposeOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransposeOp transposeOp, PatternRewriter &rewriter) const final { @@ -372,7 +372,7 @@ struct TransposeOpToArmSMELowering struct VectorOuterProductToArmSMELowering : public OpRewritePattern<vector::OuterProductOp> { - using OpRewritePattern<vector::OuterProductOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::OuterProductOp outerProductOp, PatternRewriter &rewriter) const override { @@ -451,7 +451,7 @@ struct VectorOuterProductToArmSMELowering /// ``` struct VectorExtractToArmSMELowering : public OpRewritePattern<vector::ExtractOp> { - using OpRewritePattern<vector::ExtractOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ExtractOp extractOp, PatternRewriter &rewriter) const override { @@ -507,7 +507,7 @@ struct VectorExtractToArmSMELowering /// ``` struct VectorInsertToArmSMELowering : public OpRewritePattern<vector::InsertOp> { - using OpRewritePattern<vector::InsertOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::InsertOp insertOp, PatternRewriter &rewriter) const override { @@ -568,7 +568,7 @@ struct VectorInsertToArmSMELowering /// } /// ``` struct VectorPrintToArmSMELowering : public OpRewritePattern<vector::PrintOp> { - using OpRewritePattern<vector::PrintOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::PrintOp printOp, PatternRewriter &rewriter) const override { @@ -623,7 +623,7 @@ struct VectorPrintToArmSMELowering : public OpRewritePattern<vector::PrintOp> { /// ``` struct FoldTransferWriteOfExtractTileSlice : public OpRewritePattern<vector::TransferWriteOp> { - using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp, PatternRewriter &rewriter) const final { @@ -679,7 +679,7 @@ struct FoldTransferWriteOfExtractTileSlice /// ``` struct ExtractFromCreateMaskToPselLowering : public OpRewritePattern<vector::ExtractOp> { - using OpRewritePattern<vector::ExtractOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ExtractOp extractOp, PatternRewriter &rewriter) const 
override { @@ -734,7 +734,7 @@ struct ExtractFromCreateMaskToPselLowering // Convert all `vector.splat` to `vector.broadcast`. There is a path from // `vector.broadcast` to ArmSME via another pattern. struct ConvertSplatToBroadcast : public OpRewritePattern<vector::SplatOp> { - using OpRewritePattern<vector::SplatOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::SplatOp splatOp, PatternRewriter &rewriter) const final { diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp index d6a2622..9843435 100644 --- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp +++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp @@ -386,7 +386,7 @@ namespace { // to MMA matmul. struct PrepareContractToGPUMMA : public OpRewritePattern<vector::ContractionOp> { - using OpRewritePattern<vector::ContractionOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ContractionOp op, PatternRewriter &rewriter) const override { @@ -450,7 +450,7 @@ struct PrepareContractToGPUMMA // Shared Memory to registers. struct CombineTransferReadOpTranspose final : public OpRewritePattern<vector::TransposeOp> { - using OpRewritePattern<vector::TransposeOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransposeOp op, PatternRewriter &rewriter) const override { diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index e0b1a88..5461646 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1342,7 +1342,7 @@ struct VectorScalableExtractOpLowering /// ``` class VectorFMAOpNDRewritePattern : public OpRewritePattern<FMAOp> { public: - using OpRewritePattern<FMAOp>::OpRewritePattern; + using Base::Base; void initialize() { // This pattern recursively unpacks one dimension at a time. 
The recursion @@ -2127,7 +2127,7 @@ FailureOr<Value> ContractionOpToMatmulOpLowering::matchAndRewriteMaskableOp( class TransposeOpToMatrixTransposeOpLowering : public OpRewritePattern<vector::TransposeOp> { public: - using OpRewritePattern<TransposeOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransposeOp op, PatternRewriter &rewriter) const override { diff --git a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp index 5061a44..311ff6f 100644 --- a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp +++ b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp @@ -838,7 +838,7 @@ struct VectorStoreOpConverter final struct VectorReductionToIntDotProd final : OpRewritePattern<vector::ReductionOp> { - using OpRewritePattern::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ReductionOp op, PatternRewriter &rewriter) const override { diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index 9f5585a..e2c7d80 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -475,7 +475,7 @@ static LogicalResult lowerToScatteredStoreOp(vector::TransferWriteOp writeOp, } struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> { - using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransferReadOp readOp, PatternRewriter &rewriter) const override { @@ -546,7 +546,7 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> { struct TransferWriteLowering : public OpRewritePattern<vector::TransferWriteOp> { - using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp, PatternRewriter &rewriter) const override { @@ -597,7 +597,7 @@ struct TransferWriteLowering }; struct GatherLowering : public OpRewritePattern<vector::GatherOp> { - using OpRewritePattern<vector::GatherOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::GatherOp gatherOp, PatternRewriter &rewriter) const override { @@ -632,7 +632,7 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> { }; struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> { - using OpRewritePattern<vector::ScatterOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ScatterOp scatterOp, PatternRewriter &rewriter) const override { @@ -662,7 +662,7 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> { }; struct LoadLowering : public OpRewritePattern<vector::LoadOp> { - using OpRewritePattern<vector::LoadOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::LoadOp loadOp, PatternRewriter &rewriter) const override { @@ -694,7 +694,7 @@ struct LoadLowering : public OpRewritePattern<vector::LoadOp> { }; struct StoreLowering : public OpRewritePattern<vector::StoreOp> { - using OpRewritePattern<vector::StoreOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::StoreOp storeOp, PatternRewriter &rewriter) const override { @@ -727,7 +727,7 @@ struct StoreLowering : public OpRewritePattern<vector::StoreOp> { }; struct ContractionLowering : public OpRewritePattern<vector::ContractionOp> { - using OpRewritePattern<vector::ContractionOp>::OpRewritePattern; + using Base::Base; 
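// Note on the recurring change above: OpRewritePattern<SourceOp> is assumed
// to declare a member alias `using Base = OpRewritePattern<SourceOp>;`, so
// `using Base::Base;` inherits exactly the constructors that the longer
// `using OpRewritePattern<SourceOp>::OpRewritePattern;` spelling did, while
// naming the base class only once per pattern.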
LogicalResult matchAndRewrite(vector::ContractionOp contractOp, PatternRewriter &rewriter) const override { diff --git a/mlir/test/python/integration/dialects/pdl.py b/mlir/test/python/integration/dialects/pdl.py index c8e6197..fe27dd4 100644 --- a/mlir/test/python/integration/dialects/pdl.py +++ b/mlir/test/python/integration/dialects/pdl.py @@ -16,6 +16,7 @@ def construct_and_print_in_module(f): print(module) return f + def get_pdl_patterns(): # Create a rewrite from add to mul. This will match # - operation name is arith.addi @@ -121,8 +122,10 @@ def load_myint_dialect(): # This PDL pattern is to fold constant additions, -# i.e. add(constant0, constant1) -> constant2 -# where constant2 = constant0 + constant1. +# via two patterns: +# 1. add(constant0, constant1) -> constant2 +# where constant2 = constant0 + constant1; +# 2. add(x, 0) or add(0, x) -> x. def get_pdl_pattern_fold(): m = Module.create() i32 = IntegerType.get_signless(32) @@ -237,3 +240,87 @@ def test_pdl_register_function_constraint(module_): apply_patterns_and_fold_greedily(module_, frozen) return module_ + + +# This pattern expands a constant into additions +# unless the constant is no more than 1, +# e.g. 3 -> 1 + 2 -> 1 + (1 + 1). +def get_pdl_pattern_expand(): + m = Module.create() + i32 = IntegerType.get_signless(32) + with InsertionPoint(m.body): + + @pdl.pattern(benefit=1, sym_name="myint_constant_expand") + def pat(): + t = pdl.TypeOp(i32) + cst = pdl.AttributeOp() + pdl.apply_native_constraint([], "is_one", [cst]) + op0 = pdl.OperationOp( + name="myint.constant", attributes={"value": cst}, types=[t] + ) + + @pdl.rewrite() + def rew(): + expanded = pdl.apply_native_rewrite( + [pdl.OperationType.get()], "expand", [cst] + ) + pdl.ReplaceOp(op0, with_op=expanded) + + def is_one(rewriter, results, values): + cst = values[0].value + return cst <= 1 + + def expand(rewriter, results, values): + cst = values[0].value + c1 = cst // 2 + c2 = cst - c1 + with rewriter.ip: + op1 = Operation.create( + "myint.constant", + results=[i32], + attributes={"value": IntegerAttr.get(i32, c1)}, + ) + op2 = Operation.create( + "myint.constant", + results=[i32], + attributes={"value": IntegerAttr.get(i32, c2)}, + ) + res = Operation.create( + "myint.add", results=[i32], operands=[op1.result, op2.result] + ) + results.append(res) + + pdl_module = PDLModule(m) + pdl_module.register_constraint_function("is_one", is_one) + pdl_module.register_rewrite_function("expand", expand) + return pdl_module.freeze() + + +# CHECK-LABEL: TEST: test_pdl_register_function_expand +# CHECK: %0 = "myint.constant"() {value = 1 : i32} : () -> i32 +# CHECK: %1 = "myint.constant"() {value = 1 : i32} : () -> i32 +# CHECK: %2 = "myint.add"(%0, %1) : (i32, i32) -> i32 +# CHECK: %3 = "myint.constant"() {value = 1 : i32} : () -> i32 +# CHECK: %4 = "myint.constant"() {value = 1 : i32} : () -> i32 +# CHECK: %5 = "myint.constant"() {value = 1 : i32} : () -> i32 +# CHECK: %6 = "myint.add"(%4, %5) : (i32, i32) -> i32 +# CHECK: %7 = "myint.add"(%3, %6) : (i32, i32) -> i32 +# CHECK: %8 = "myint.add"(%2, %7) : (i32, i32) -> i32 +# CHECK: return %8 : i32 +@construct_and_print_in_module +def test_pdl_register_function_expand(module_): + load_myint_dialect() + + module_ = Module.parse( + """ + func.func @f() -> i32 { + %0 = "myint.constant"() { value = 5 }: () -> (i32) + return %0 : i32 + } + """ + ) + + frozen = get_pdl_pattern_expand() + apply_patterns_and_fold_greedily(module_, frozen) + + return module_
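For intuition, the `expand` rewrite above halves a constant and recurses via the greedy driver: each `myint.constant` it creates is matched again until every summand is 1. A minimal standalone sketch of that decomposition (plain C++, no MLIR; the function name is illustrative):

```cpp
#include <cstdio>

// Expand n into nested additions of 1s by the same halving rule the PDL
// rewrite uses: c1 = n / 2, c2 = n - c1, recursing while a part exceeds 1.
void expand(int n) {
  if (n <= 1) {
    std::printf("%d", n);
    return;
  }
  int c1 = n / 2;  // first summand
  int c2 = n - c1; // second summand (absorbs the remainder for odd n)
  std::printf("(");
  expand(c1);
  std::printf(" + ");
  expand(c2);
  std::printf(")");
}

int main() {
  expand(5); // mirrors the test input %0 = "myint.constant"() { value = 5 }
  std::printf("\n");
}
```

For the test input 5 this prints `((1 + 1) + (1 + (1 + 1)))`, matching the CHECK lines: 5 splits into 2 + 3, and 3 into 1 + 2.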
diff --git a/orc-rt/include/orc-rt/SPSWrapperFunction.h b/orc-rt/include/orc-rt/SPSWrapperFunction.h index 14a3d8e..3ed3295 100644 --- a/orc-rt/include/orc-rt/SPSWrapperFunction.h +++ b/orc-rt/include/orc-rt/SPSWrapperFunction.h @@ -57,8 +57,8 @@ private: template <typename... Ts> using DeserializableTuple_t = typename DeserializableTuple<Ts...>::type; - template <typename T> static T fromSerializable(T &&Arg) noexcept { - return Arg; + template <typename T> static T &&fromSerializable(T &&Arg) noexcept { + return std::forward<T>(Arg); } static Error fromSerializable(SPSSerializableError Err) noexcept { @@ -86,7 +86,10 @@ public: decltype(Args)>::deserialize(IB, Args)) return std::nullopt; return std::apply( - [](auto &&...A) { return ArgTuple(fromSerializable(A)...); }, + [](auto &&...A) { + return std::optional<ArgTuple>(std::in_place, + std::move(fromSerializable(A))...); + }, std::move(Args)); } }; diff --git a/orc-rt/include/orc-rt/WrapperFunction.h b/orc-rt/include/orc-rt/WrapperFunction.h index ca165db..47e770f 100644 --- a/orc-rt/include/orc-rt/WrapperFunction.h +++ b/orc-rt/include/orc-rt/WrapperFunction.h @@ -111,7 +111,23 @@ struct WFHandlerTraitsImpl { static_assert(std::is_void_v<RetT>, "Async wrapper function handler must return void"); typedef ReturnT YieldType; - typedef std::tuple<ArgTs...> ArgTupleType; + typedef std::tuple<std::decay_t<ArgTs>...> ArgTupleType; + + // Forwards arguments based on the parameter types of the handler. + template <typename FnT> class ForwardArgsAsRequested { + public: + ForwardArgsAsRequested(FnT &&Fn) : Fn(std::move(Fn)) {} + void operator()(ArgTs &...Args) { Fn(std::forward<ArgTs>(Args)...); } + + private: + FnT Fn; + }; + + template <typename FnT> + static ForwardArgsAsRequested<std::decay_t<FnT>> + forwardArgsAsRequested(FnT &&Fn) { + return ForwardArgsAsRequested<std::decay_t<FnT>>(std::forward<FnT>(Fn)); + } }; template <typename C> @@ -244,10 +260,11 @@ struct WrapperFunction { if (auto Args = S.arguments().template deserialize<ArgTuple>(std::move(ArgBytes))) - std::apply(bind_front(std::forward<Handler>(H), - detail::StructuredYield<RetTupleType, Serializer>( - Session, CallCtx, Return, std::move(S))), - std::move(*Args)); + std::apply(HandlerTraits::forwardArgsAsRequested(bind_front( + std::forward<Handler>(H), + detail::StructuredYield<RetTupleType, Serializer>( + Session, CallCtx, Return, std::move(S)))), + *Args); else Return(Session, CallCtx, WrapperFunctionBuffer::createOutOfBandError( diff --git a/orc-rt/unittests/CMakeLists.txt b/orc-rt/unittests/CMakeLists.txt index 54c453d..4d3da68 100644 --- a/orc-rt/unittests/CMakeLists.txt +++ b/orc-rt/unittests/CMakeLists.txt @@ -15,7 +15,6 @@ add_orc_rt_unittest(CoreTests AllocActionTest.cpp BitmaskEnumTest.cpp CallableTraitsHelperTest.cpp - CommonTestUtils.cpp ErrorTest.cpp ExecutorAddressTest.cpp IntervalMapTest.cpp diff --git a/orc-rt/unittests/CommonTestUtils.cpp b/orc-rt/unittests/CommonTestUtils.cpp deleted file mode 100644 index d9f9433..0000000 --- a/orc-rt/unittests/CommonTestUtils.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===- CommonTestUtils.cpp ------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Common test utilities.
-// -//===----------------------------------------------------------------------===// - -#include "CommonTestUtils.h" - -size_t OpCounter::DefaultConstructions = 0; -size_t OpCounter::CopyConstructions = 0; -size_t OpCounter::CopyAssignments = 0; -size_t OpCounter::MoveConstructions = 0; -size_t OpCounter::MoveAssignments = 0; -size_t OpCounter::Destructions = 0; diff --git a/orc-rt/unittests/CommonTestUtils.h b/orc-rt/unittests/CommonTestUtils.h index 5ff2c8e..1c66bdd 100644 --- a/orc-rt/unittests/CommonTestUtils.h +++ b/orc-rt/unittests/CommonTestUtils.h @@ -11,7 +11,7 @@ #include <cstddef> -class OpCounter { +template <size_t Idx = 0> class OpCounter { public: OpCounter() { ++DefaultConstructions; } OpCounter(const OpCounter &Other) { ++CopyConstructions; } @@ -57,4 +57,11 @@ private: static size_t Destructions; }; +template <size_t Idx> size_t OpCounter<Idx>::DefaultConstructions = 0; +template <size_t Idx> size_t OpCounter<Idx>::CopyConstructions = 0; +template <size_t Idx> size_t OpCounter<Idx>::CopyAssignments = 0; +template <size_t Idx> size_t OpCounter<Idx>::MoveConstructions = 0; +template <size_t Idx> size_t OpCounter<Idx>::MoveAssignments = 0; +template <size_t Idx> size_t OpCounter<Idx>::Destructions = 0; + #endif // ORC_RT_UNITTEST_COMMONTESTUTILS_H diff --git a/orc-rt/unittests/SPSWrapperFunctionTest.cpp b/orc-rt/unittests/SPSWrapperFunctionTest.cpp index c0c86ff..32aaa61 100644 --- a/orc-rt/unittests/SPSWrapperFunctionTest.cpp +++ b/orc-rt/unittests/SPSWrapperFunctionTest.cpp @@ -10,6 +10,8 @@ // //===----------------------------------------------------------------------===// +#include "CommonTestUtils.h" + #include "orc-rt/SPSWrapperFunction.h" #include "orc-rt/WrapperFunction.h" #include "orc-rt/move_only_function.h" @@ -218,3 +220,80 @@ TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningExpectedFailureCase) { EXPECT_EQ(ErrMsg, "N is not a multiple of 2"); } + +template <size_t N> struct SPSOpCounter {}; + +namespace orc_rt { +template <size_t N> +class SPSSerializationTraits<SPSOpCounter<N>, OpCounter<N>> { +public: + static size_t size(const OpCounter<N> &O) { return 0; } + static bool serialize(SPSOutputBuffer &OB, const OpCounter<N> &O) { + return true; + } + static bool deserialize(SPSInputBuffer &OB, OpCounter<N> &O) { return true; } +}; +} // namespace orc_rt + +static void +handle_with_reference_types_sps_wrapper(orc_rt_SessionRef Session, + void *CallCtx, + orc_rt_WrapperFunctionReturn Return, + orc_rt_WrapperFunctionBuffer ArgBytes) { + SPSWrapperFunction<void( + SPSOpCounter<0>, SPSOpCounter<1>, SPSOpCounter<2>, + SPSOpCounter<3>)>::handle(Session, CallCtx, Return, ArgBytes, + [](move_only_function<void()> Return, + OpCounter<0>, OpCounter<1> &, + const OpCounter<2> &, + OpCounter<3> &&) { Return(); }); +} + +TEST(SPSWrapperFunctionUtilsTest, TestHandlerWithReferences) { + // Test that we can handle by-value, by-ref, by-const-ref, and by-rvalue-ref + // arguments, and that we generate the expected number of moves. 
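// OpCounter<Idx> (templated in CommonTestUtils.h above) keeps an independent
// set of static construction/copy/move/destruction counters per Idx, which is
// what lets the four arguments below be tracked separately. The empty
// SPSSerializationTraits specialization above serializes each OpCounter<N> as
// zero bytes, so the moves and copies asserted below come from argument
// forwarding, not from encoding the payload.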
+ OpCounter<0>::reset(); + OpCounter<1>::reset(); + OpCounter<2>::reset(); + OpCounter<3>::reset(); + + bool DidRun = false; + SPSWrapperFunction<void(SPSOpCounter<0>, SPSOpCounter<1>, SPSOpCounter<2>, + SPSOpCounter<3>)>:: + call( + DirectCaller(nullptr, handle_with_reference_types_sps_wrapper), + [&](Error R) { + cantFail(std::move(R)); + DidRun = true; + }, + OpCounter<0>(), OpCounter<1>(), OpCounter<2>(), OpCounter<3>()); + + EXPECT_TRUE(DidRun); + + // We expect two default constructions for each parameter: one for the + // argument to call, and one for the object to deserialize into. + EXPECT_EQ(OpCounter<0>::defaultConstructions(), 2U); + EXPECT_EQ(OpCounter<1>::defaultConstructions(), 2U); + EXPECT_EQ(OpCounter<2>::defaultConstructions(), 2U); + EXPECT_EQ(OpCounter<3>::defaultConstructions(), 2U); + + // Pass-by-value: we expect two moves (one for SPS transparent conversion, + // one to move the value into the by-value parameter), and no copies. + EXPECT_EQ(OpCounter<0>::moves(), 2U); + EXPECT_EQ(OpCounter<0>::copies(), 0U); + + // Pass-by-lvalue-reference: we expect one move (for SPS transparent + // conversion), no copies. + EXPECT_EQ(OpCounter<1>::moves(), 1U); + EXPECT_EQ(OpCounter<1>::copies(), 0U); + + // Pass-by-const-lvalue-reference: we expect one move (for SPS transparent + // conversion), no copies. + EXPECT_EQ(OpCounter<2>::moves(), 1U); + EXPECT_EQ(OpCounter<2>::copies(), 0U); + + // Pass-by-rvalue-reference: we expect one move (for SPS transparent + // conversion), no copies. + EXPECT_EQ(OpCounter<3>::moves(), 1U); + EXPECT_EQ(OpCounter<3>::copies(), 0U); +} diff --git a/orc-rt/unittests/bind-test.cpp b/orc-rt/unittests/bind-test.cpp index bfaef4e..93a61e63 100644 --- a/orc-rt/unittests/bind-test.cpp +++ b/orc-rt/unittests/bind-test.cpp @@ -47,28 +47,28 @@ TEST(BindTest, LambdaCapture) { } TEST(BindTest, MinimalMoves) { - OpCounter::reset(); + OpCounter<>::reset(); { - auto B = bind_front([](OpCounter &O, int) {}, OpCounter()); + auto B = bind_front([](OpCounter<> &O, int) {}, OpCounter<>()); B(0); } - EXPECT_EQ(OpCounter::defaultConstructions(), 1U); - EXPECT_EQ(OpCounter::copies(), 0U); - EXPECT_EQ(OpCounter::moves(), 1U); - EXPECT_EQ(OpCounter::destructions(), 2U); + EXPECT_EQ(OpCounter<>::defaultConstructions(), 1U); + EXPECT_EQ(OpCounter<>::copies(), 0U); + EXPECT_EQ(OpCounter<>::moves(), 1U); + EXPECT_EQ(OpCounter<>::destructions(), 2U); } TEST(BindTest, MinimalCopies) { - OpCounter::reset(); + OpCounter<>::reset(); { - OpCounter O; - auto B = bind_front([](OpCounter &O, int) {}, O); + OpCounter<> O; + auto B = bind_front([](OpCounter<> &O, int) {}, O); B(0); } - EXPECT_EQ(OpCounter::defaultConstructions(), 1U); - EXPECT_EQ(OpCounter::copies(), 1U); - EXPECT_EQ(OpCounter::moves(), 0U); - EXPECT_EQ(OpCounter::destructions(), 2U); + EXPECT_EQ(OpCounter<>::defaultConstructions(), 1U); + EXPECT_EQ(OpCounter<>::copies(), 1U); + EXPECT_EQ(OpCounter<>::moves(), 0U); + EXPECT_EQ(OpCounter<>::destructions(), 2U); } TEST(BindTest, ForwardUnboundArgs) { diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 026664b..e91e7c8 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2157,24 +2157,13 @@ libc_function( ########################### math support library ############################### libc_support_library( - name = "common_constants", - srcs = ["src/math/generic/common_constants.cpp"], - hdrs = ["src/math/generic/common_constants.h"], - deps = [
- ":__support_math_acosh_float_constants", - ":__support_math_exp_constants", - ":__support_number_pair", - ], -) - -libc_support_library( name = "log_range_reduction", hdrs = ["src/math/generic/log_range_reduction.h"], deps = [ ":__support_common", ":__support_fputil_dyadic_float", + ":__support_math_common_constants", ":__support_uint128", - ":common_constants", ], ) @@ -2189,8 +2178,8 @@ libc_support_library( ":__support_fputil_polyeval", ":__support_fputil_rounding_mode", ":__support_macros_optimization", + ":__support_math_common_constants", ":__support_math_exp10f_utils", - ":common_constants", ], ) @@ -2558,6 +2547,16 @@ libc_support_library( ) libc_support_library( + name = "__support_math_common_constants", + hdrs = ["src/__support/math/common_constants.h"], + deps = [ + ":__support_math_acosh_float_constants", + ":__support_math_exp_constants", + ":__support_number_pair", + ], +) + +libc_support_library( name = "__support_math_cos", hdrs = ["src/__support/math/cos.h"], deps = [ @@ -2632,8 +2631,8 @@ libc_support_library( ":__support_fputil_polyeval", ":__support_fputil_rounding_mode", ":__support_macros_optimization", + ":__support_math_common_constants", ":__support_sincosf_utils", - ":common_constants", ], ) @@ -2879,6 +2878,24 @@ libc_support_library( ) libc_support_library( + name = "__support_math_exp2", + hdrs = ["src/__support/math/exp2.h"], + deps = [ + ":__support_fputil_double_double", + ":__support_fputil_dyadic_float", + ":__support_fputil_multiply_add", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_fputil_rounding_mode", + ":__support_fputil_triple_double", + ":__support_integer_literals", + ":__support_macros_optimization", + ":__support_math_common_constants", + ":__support_math_exp_utils", + ], +) + +libc_support_library( name = "__support_math_exp10", hdrs = ["src/__support/math/exp10.h"], deps = [ @@ -3652,17 +3669,7 @@ libc_math_function( libc_math_function( name = "exp2", additional_deps = [ - ":__support_fputil_double_double", - ":__support_fputil_dyadic_float", - ":__support_fputil_multiply_add", - ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", - ":__support_fputil_rounding_mode", - ":__support_fputil_triple_double", - ":__support_integer_literals", - ":__support_macros_optimization", - ":__support_math_exp_utils", - ":common_constants", + ":__support_math_exp2", ], ) @@ -3706,7 +3713,7 @@ libc_math_function( ":__support_fputil_triple_double", ":__support_integer_literals", ":__support_macros_optimization", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -3720,7 +3727,7 @@ libc_math_function( ":__support_fputil_rounding_mode", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4233,7 +4240,7 @@ libc_math_function( ":__support_integer_literals", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ":log_range_reduction", ], ) @@ -4246,7 +4253,7 @@ libc_math_function( ":__support_fputil_polyeval", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4268,7 +4275,7 @@ libc_math_function( ":__support_integer_literals", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ":log_range_reduction", ], ) @@ -4281,7 +4288,7 @@ 
libc_math_function( ":__support_fputil_polyeval", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4303,7 +4310,7 @@ libc_math_function( ":__support_integer_literals", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4315,7 +4322,7 @@ libc_math_function( ":__support_fputil_polyeval", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4330,7 +4337,7 @@ libc_math_function( ":__support_integer_literals", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ":log_range_reduction", ], ) @@ -4342,7 +4349,7 @@ libc_math_function( ":__support_fputil_multiply_add", ":__support_fputil_polyeval", ":__support_macros_optimization", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4488,7 +4495,7 @@ libc_math_function( ":__support_fputil_nearest_integer", ":__support_fputil_polyeval", ":__support_fputil_sqrt", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4503,7 +4510,7 @@ libc_math_function( ":__support_fputil_triple_double", ":__support_macros_optimization", ":__support_math_exp10f", - ":common_constants", + ":__support_math_common_constants", ":exp2f_impl", ], ) @@ -4664,7 +4671,7 @@ libc_math_function( ":__support_fputil_rounding_mode", ":__support_macros_optimization", ":__support_math_sinhfcoshf_utils", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4771,7 +4778,7 @@ libc_math_function( ":__support_macros_optimization", ":__support_macros_properties_cpu_features", ":__support_math_exp10f_utils", - ":common_constants", + ":__support_math_common_constants", ], )
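Stepping back to the orc-rt change above: the `ForwardArgsAsRequested` shim in WrapperFunction.h stores deserialized arguments as decayed values and re-casts each one to the value category the handler's signature declares, which is exactly what the move/copy counts in the new test pin down. A self-contained sketch of the same trick (illustrative names, not the orc-rt API):

```cpp
#include <cstdio>
#include <tuple>
#include <utility>

struct Probe {
  Probe() = default;
  Probe(const Probe &) { std::puts("copy"); }
  Probe(Probe &&) noexcept { std::puts("move"); }
};

// Call Fn with each stored lvalue re-cast to the category named in ArgTs:
// a plain T parameter is move-constructed from storage, while T&, const T&,
// and T&& parameters bind directly to the stored object.
template <typename... ArgTs, typename FnT>
void callAsRequested(FnT &&Fn, std::tuple<std::decay_t<ArgTs>...> &Args) {
  std::apply(
      [&](auto &...A) { std::forward<FnT>(Fn)(std::forward<ArgTs>(A)...); },
      Args);
}

int main() {
  std::tuple<Probe, Probe, Probe> Args; // stands in for deserialized storage
  // The handler asks for (by-value, const lvalue-ref, rvalue-ref): expect
  // exactly one "move" and no "copy".
  callAsRequested<Probe, const Probe &, Probe &&>(
      [](Probe, const Probe &, Probe &&) {}, Args);
}
```

Run, this prints a single `move`: only the by-value parameter is constructed, while both reference parameters bind without any copy or move, mirroring the counts asserted in TestHandlerWithReferences.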