aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--clang/docs/ReleaseNotes.rst3
-rw-r--r--clang/lib/CodeGen/CGExprScalar.cpp7
-rw-r--r--clang/lib/Sema/SemaChecking.cpp3
-rw-r--r--clang/lib/Sema/SemaConcept.cpp29
-rw-r--r--clang/lib/Sema/SemaInit.cpp6
-rw-r--r--clang/lib/Sema/SemaTemplateDeduction.cpp4
-rw-r--r--clang/test/Parser/cxx0x-lambda-expressions.cpp2
-rw-r--r--clang/test/SemaCXX/builtin-assume-aligned.cpp13
-rw-r--r--clang/test/SemaCXX/diagnose_if.cpp27
-rw-r--r--clang/test/SemaCXX/lambda-expressions.cpp30
-rw-r--r--clang/test/SemaTemplate/concepts.cpp71
-rw-r--r--libc/shared/math.h1
-rw-r--r--libc/shared/math/exp2.h23
-rw-r--r--libc/src/__support/math/CMakeLists.txt31
-rw-r--r--libc/src/__support/math/common_constants.h (renamed from libc/src/math/generic/common_constants.cpp)47
-rw-r--r--libc/src/__support/math/exp2.h425
-rw-r--r--libc/src/math/generic/CMakeLists.txt56
-rw-r--r--libc/src/math/generic/common_constants.h73
-rw-r--r--libc/src/math/generic/exp2.cpp398
-rw-r--r--libc/src/math/generic/expm1.cpp5
-rw-r--r--libc/src/math/generic/expm1f.cpp3
-rw-r--r--libc/src/math/generic/log.cpp4
-rw-r--r--libc/src/math/generic/log10.cpp5
-rw-r--r--libc/src/math/generic/log10f.cpp3
-rw-r--r--libc/src/math/generic/log1p.cpp4
-rw-r--r--libc/src/math/generic/log1pf.cpp4
-rw-r--r--libc/src/math/generic/log2.cpp5
-rw-r--r--libc/src/math/generic/log2f.cpp5
-rw-r--r--libc/src/math/generic/log_range_reduction.h3
-rw-r--r--libc/src/math/generic/logf.cpp3
-rw-r--r--libc/src/math/generic/pow.cpp5
-rw-r--r--libc/src/math/generic/powf.cpp6
-rw-r--r--libc/test/shared/CMakeLists.txt1
-rw-r--r--libc/test/shared/shared_math_test.cpp1
-rw-r--r--llvm/docs/GitHub.rst13
-rw-r--r--llvm/include/llvm/ADT/TypeSwitch.h19
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfo.h6
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfoImpl.h8
-rw-r--r--llvm/include/llvm/MC/MCAsmInfo.h8
-rw-r--r--llvm/include/llvm/MC/MCAsmInfoELF.h2
-rw-r--r--llvm/include/llvm/MC/TargetRegistry.h15
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp4
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp98
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp8
-rw-r--r--llvm/lib/CodeGen/TargetInstrInfo.cpp6
-rw-r--r--llvm/lib/IR/Core.cpp4
-rw-r--r--llvm/lib/MC/MCAsmInfoELF.cpp5
-rw-r--r--llvm/lib/MC/MCELFStreamer.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMInstrCDE.td1
-rw-r--r--llvm/lib/Target/ARM/ARMInstrMVE.td1
-rw-r--r--llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp101
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h2
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp30
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll812
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/machine-combiner-copy.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll47
-rw-r--r--llvm/test/CodeGen/AArch64/peephole-and-tst.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/tbl-loops.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/trampoline.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-copy.ll2
-rw-r--r--llvm/test/CodeGen/ARM/combine-movc-sub.ll12
-rw-r--r--llvm/test/CodeGen/ARM/extract-bits.ll148
-rw-r--r--llvm/test/CodeGen/ARM/extract-lowbits.ll92
-rw-r--r--llvm/test/CodeGen/ARM/llround-conv.ll74
-rw-r--r--llvm/test/CodeGen/ARM/lround-conv.ll46
-rw-r--r--llvm/test/CodeGen/RISCV/pr69586.ll204
-rw-r--r--llvm/test/CodeGen/RISCV/rv64-trampoline.ll7
-rw-r--r--llvm/test/CodeGen/SystemZ/llvm.sincos.ll4
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll6
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll151
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll80
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll13
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-float16regloops.ll82
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-float32regloops.ll100
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-gather-increment.ll278
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-phireg.ll30
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll519
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll22
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll10
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll16
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll92
-rw-r--r--llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll82
-rw-r--r--llvm/test/CodeGen/X86/dag-update-nodetomatch.ll5
-rw-r--r--llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir4
-rw-r--r--llvm/test/CodeGen/X86/inalloca-invoke.ll2
-rw-r--r--llvm/test/CodeGen/X86/licm-regpressure.ll62
-rw-r--r--llvm/test/Transforms/InstCombine/clamp-to-minmax.ll6
-rw-r--r--llvm/test/Transforms/InstCombine/select-gep.ll32
-rw-r--r--llvm/test/Transforms/InstSimplify/domcondition.ll207
-rw-r--r--llvm/tools/llvm-c-test/debuginfo.c7
-rw-r--r--llvm/tools/llvm-mc/llvm-mc.cpp3
-rw-r--r--llvm/unittests/ADT/TypeSwitchTest.cpp28
-rw-r--r--mlir/docs/Tutorials/CreatingADialect.md2
-rw-r--r--mlir/include/mlir-c/Rewrite.h14
-rw-r--r--mlir/lib/Bindings/Python/IRCore.cpp3
-rw-r--r--mlir/lib/Bindings/Python/IRModule.h2
-rw-r--r--mlir/lib/Bindings/Python/Rewrite.cpp34
-rw-r--r--mlir/lib/CAPI/Transforms/Rewrite.cpp15
-rw-r--r--mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp2
-rw-r--r--mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp26
-rw-r--r--mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp4
-rw-r--r--mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp4
-rw-r--r--mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp2
-rw-r--r--mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp14
-rw-r--r--mlir/test/python/integration/dialects/pdl.py91
-rw-r--r--orc-rt/include/orc-rt/SPSWrapperFunction.h9
-rw-r--r--orc-rt/include/orc-rt/WrapperFunction.h27
-rw-r--r--orc-rt/unittests/CMakeLists.txt1
-rw-r--r--orc-rt/unittests/CommonTestUtils.cpp20
-rw-r--r--orc-rt/unittests/CommonTestUtils.h9
-rw-r--r--orc-rt/unittests/SPSWrapperFunctionTest.cpp79
-rw-r--r--orc-rt/unittests/bind-test.cpp26
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel85
115 files changed, 3135 insertions, 2296 deletions
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d2e5bd2..ab536ad 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -250,6 +250,8 @@ Non-comprehensive list of changes in this release
- ``__builtin_assume_dereferenceable`` now accepts non-constant size operands.
+- Fixed a crash when the second argument to ``__builtin_assume_aligned`` was not constant (#GH161314)
+
New Compiler Flags
------------------
- New option ``-fno-sanitize-debug-trap-reasons`` added to disable emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``).
@@ -452,6 +454,7 @@ Bug Fixes to AST Handling
Miscellaneous Bug Fixes
^^^^^^^^^^^^^^^^^^^^^^^
+- Fixed missing diagnostics of ``diagnose_if`` on templates involved in initialization. (#GH160776)
Miscellaneous Clang Crashes Fixed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index f319b17..c961222 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -465,11 +465,16 @@ public:
return nullptr;
if (Value *Result = ConstantEmitter(CGF).tryEmitConstantExpr(E)) {
- if (E->isGLValue())
+ if (E->isGLValue()) {
+ // This was already converted to an rvalue when it was constant
+ // evaluated.
+ if (E->hasAPValueResult() && !E->getAPValueResult().isLValue())
+ return Result;
return CGF.EmitLoadOfScalar(
Address(Result, CGF.convertTypeForLoadStore(E->getType()),
CGF.getContext().getTypeAlignInChars(E->getType())),
/*Volatile*/ false, E->getType(), E->getExprLoc());
+ }
return Result;
}
return Visit(E->getSubExpr());
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 7ce3513..3cc61b1 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5954,6 +5954,9 @@ bool Sema::BuiltinAssumeAligned(CallExpr *TheCall) {
if (Result > Sema::MaximumAlignment)
Diag(TheCall->getBeginLoc(), diag::warn_assume_aligned_too_great)
<< SecondArg->getSourceRange() << Sema::MaximumAlignment;
+
+ TheCall->setArg(1,
+ ConstantExpr::Create(Context, SecondArg, APValue(Result)));
}
if (NumArgs > 2) {
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 8413090..11d2d5c 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -264,14 +264,6 @@ class HashParameterMapping : public RecursiveASTVisitor<HashParameterMapping> {
UnsignedOrNone OuterPackSubstIndex;
- TemplateArgument getPackSubstitutedTemplateArgument(TemplateArgument Arg) {
- assert(*SemaRef.ArgPackSubstIndex < Arg.pack_size());
- Arg = Arg.pack_begin()[*SemaRef.ArgPackSubstIndex];
- if (Arg.isPackExpansion())
- Arg = Arg.getPackExpansionPattern();
- return Arg;
- }
-
bool shouldVisitTemplateInstantiations() const { return true; }
public:
@@ -294,7 +286,7 @@ public:
assert(Arg.getKind() == TemplateArgument::Pack &&
"Missing argument pack");
- Arg = getPackSubstitutedTemplateArgument(Arg);
+ Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg);
}
UsedTemplateArgs.push_back(
@@ -312,7 +304,7 @@ public:
if (NTTP->isParameterPack() && SemaRef.ArgPackSubstIndex) {
assert(Arg.getKind() == TemplateArgument::Pack &&
"Missing argument pack");
- Arg = getPackSubstitutedTemplateArgument(Arg);
+ Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg);
}
UsedTemplateArgs.push_back(
@@ -325,8 +317,11 @@ public:
}
bool TraverseDecl(Decl *D) {
- if (auto *VD = dyn_cast<ValueDecl>(D))
+ if (auto *VD = dyn_cast<ValueDecl>(D)) {
+ if (auto *Var = dyn_cast<VarDecl>(VD))
+ TraverseStmt(Var->getInit());
return TraverseType(VD->getType());
+ }
return inherited::TraverseDecl(D);
}
@@ -363,6 +358,14 @@ public:
return inherited::TraverseTemplateArgument(Arg);
}
+ bool TraverseSizeOfPackExpr(SizeOfPackExpr *SOPE) {
+ return TraverseDecl(SOPE->getPack());
+ }
+
+ bool VisitSubstNonTypeTemplateParmExpr(SubstNonTypeTemplateParmExpr *E) {
+ return inherited::TraverseStmt(E->getReplacement());
+ }
+
void VisitConstraint(const NormalizedConstraintWithParamMapping &Constraint) {
if (!Constraint.hasParameterMapping()) {
for (const auto &List : TemplateArgs)
@@ -2083,8 +2086,8 @@ bool SubstituteParameterMappings::substitute(ConceptIdConstraint &CC) {
/*UpdateArgsWithConversions=*/false))
return true;
auto TemplateArgs = *MLTAL;
- TemplateArgs.replaceOutermostTemplateArguments(
- TemplateArgs.getAssociatedDecl(0).first, CTAI.SugaredConverted);
+ TemplateArgs.replaceOutermostTemplateArguments(CSE->getNamedConcept(),
+ CTAI.SugaredConverted);
return SubstituteParameterMappings(SemaRef, &TemplateArgs, ArgsAsWritten,
InFoldExpr)
.substitute(CC.getNormalizedConstraint());
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 0d0d2c0..922fcac 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -7539,7 +7539,7 @@ PerformConstructorInitialization(Sema &S,
// Only check access if all of that succeeded.
S.CheckConstructorAccess(Loc, Constructor, Step.Function.FoundDecl, Entity);
- if (S.DiagnoseUseOfDecl(Step.Function.FoundDecl, Loc))
+ if (S.DiagnoseUseOfOverloadedDecl(Constructor, Loc))
return ExprError();
if (const ArrayType *AT = S.Context.getAsArrayType(Entity.getType()))
@@ -8092,7 +8092,7 @@ ExprResult InitializationSequence::Perform(Sema &S,
S.CheckConstructorAccess(Kind.getLocation(), Constructor, FoundFn,
Entity);
- if (S.DiagnoseUseOfDecl(FoundFn, Kind.getLocation()))
+ if (S.DiagnoseUseOfOverloadedDecl(Constructor, Kind.getLocation()))
return ExprError();
CastKind = CK_ConstructorConversion;
@@ -8102,7 +8102,7 @@ ExprResult InitializationSequence::Perform(Sema &S,
CXXConversionDecl *Conversion = cast<CXXConversionDecl>(Fn);
S.CheckMemberOperatorAccess(Kind.getLocation(), CurInit.get(), nullptr,
FoundFn);
- if (S.DiagnoseUseOfDecl(FoundFn, Kind.getLocation()))
+ if (S.DiagnoseUseOfOverloadedDecl(Conversion, Kind.getLocation()))
return ExprError();
CurInit = S.BuildCXXMemberCallExpr(CurInit.get(), FoundFn, Conversion,
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 6bba505..3baa977 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -6718,6 +6718,10 @@ struct MarkUsedTemplateParameterVisitor : DynamicRecursiveASTVisitor {
}
return true;
}
+
+ bool TraverseSizeOfPackExpr(SizeOfPackExpr *SOPE) override {
+ return TraverseDecl(SOPE->getPack());
+ }
};
}
diff --git a/clang/test/Parser/cxx0x-lambda-expressions.cpp b/clang/test/Parser/cxx0x-lambda-expressions.cpp
index f90f8ce..5b57c7f 100644
--- a/clang/test/Parser/cxx0x-lambda-expressions.cpp
+++ b/clang/test/Parser/cxx0x-lambda-expressions.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++03 -Wno-c99-designator %s -Wno-c++11-extensions
+// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++03 -Wno-c99-designator %s -Wno-c++11-extensions -Wno-local-type-template-args
// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++11 -Wno-c99-designator %s
// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx17ext,cxx20ext,cxx23ext -std=c++14 -Wno-c99-designator %s
// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx20ext,cxx23ext -std=c++17 -Wno-c99-designator %s
diff --git a/clang/test/SemaCXX/builtin-assume-aligned.cpp b/clang/test/SemaCXX/builtin-assume-aligned.cpp
index 48bd841..30296c7 100644
--- a/clang/test/SemaCXX/builtin-assume-aligned.cpp
+++ b/clang/test/SemaCXX/builtin-assume-aligned.cpp
@@ -47,3 +47,16 @@ constexpr void *s1 = __builtin_assume_aligned(x, 32);
constexpr void *s2 = __builtin_assume_aligned(x, 32, 5);
constexpr void *s3 = __builtin_assume_aligned(x, 32, -1);
+
+constexpr int add(int a, int b) {
+ return a+b;
+}
+constexpr void *c1 = __builtin_assume_aligned(p, add(1,1));
+constexpr void *c2 = __builtin_assume_aligned(p, add(2,1)); // expected-error {{not a power of 2}}
+
+constexpr long kAlignment = 128;
+long AllocateAlignedBytes_payload;
+void AllocateAlignedBytes() {
+ void *m = __builtin_assume_aligned(
+ reinterpret_cast<void *>(AllocateAlignedBytes_payload), kAlignment);
+}
diff --git a/clang/test/SemaCXX/diagnose_if.cpp b/clang/test/SemaCXX/diagnose_if.cpp
index 1b9e660..0af8bb7 100644
--- a/clang/test/SemaCXX/diagnose_if.cpp
+++ b/clang/test/SemaCXX/diagnose_if.cpp
@@ -1,5 +1,7 @@
// RUN: %clang_cc1 %s -verify -fno-builtin -std=c++14
+// RUN: %clang_cc1 %s -verify -fno-builtin -std=c++20 -verify=expected,cxx20
// RUN: %clang_cc1 %s -verify -fno-builtin -std=c++14 -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 %s -verify -fno-builtin -std=c++20 -verify=expected,cxx20 -fexperimental-new-constant-interpreter
#define _diagnose_if(...) __attribute__((diagnose_if(__VA_ARGS__)))
@@ -665,3 +667,28 @@ void run() {
switch (constexpr Foo i = 2) { default: break; } // expected-error{{oh no}}
}
}
+
+namespace GH160776 {
+
+struct ConstructorTemplate {
+ template <class T>
+ explicit ConstructorTemplate(T x)
+ _diagnose_if(sizeof(T) == sizeof(char), "oh no", "error") {} // expected-note {{diagnose_if}}
+
+ template <class T>
+#if __cplusplus >= 202002L
+ requires (sizeof(T) == 1) // cxx20-note {{evaluated to false}}
+#endif
+ operator T() _diagnose_if(sizeof(T) == sizeof(char), "oh no", "error") { // expected-note {{diagnose_if}} \
+ // cxx20-note {{constraints not satisfied}}
+ return T{};
+ }
+};
+
+void run() {
+ ConstructorTemplate x('1'); // expected-error {{oh no}}
+ char y = x; // expected-error {{oh no}}
+ int z = x; // cxx20-error {{no viable conversion}}
+}
+
+}
diff --git a/clang/test/SemaCXX/lambda-expressions.cpp b/clang/test/SemaCXX/lambda-expressions.cpp
index 8ea8e32..f9d7cfc 100644
--- a/clang/test/SemaCXX/lambda-expressions.cpp
+++ b/clang/test/SemaCXX/lambda-expressions.cpp
@@ -149,7 +149,8 @@ namespace PR12031 {
void f(int i, X x);
void g() {
const int v = 10;
- f(v, [](){});
+ f(v, [](){}); // cxx03-warning {{template argument uses local type}} \
+ // cxx03-note {{while substituting}}
}
}
@@ -572,26 +573,37 @@ namespace PR27994 {
struct A { template <class T> A(T); };
template <class T>
-struct B {
+struct B { // #PR27994_B
int x;
- A a = [&] { int y = x; };
- A b = [&] { [&] { [&] { int y = x; }; }; };
- A d = [&](auto param) { int y = x; }; // cxx03-cxx11-error {{'auto' not allowed in lambda parameter}}
- A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; }; // cxx03-cxx11-error 2 {{'auto' not allowed in lambda parameter}}
+ A a = [&] { int y = x; }; // cxx03-warning {{template argument uses unnamed type}} \
+ // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}}
+ A b = [&] { [&] { [&] { int y = x; }; }; }; // cxx03-warning {{template argument uses unnamed type}} \
+ // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}}
+ A d = [&](auto param) { int y = x; }; // cxx03-cxx11-error {{'auto' not allowed in lambda parameter}} \
+ // cxx03-warning {{template argument uses unnamed type}} \
+ // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}}
+ A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; }; // cxx03-cxx11-error 2 {{'auto' not allowed in lambda parameter}} \
+ // cxx03-warning {{template argument uses unnamed type}} \
+ // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}}
};
B<int> b;
+// cxx03-note@#PR27994_B 4{{in instantiation of default member initializer}}
+// cxx03-note@-2 4{{in evaluation of exception}}
template <class T> struct C {
struct D {
+ // cxx03-note@-1 {{in instantiation of default member initializer}}
int x;
- A f = [&] { int y = x; };
+ A f = [&] { int y = x; }; // cxx03-warning {{template argument uses unnamed type}} \
+ // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}}
};
};
int func() {
C<int> a;
decltype(a)::D b;
+ // cxx03-note@-1 {{in evaluation of exception}}
}
}
@@ -606,8 +618,12 @@ struct S1 {
void foo1() {
auto s0 = S1([name=]() {}); // expected-error {{expected expression}}
+ // cxx03-warning@-1 {{template argument uses local type}} \
+ // cxx03-note@-1 {{while substituting deduced template arguments}}
auto s1 = S1([name=name]() {}); // expected-error {{use of undeclared identifier 'name'; did you mean 'name1'?}}
// cxx03-cxx11-warning@-1 {{initialized lambda captures are a C++14 extension}}
+ // cxx03-warning@-2 {{template argument uses local type}} \
+ // cxx03-note@-2 {{while substituting deduced template arguments}}
}
}
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index 6d29f8b..e5e081f 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1333,4 +1333,75 @@ static_assert(__cpp17_iterator<not_move_constructible>); \
// expected-note@#is_move_constructible_v {{because 'is_move_constructible_v<parameter_mapping_regressions::case3::not_move_constructible>' evaluated to false}}
}
+namespace case4 {
+
+template<bool b>
+concept bool_ = b;
+
+template<typename... Ts>
+concept unary = bool_<sizeof...(Ts) == 1>;
+
+static_assert(!unary<>);
+static_assert(unary<void>);
+
+}
+
+namespace case5 {
+
+template<int size>
+concept true1 = size == size;
+
+template<typename... Ts>
+concept true2 = true1<sizeof...(Ts)>;
+
+template<typename... Ts>
+concept true3 = true2<Ts...>;
+
+static_assert(true3<void>);
+
+}
+
+namespace case6 {
+
+namespace std {
+template <int __v>
+struct integral_constant {
+ static const int value = __v;
+};
+
+template <class _Tp, class... _Args>
+constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...);
+
+template <class _From, class _To>
+constexpr bool is_convertible_v = __is_convertible(_From, _To);
+
+template <class>
+struct tuple_size;
+
+template <class _Tp>
+constexpr decltype(sizeof(int)) tuple_size_v = tuple_size<_Tp>::value;
+} // namespace std
+
+template <int N, int X>
+concept FixedExtentConstructibleFromExtent = X == N;
+
+template <int Extent>
+struct span {
+ int static constexpr extent = Extent;
+ template <typename R, int N = std::tuple_size_v<R>>
+ requires(FixedExtentConstructibleFromExtent<extent, N>)
+ span(R);
+};
+
+template <class, int>
+struct array {};
+
+template <class _Tp, decltype(sizeof(int)) _Size>
+struct std::tuple_size<array<_Tp, _Size>> : integral_constant<_Size> {};
+
+static_assert(std::is_convertible_v<array<int, 3>, span<3>>);
+static_assert(!std::is_constructible_v<span<4>, array<int, 3>>);
+
+}
+
}
diff --git a/libc/shared/math.h b/libc/shared/math.h
index 4b2a0d8..924d0cb 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -47,6 +47,7 @@
#include "math/exp10f16.h"
#include "math/exp10m1f.h"
#include "math/exp10m1f16.h"
+#include "math/exp2.h"
#include "math/expf.h"
#include "math/expf16.h"
#include "math/frexpf.h"
diff --git a/libc/shared/math/exp2.h b/libc/shared/math/exp2.h
new file mode 100644
index 0000000..6f1e143
--- /dev/null
+++ b/libc/shared/math/exp2.h
@@ -0,0 +1,23 @@
+//===-- Shared exp2 function ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_EXP2_H
+#define LLVM_LIBC_SHARED_MATH_EXP2_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/exp2.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::exp2;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_EXP2_H
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 98f9bb42..4130fdf 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -374,6 +374,15 @@ add_header_library(
)
add_header_library(
+ common_constants
+ HDRS
+ common_constants.h
+ DEPENDS
+ libc.src.__support.macros.config
+ libc.src.__support.number_pair
+)
+
+add_header_library(
cos
HDRS
cos.h
@@ -705,6 +714,28 @@ add_header_library(
)
add_header_library(
+ exp2
+ HDRS
+ exp2.h
+ DEPENDS
+ .common_constants
+ .exp_utils
+ libc.src.__support.CPP.bit
+ libc.src.__support.CPP.optional
+ libc.src.__support.FPUtil.dyadic_float
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.nearest_integer
+ libc.src.__support.FPUtil.polyeval
+ libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.FPUtil.triple_double
+ libc.src.__support.integer_literals
+ libc.src.__support.macros.optimization
+ libc.src.errno.errno
+)
+
+add_header_library(
exp10
HDRS
exp10.h
diff --git a/libc/src/math/generic/common_constants.cpp b/libc/src/__support/math/common_constants.h
index 2a15df2..53abbfe 100644
--- a/libc/src/math/generic/common_constants.cpp
+++ b/libc/src/__support/math/common_constants.h
@@ -6,12 +6,29 @@
//
//===----------------------------------------------------------------------===//
-#include "common_constants.h"
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_COMMON_CONSTANTS_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_COMMON_CONSTANTS_H
+
#include "src/__support/macros/config.h"
#include "src/__support/number_pair.h"
namespace LIBC_NAMESPACE_DECL {
+namespace common_constants_internal {
+
+// log(2) generated by Sollya with:
+// > a = 2^-43 * nearestint(2^43*log(2));
+// LSB = 2^-43 is chosen so that e_x * LOG_2_HI is exact for -1075 < e_x < 1024.
+static constexpr double LOG_2_HI = 0x1.62e42fefa38p-1; // LSB = 2^-43
+// > b = round(log10(2) - a, D, RN);
+static constexpr double LOG_2_LO = 0x1.ef35793c7673p-45; // LSB = 2^-97
+
+// Minimax polynomial for (log(1 + x) - x)/x^2, generated by sollya with:
+// > P = fpminimax((log(1 + x) - x)/x^2, 5, [|D...|], [-2^-8, 2^-7]);
+constexpr double LOG_COEFFS[6] = {-0x1.fffffffffffffp-2, 0x1.5555555554a9bp-2,
+ -0x1.0000000094567p-2, 0x1.99999dcc9823cp-3,
+ -0x1.55550ac2e537ap-3, 0x1.21a02c4e624d7p-3};
+
// Range reduction constants for logarithms.
// r(0) = 1, r(127) = 0.5
// r(k) = 2^-8 * ceil(2^8 * (1 - 2^-8) / (1 + k*2^-7))
@@ -19,7 +36,7 @@ namespace LIBC_NAMESPACE_DECL {
// precision, and -2^-8 <= v < 2^-7.
// TODO(lntue): Add reference to how the constants are derived after the
// resulting paper is ready.
-alignas(8) const float R[128] = {
+alignas(8) static constexpr float R[128] = {
0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.e8p-1,
0x1.e4p-1, 0x1.ep-1, 0x1.dep-1, 0x1.dap-1, 0x1.d6p-1, 0x1.d4p-1, 0x1.dp-1,
0x1.ccp-1, 0x1.cap-1, 0x1.c6p-1, 0x1.c4p-1, 0x1.cp-1, 0x1.bep-1, 0x1.bap-1,
@@ -40,7 +57,7 @@ alignas(8) const float R[128] = {
0x1.0ap-1, 0x1.08p-1, 0x1.08p-1, 0x1.06p-1, 0x1.06p-1, 0x1.04p-1, 0x1.04p-1,
0x1.02p-1, 0x1.0p-1};
-const double RD[128] = {
+static constexpr double RD[128] = {
0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.e8p-1,
0x1.e4p-1, 0x1.ep-1, 0x1.dep-1, 0x1.dap-1, 0x1.d6p-1, 0x1.d4p-1, 0x1.dp-1,
0x1.ccp-1, 0x1.cap-1, 0x1.c6p-1, 0x1.c4p-1, 0x1.cp-1, 0x1.bep-1, 0x1.bap-1,
@@ -65,7 +82,7 @@ const double RD[128] = {
// available.
// Generated by Sollya with the formula: CD[i] = RD[i]*(1 + i*2^-7) - 1
// for RD[i] defined on the table above.
-const double CD[128] = {
+static constexpr double CD[128] = {
0.0, -0x1p-14, -0x1p-12, -0x1.2p-11, -0x1p-10, -0x1.9p-10,
-0x1.2p-9, -0x1.88p-9, -0x1p-8, -0x1.9p-11, -0x1.fp-10, -0x1.9cp-9,
-0x1p-12, -0x1.cp-10, -0x1.bp-9, -0x1.5p-11, -0x1.4p-9, 0x1p-14,
@@ -90,7 +107,7 @@ const double CD[128] = {
-0x1p-14, -0x1p-8,
};
-const double LOG_R[128] = {
+static constexpr double LOG_R[128] = {
0x0.0000000000000p0, 0x1.010157588de71p-7, 0x1.0205658935847p-6,
0x1.8492528c8cabfp-6, 0x1.0415d89e74444p-5, 0x1.466aed42de3eap-5,
0x1.894aa149fb343p-5, 0x1.ccb73cdddb2ccp-5, 0x1.08598b59e3a07p-4,
@@ -135,7 +152,7 @@ const double LOG_R[128] = {
0x1.5707a26bb8c66p-1, 0x1.5af405c3649ep-1, 0x1.5af405c3649ep-1,
0x1.5ee82aa24192p-1, 0x0.000000000000p0};
-const double LOG2_R[128] = {
+static constexpr double LOG2_R[128] = {
0x0.0000000000000p+0, 0x1.72c7ba20f7327p-7, 0x1.743ee861f3556p-6,
0x1.184b8e4c56af8p-5, 0x1.77394c9d958d5p-5, 0x1.d6ebd1f1febfep-5,
0x1.1bb32a600549dp-4, 0x1.4c560fe68af88p-4, 0x1.7d60496cfbb4cp-4,
@@ -188,7 +205,7 @@ const double LOG2_R[128] = {
// print("{", -c, ",", -b, "},");
// };
// We replace LOG_R[0] with log10(1.0) == 0.0
-alignas(16) const NumberPair<double> LOG_R_DD[128] = {
+alignas(16) static constexpr NumberPair<double> LOG_R_DD[128] = {
{0.0, 0.0},
{-0x1.0c76b999d2be8p-46, 0x1.010157589p-7},
{-0x1.3dc5b06e2f7d2p-45, 0x1.0205658938p-6},
@@ -324,7 +341,7 @@ alignas(16) const NumberPair<double> LOG_R_DD[128] = {
// Output range:
// [-0x1.3ffcp-15, 0x1.3e3dp-15]
// We store S2[i] = 2^16 (r(i - 2^6) - 1).
-alignas(8) const int S2[193] = {
+alignas(8) static constexpr int S2[193] = {
0x101, 0xfd, 0xf9, 0xf5, 0xf1, 0xed, 0xe9, 0xe5, 0xe1,
0xdd, 0xd9, 0xd5, 0xd1, 0xcd, 0xc9, 0xc5, 0xc1, 0xbd,
0xb9, 0xb4, 0xb0, 0xac, 0xa8, 0xa4, 0xa0, 0x9c, 0x98,
@@ -348,7 +365,7 @@ alignas(8) const int S2[193] = {
-0x1cd, -0x1d1, -0x1d5, -0x1d9, -0x1dd, -0x1e0, -0x1e4, -0x1e8, -0x1ec,
-0x1f0, -0x1f4, -0x1f8, -0x1fc};
-const double R2[193] = {
+static constexpr double R2[193] = {
0x1.0101p0, 0x1.00fdp0, 0x1.00f9p0, 0x1.00f5p0, 0x1.00f1p0,
0x1.00edp0, 0x1.00e9p0, 0x1.00e5p0, 0x1.00e1p0, 0x1.00ddp0,
0x1.00d9p0, 0x1.00d5p0, 0x1.00d1p0, 0x1.00cdp0, 0x1.00c9p0,
@@ -395,7 +412,7 @@ const double R2[193] = {
// Output range:
// [-0x1.01928p-22 , 0x1p-22]
// We store S[i] = 2^21 (r(i - 80) - 1).
-alignas(8) const int S3[161] = {
+alignas(8) static constexpr int S3[161] = {
0x50, 0x4f, 0x4e, 0x4d, 0x4c, 0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46,
0x45, 0x44, 0x43, 0x42, 0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b,
0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30,
@@ -418,7 +435,7 @@ alignas(8) const int S3[161] = {
// Output range:
// [-0x1.0002143p-29 , 0x1p-29]
// We store S[i] = 2^28 (r(i - 65) - 1).
-alignas(8) const int S4[130] = {
+alignas(8) static constexpr int S4[130] = {
0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37,
0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c,
0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21,
@@ -439,7 +456,7 @@ alignas(8) const int S4[130] = {
// Table is generated with Sollya as follow:
// > display = hexadecimal;
// > for i from -104 to 89 do { D(exp(i)); };
-const double EXP_M1[195] = {
+static constexpr double EXP_M1[195] = {
0x1.f1e6b68529e33p-151, 0x1.525be4e4e601dp-149, 0x1.cbe0a45f75eb1p-148,
0x1.3884e838aea68p-146, 0x1.a8c1f14e2af5dp-145, 0x1.20a717e64a9bdp-143,
0x1.8851d84118908p-142, 0x1.0a9bdfb02d240p-140, 0x1.6a5bea046b42ep-139,
@@ -511,7 +528,7 @@ const double EXP_M1[195] = {
// Table is generated with Sollya as follow:
// > display = hexadecimal;
// > for i from 0 to 127 do { D(exp(i / 128)); };
-const double EXP_M2[128] = {
+static constexpr double EXP_M2[128] = {
0x1.0000000000000p0, 0x1.0202015600446p0, 0x1.04080ab55de39p0,
0x1.06122436410ddp0, 0x1.08205601127edp0, 0x1.0a32a84e9c1f6p0,
0x1.0c49236829e8cp0, 0x1.0e63cfa7ab09dp0, 0x1.1082b577d34edp0,
@@ -557,4 +574,8 @@ const double EXP_M2[128] = {
0x1.568bb722dd593p1, 0x1.593b7d72305bbp1,
};
+} // namespace common_constants_internal
+
} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_COMMON_CONSTANTS_H
diff --git a/libc/src/__support/math/exp2.h b/libc/src/__support/math/exp2.h
new file mode 100644
index 0000000..7eaa465
--- /dev/null
+++ b/libc/src/__support/math/exp2.h
@@ -0,0 +1,425 @@
+//===-- Implementation header for exp2 --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP2_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP2_H
+
+#include "common_constants.h" // Lookup tables EXP2_MID1 and EXP_M2.
+#include "exp_constants.h"
+#include "exp_utils.h" // ziv_test_denorm.
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/optional.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/dyadic_float.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/FPUtil/triple_double.h"
+#include "src/__support/common.h"
+#include "src/__support/integer_literals.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+namespace exp2_internal {
+
+using namespace common_constants_internal;
+
+using fputil::DoubleDouble;
+using fputil::TripleDouble;
+using Float128 = typename fputil::DyadicFloat<128>;
+
+using LIBC_NAMESPACE::operator""_u128;
+
+// Error bounds:
+// Errors when using double precision.
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+constexpr double ERR_D = 0x1.0p-63;
+#else
+constexpr double ERR_D = 0x1.8p-63;
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+// Errors when using double-double precision.
+constexpr double ERR_DD = 0x1.0p-100;
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+// Polynomial approximations with double precision. Generated by Sollya with:
+// > P = fpminimax((2^x - 1)/x, 3, [|D...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]);
+// > P;
+// Error bounds:
+// | output - (2^dx - 1) / dx | < 1.5 * 2^-52.
+LIBC_INLINE static double poly_approx_d(double dx) {
+ // dx^2
+ double dx2 = dx * dx;
+ double c0 =
+ fputil::multiply_add(dx, 0x1.ebfbdff82c58ep-3, 0x1.62e42fefa39efp-1);
+ double c1 =
+ fputil::multiply_add(dx, 0x1.3b2aba7a95a89p-7, 0x1.c6b08e8fc0c0ep-5);
+ double p = fputil::multiply_add(dx2, c1, c0);
+ return p;
+}
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+// Polynomial approximation with double-double precision. Generated by Sollya
+// with:
+// > P = fpminimax((2^x - 1)/x, 5, [|DD...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]);
+// Error bounds:
+// | output - 2^(dx) | < 2^-101
+LIBC_INLINE static constexpr DoubleDouble
+poly_approx_dd(const DoubleDouble &dx) {
+ // Taylor polynomial.
+ constexpr DoubleDouble COEFFS[] = {
+ {0, 0x1p0},
+ {0x1.abc9e3b39824p-56, 0x1.62e42fefa39efp-1},
+ {-0x1.5e43a53e4527bp-57, 0x1.ebfbdff82c58fp-3},
+ {-0x1.d37963a9444eep-59, 0x1.c6b08d704a0cp-5},
+ {0x1.4eda1a81133dap-62, 0x1.3b2ab6fba4e77p-7},
+ {-0x1.c53fd1ba85d14p-64, 0x1.5d87fe7a265a5p-10},
+ {0x1.d89250b013eb8p-70, 0x1.430912f86cb8ep-13},
+ };
+
+ DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2],
+ COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]);
+ return p;
+}
+
+// Polynomial approximation with 128-bit precision:
+// Return exp(dx) ~ 1 + a0 * dx + a1 * dx^2 + ... + a6 * dx^7
+// For |dx| < 2^-13 + 2^-30:
+// | output - exp(dx) | < 2^-126.
+LIBC_INLINE static constexpr Float128 poly_approx_f128(const Float128 &dx) {
+ constexpr Float128 COEFFS_128[]{
+ {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0
+ {Sign::POS, -128, 0xb17217f7'd1cf79ab'c9e3b398'03f2f6af_u128},
+ {Sign::POS, -128, 0x3d7f7bff'058b1d50'de2d60dd'9c9a1d9f_u128},
+ {Sign::POS, -132, 0xe35846b8'2505fc59'9d3b15d9'e7fb6897_u128},
+ {Sign::POS, -134, 0x9d955b7d'd273b94e'184462f6'bcd2b9e7_u128},
+ {Sign::POS, -137, 0xaec3ff3c'53398883'39ea1bb9'64c51a89_u128},
+ {Sign::POS, -138, 0x2861225f'345c396a'842c5341'8fa8ae61_u128},
+ {Sign::POS, -144, 0xffe5fe2d'109a319d'7abeb5ab'd5ad2079_u128},
+ };
+
+ Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2],
+ COEFFS_128[3], COEFFS_128[4], COEFFS_128[5],
+ COEFFS_128[6], COEFFS_128[7]);
+ return p;
+}
+
+// Compute 2^(x) using 128-bit precision.
+// TODO(lntue): investigate triple-double precision implementation for this
+// step.
+LIBC_INLINE static constexpr Float128 exp2_f128(double x, int hi, int idx1,
+ int idx2) {
+ Float128 dx = Float128(x);
+
+ // TODO: Skip recalculating exp_mid1 and exp_mid2.
+ Float128 exp_mid1 =
+ fputil::quick_add(Float128(EXP2_MID1[idx1].hi),
+ fputil::quick_add(Float128(EXP2_MID1[idx1].mid),
+ Float128(EXP2_MID1[idx1].lo)));
+
+ Float128 exp_mid2 =
+ fputil::quick_add(Float128(EXP2_MID2[idx2].hi),
+ fputil::quick_add(Float128(EXP2_MID2[idx2].mid),
+ Float128(EXP2_MID2[idx2].lo)));
+
+ Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2);
+
+ Float128 p = poly_approx_f128(dx);
+
+ Float128 r = fputil::quick_mul(exp_mid, p);
+
+ r.exponent += hi;
+
+ return r;
+}
+
+// Compute 2^x with double-double precision.
+LIBC_INLINE static DoubleDouble
+exp2_double_double(double x, const DoubleDouble &exp_mid) {
+ DoubleDouble dx({0, x});
+
+ // Degree-6 polynomial approximation in double-double precision.
+ // | p - 2^x | < 2^-103.
+ DoubleDouble p = poly_approx_dd(dx);
+
+ // Error bounds: 2^-102.
+ DoubleDouble r = fputil::quick_mult(exp_mid, p);
+
+ return r;
+}
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+// When output is denormal.
+LIBC_INLINE static double exp2_denorm(double x) {
+ // Range reduction.
+ int k =
+ static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19);
+ double kd = static_cast<double>(k);
+
+ uint32_t idx1 = (k >> 6) & 0x3f;
+ uint32_t idx2 = k & 0x3f;
+
+ int hi = k >> 12;
+
+ DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi};
+ DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi};
+ DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2);
+
+ // |dx| < 2^-13 + 2^-30.
+ double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact
+
+ double mid_lo = dx * exp_mid.hi;
+
+ // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4.
+ double p = poly_approx_d(dx);
+
+ double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo);
+
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ return ziv_test_denorm</*SKIP_ZIV_TEST=*/true>(hi, exp_mid.hi, lo, ERR_D)
+ .value();
+#else
+ if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D);
+ LIBC_LIKELY(r.has_value()))
+ return r.value();
+
+ // Use double-double
+ DoubleDouble r_dd = exp2_double_double(dx, exp_mid);
+
+ if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD);
+ LIBC_LIKELY(r.has_value()))
+ return r.value();
+
+ // Use 128-bit precision
+ Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2);
+
+ return static_cast<double>(r_f128);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+}
+
+// Check for exceptional cases when:
+// * log2(1 - 2^-54) < x < log2(1 + 2^-53)
+// * x >= 1024
+// * x <= -1022
+// * x is inf or nan
+LIBC_INLINE static constexpr double set_exceptional(double x) {
+ using FPBits = typename fputil::FPBits<double>;
+ FPBits xbits(x);
+
+ uint64_t x_u = xbits.uintval();
+ uint64_t x_abs = xbits.abs().uintval();
+
+ // |x| < log2(1 + 2^-53)
+ if (x_abs <= 0x3ca71547652b82fd) {
+ // 2^(x) ~ 1 + x/2
+ return fputil::multiply_add(x, 0.5, 1.0);
+ }
+
+ // x <= -1022 || x >= 1024 or inf/nan.
+ if (x_u > 0xc08ff00000000000) {
+ // x <= -1075 or -inf/nan
+ if (x_u >= 0xc090cc0000000000) {
+ // exp(-Inf) = 0
+ if (xbits.is_inf())
+ return 0.0;
+
+ // exp(nan) = nan
+ if (xbits.is_nan())
+ return x;
+
+ if (fputil::quick_get_round() == FE_UPWARD)
+ return FPBits::min_subnormal().get_val();
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_UNDERFLOW);
+ return 0.0;
+ }
+
+ return exp2_denorm(x);
+ }
+
+ // x >= 1024 or +inf/nan
+ // x is finite
+ if (x_u < 0x7ff0'0000'0000'0000ULL) {
+ int rounding = fputil::quick_get_round();
+ if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
+ return FPBits::max_normal().get_val();
+
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_OVERFLOW);
+ }
+ // x is +inf or nan
+ return x + FPBits::inf().get_val();
+}
+
+} // namespace exp2_internal
+
+LIBC_INLINE static constexpr double exp2(double x) {
+ using namespace exp2_internal;
+ using FPBits = typename fputil::FPBits<double>;
+ FPBits xbits(x);
+
+ uint64_t x_u = xbits.uintval();
+
+ // x < -1022 or x >= 1024 or log2(1 - 2^-54) < x < log2(1 + 2^-53).
+ if (LIBC_UNLIKELY(x_u > 0xc08ff00000000000 ||
+ (x_u <= 0xbc971547652b82fe && x_u >= 0x4090000000000000) ||
+ x_u <= 0x3ca71547652b82fd)) {
+ return set_exceptional(x);
+ }
+
+ // Now -1075 < x <= log2(1 - 2^-54) or log2(1 + 2^-53) < x < 1024
+
+ // Range reduction:
+ // Let x = (hi + mid1 + mid2) + lo
+ // in which:
+ // hi is an integer
+ // mid1 * 2^6 is an integer
+ // mid2 * 2^12 is an integer
+ // then:
+ // 2^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 2^(lo).
+ // With this formula:
+ // - multiplying by 2^hi is exact and cheap, simply by adding the exponent
+ // field.
+ // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables.
+ // - 2^(lo) ~ 1 + a0*lo + a1 * lo^2 + ...
+ //
+  // We compute (hi + mid1 + mid2) together by performing the rounding on
+  // x * 2^12. Since |x| < |-1075| < 2^11,
+ // |x * 2^12| < 2^11 * 2^12 < 2^23,
+ // So we can fit the rounded result round(x * 2^12) in int32_t.
+ // Thus, the goal is to be able to use an additional addition and fixed width
+ // shift to get an int32_t representing round(x * 2^12).
+ //
+  // Assuming int32_t uses two's complement representation, since the mantissa
+  // part of a double precision number is unsigned with the leading bit hidden,
+  // if we add an extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 2^25 to the
+  // product, the parts that are < 2^e2 in the resulting mantissa of
+  // (x*2^12*L2E + C) form a proper two's complement representation of x*2^12.
+ //
+ // One small problem with this approach is that the sum (x*2^12 + C) in
+  // double precision is rounded to the least significant bit of the dominant
+ // factor C. In order to minimize the rounding errors from this addition, we
+ // want to minimize e1. Another constraint that we want is that after
+ // shifting the mantissa so that the least significant bit of int32_t
+ // corresponds to the unit bit of (x*2^12*L2E), the sign is correct without
+ // any adjustment. So combining these 2 requirements, we can choose
+ // C = 2^33 + 2^32, so that the sign bit corresponds to 2^31 bit, and hence
+ // after right shifting the mantissa, the resulting int32_t has correct sign.
+ // With this choice of C, the number of mantissa bits we need to shift to the
+ // right is: 52 - 33 = 19.
+ //
+ // Moreover, since the integer right shifts are equivalent to rounding down,
+ // we can add an extra 0.5 so that it will become round-to-nearest, tie-to-
+ // +infinity. So in particular, we can compute:
+ // hmm = x * 2^12 + C,
+ // where C = 2^33 + 2^32 + 2^-1, then if
+ // k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19),
+ // the reduced argument:
+ // lo = x - 2^-12 * k is bounded by:
+ // |lo| <= 2^-13 + 2^-12*2^-19
+ // = 2^-13 + 2^-31.
+ //
+ // Finally, notice that k only uses the mantissa of x * 2^12, so the
+ // exponent 2^12 is not needed. So we can simply define
+ // C = 2^(33 - 12) + 2^(32 - 12) + 2^(-13 - 12), and
+ // k = int32_t(lower 51 bits of double(x + C) >> 19).
+
+ // Rounding errors <= 2^-31.
+ int k =
+ static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19);
+ double kd = static_cast<double>(k);
+
+ uint32_t idx1 = (k >> 6) & 0x3f;
+ uint32_t idx2 = k & 0x3f;
+
+ int hi = k >> 12;
+
+ DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi};
+ DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi};
+ DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2);
+
+ // |dx| < 2^-13 + 2^-30.
+ double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact
+
+ // We use the degree-4 polynomial to approximate 2^(lo):
+ // 2^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4 = 1 + lo * P(lo)
+ // So that the errors are bounded by:
+ // |P(lo) - (2^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58
+ // Let P_ be an evaluation of P where all intermediate computations are in
+ // double precision. Using either Horner's or Estrin's schemes, the evaluated
+ // errors can be bounded by:
+ // |P_(lo) - P(lo)| < 2^-51
+ // => |lo * P_(lo) - (2^lo - 1) | < 2^-64
+ // => 2^(mid1 + mid2) * |lo * P_(lo) - expm1(lo)| < 2^-63.
+ // Since we approximate
+ // 2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo,
+ // We use the expression:
+ // (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~
+ // ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo)
+ // with errors bounded by 2^-63.
+
+ double mid_lo = dx * exp_mid.hi;
+
+ // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4.
+ double p = poly_approx_d(dx);
+
+ double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo);
+
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ // To multiply by 2^hi, a fast way is to simply add hi to the exponent
+ // field.
+ int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
+ double r =
+ cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(exp_mid.hi + lo));
+ return r;
+#else
+ double upper = exp_mid.hi + (lo + ERR_D);
+ double lower = exp_mid.hi + (lo - ERR_D);
+
+ if (LIBC_LIKELY(upper == lower)) {
+ // To multiply by 2^hi, a fast way is to simply add hi to the exponent
+ // field.
+ int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
+ double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper));
+ return r;
+ }
+
+ // Use double-double
+ DoubleDouble r_dd = exp2_double_double(dx, exp_mid);
+
+ double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD);
+ double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD);
+
+ if (LIBC_LIKELY(upper_dd == lower_dd)) {
+ // To multiply by 2^hi, a fast way is to simply add hi to the exponent
+ // field.
+ int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
+ double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper_dd));
+ return r;
+ }
+
+ // Use 128-bit precision
+ Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2);
+
+ return static_cast<double>(r_f128);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP2_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 99c1b08..28ea475 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1448,21 +1448,7 @@ add_entrypoint_object(
HDRS
../exp2.h
DEPENDS
- .common_constants
- libc.src.__support.CPP.bit
- libc.src.__support.CPP.optional
- libc.src.__support.FPUtil.dyadic_float
- libc.src.__support.FPUtil.fenv_impl
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.multiply_add
- libc.src.__support.FPUtil.nearest_integer
- libc.src.__support.FPUtil.polyeval
- libc.src.__support.FPUtil.rounding_mode
- libc.src.__support.FPUtil.triple_double
- libc.src.__support.integer_literals
- libc.src.__support.macros.optimization
- libc.src.__support.math.exp_utils
- libc.src.errno.errno
+ libc.src.__support.math.exp2
)
add_header_library(
@@ -1613,7 +1599,6 @@ add_entrypoint_object(
HDRS
../expm1.h
DEPENDS
- .common_constants
libc.src.__support.CPP.bit
libc.src.__support.FPUtil.dyadic_float
libc.src.__support.FPUtil.fenv_impl
@@ -1624,6 +1609,7 @@ add_entrypoint_object(
libc.src.__support.FPUtil.triple_double
libc.src.__support.integer_literals
libc.src.__support.macros.optimization
+ libc.src.__support.math.common_constants
libc.src.errno.errno
)
@@ -1634,7 +1620,6 @@ add_entrypoint_object(
HDRS
../expm1f.h
DEPENDS
- .common_constants
libc.src.__support.FPUtil.basic_operations
libc.src.__support.FPUtil.fenv_impl
libc.src.__support.FPUtil.fp_bits
@@ -1643,6 +1628,7 @@ add_entrypoint_object(
libc.src.__support.FPUtil.polyeval
libc.src.__support.FPUtil.rounding_mode
libc.src.__support.macros.optimization
+ libc.src.__support.math.common_constants
libc.src.errno.errno
)
@@ -1673,7 +1659,6 @@ add_entrypoint_object(
HDRS
../powf.h
DEPENDS
- .common_constants
.exp2f_impl
libc.src.__support.math.exp10f
libc.src.__support.CPP.bit
@@ -1685,6 +1670,7 @@ add_entrypoint_object(
libc.src.__support.FPUtil.sqrt
libc.src.__support.FPUtil.triple_double
libc.src.__support.macros.optimization
+ libc.src.__support.math.common_constants
libc.src.errno.errno
)
@@ -1695,7 +1681,6 @@ add_entrypoint_object(
HDRS
../pow.h
DEPENDS
- .common_constants
libc.hdr.errno_macros
libc.hdr.fenv_macros
libc.src.__support.CPP.bit
@@ -1707,6 +1692,7 @@ add_entrypoint_object(
libc.src.__support.FPUtil.polyeval
libc.src.__support.FPUtil.sqrt
libc.src.__support.macros.optimization
+ libc.src.__support.math.common_constants
)
add_entrypoint_object(
@@ -2043,26 +2029,14 @@ add_entrypoint_object(
libc.src.__support.macros.properties.types
)
-add_object_library(
- common_constants
- HDRS
- common_constants.h
- SRCS
- common_constants.cpp
- DEPENDS
- libc.src.__support.math.exp_constants
- libc.src.__support.math.acosh_float_constants
- libc.src.__support.number_pair
-)
-
add_header_library(
log_range_reduction
HDRS
log_range_reduction.h
DEPENDS
- .common_constants
- libc.src.__support.uint128
libc.src.__support.FPUtil.dyadic_float
+ libc.src.__support.math.common_constants
+ libc.src.__support.uint128
)
add_entrypoint_object(
@@ -2072,7 +2046,6 @@ add_entrypoint_object(
HDRS
../log10.h
DEPENDS
- .common_constants
.log_range_reduction
libc.src.__support.FPUtil.double_double
libc.src.__support.FPUtil.dyadic_float
@@ -2082,6 +2055,7 @@ add_entrypoint_object(
libc.src.__support.FPUtil.polyeval
libc.src.__support.integer_literals
libc.src.__support.macros.optimization
+ libc.src.__support.math.common_constants
)
add_entrypoint_object(
@@ -2091,12 +2065,12 @@ add_entrypoint_object(
HDRS
../log10f.h
DEPENDS
- .common_constants
libc.src.__support.FPUtil.except_value_utils
libc.src.__support.FPUtil.fenv_impl
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.fma
libc.src.__support.FPUtil.polyeval
+ libc.src.__support.math.common_constants
)
add_entrypoint_object(
@@ -2126,7 +2100,6 @@ add_entrypoint_object(
HDRS
../log1p.h
DEPENDS
- .common_constants
libc.src.__support.FPUtil.double_double
libc.src.__support.FPUtil.dyadic_float
libc.src.__support.FPUtil.fenv_impl
@@ -2135,6 +2108,7 @@ add_entrypoint_object(
libc.src.__support.FPUtil.polyeval
libc.src.__support.integer_literals
libc.src.__support.macros.optimization
+ libc.src.__support.math.common_constants
)
add_entrypoint_object(
@@ -2144,13 +2118,13 @@ add_entrypoint_object(
HDRS
../log1pf.h
DEPENDS
- .common_constants
libc.src.__support.FPUtil.except_value_utils
libc.src.__support.FPUtil.fenv_impl
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.fma
libc.src.__support.FPUtil.polyeval
libc.src.__support.macros.optimization
+ libc.src.__support.math.common_constants
)
add_entrypoint_object(
@@ -2160,7 +2134,6 @@ add_entrypoint_object(
HDRS
../log2.h
DEPENDS
- .common_constants
.log_range_reduction
libc.src.__support.FPUtil.double_double
libc.src.__support.FPUtil.dyadic_float
@@ -2170,6 +2143,7 @@ add_entrypoint_object(
libc.src.__support.FPUtil.polyeval
libc.src.__support.integer_literals
libc.src.__support.macros.optimization
+ libc.src.__support.math.common_constants
)
add_entrypoint_object(
@@ -2179,13 +2153,13 @@ add_entrypoint_object(
HDRS
../log2f.h
DEPENDS
- .common_constants
libc.src.__support.FPUtil.except_value_utils
libc.src.__support.FPUtil.fenv_impl
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.fma
libc.src.__support.FPUtil.polyeval
libc.src.__support.macros.optimization
+ libc.src.__support.math.common_constants
)
add_entrypoint_object(
@@ -2215,7 +2189,6 @@ add_entrypoint_object(
HDRS
../log.h
DEPENDS
- .common_constants
.log_range_reduction
libc.src.__support.FPUtil.double_double
libc.src.__support.FPUtil.dyadic_float
@@ -2225,6 +2198,7 @@ add_entrypoint_object(
libc.src.__support.FPUtil.polyeval
libc.src.__support.integer_literals
libc.src.__support.macros.optimization
+ libc.src.__support.math.common_constants
)
add_entrypoint_object(
@@ -2234,7 +2208,6 @@ add_entrypoint_object(
HDRS
../logf.h
DEPENDS
- .common_constants
libc.src.__support.FPUtil.except_value_utils
libc.src.__support.FPUtil.fenv_impl
libc.src.__support.FPUtil.fp_bits
@@ -2242,6 +2215,7 @@ add_entrypoint_object(
libc.src.__support.FPUtil.polyeval
libc.src.__support.macros.optimization
libc.src.__support.macros.properties.cpu_features
+ libc.src.__support.math.common_constants
)
add_entrypoint_object(
diff --git a/libc/src/math/generic/common_constants.h b/libc/src/math/generic/common_constants.h
deleted file mode 100644
index 9ee31f0..0000000
--- a/libc/src/math/generic/common_constants.h
+++ /dev/null
@@ -1,73 +0,0 @@
-//===-- Common constants for math functions ---------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H
-#define LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H
-
-#include "src/__support/FPUtil/triple_double.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/math/acosh_float_constants.h"
-#include "src/__support/math/exp_constants.h"
-#include "src/__support/number_pair.h"
-
-namespace LIBC_NAMESPACE_DECL {
-
-// Lookup table for range reduction constants r for logarithms.
-extern const float R[128];
-
-// Lookup table for range reduction constants r for logarithms.
-extern const double RD[128];
-
-// Lookup table for compensated constants for exact range reduction when FMA
-// instructions are not available.
-extern const double CD[128];
-
-// Lookup table for -log(r)
-extern const double LOG_R[128];
-extern const NumberPair<double> LOG_R_DD[128];
-
-// Lookup table for -log2(r)
-extern const double LOG2_R[128];
-
-// Minimax polynomial for (log(1 + x) - x)/x^2, generated by sollya with:
-// > P = fpminimax((log(1 + x) - x)/x^2, 5, [|D...|], [-2^-8, 2^-7]);
-constexpr double LOG_COEFFS[6] = {-0x1.fffffffffffffp-2, 0x1.5555555554a9bp-2,
- -0x1.0000000094567p-2, 0x1.99999dcc9823cp-3,
- -0x1.55550ac2e537ap-3, 0x1.21a02c4e624d7p-3};
-
-// Logarithm Range Reduction - Step 2, 3, and 4.
-extern const int S2[193];
-extern const int S3[161];
-extern const int S4[130];
-
-extern const double R2[193];
-
-// log(2) generated by Sollya with:
-// > a = 2^-43 * nearestint(2^43*log(2));
-// LSB = 2^-43 is chosen so that e_x * LOG_2_HI is exact for -1075 < e_x < 1024.
-constexpr double LOG_2_HI = 0x1.62e42fefa38p-1; // LSB = 2^-43
-// > b = round(log10(2) - a, D, RN);
-constexpr double LOG_2_LO = 0x1.ef35793c7673p-45; // LSB = 2^-97
-
-// Lookup table for exp(m) with m = -104, ..., 89.
-// -104 = floor(log(single precision's min denormal))
-// 89 = ceil(log(single precision's max normal))
-// Table is generated with Sollya as follow:
-// > display = hexadecimal;
-// > for i from -104 to 89 do { D(exp(i)); };
-extern const double EXP_M1[195];
-
-// Lookup table for exp(m * 2^(-7)) with m = 0, ..., 127.
-// Table is generated with Sollya as follow:
-// > display = hexadecimal;
-// > for i from 0 to 127 do { D(exp(i / 128)); };
-extern const double EXP_M2[128];
-
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H
diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp
index 154154f..20e1ff5 100644
--- a/libc/src/math/generic/exp2.cpp
+++ b/libc/src/math/generic/exp2.cpp
@@ -7,404 +7,10 @@
//===----------------------------------------------------------------------===//
#include "src/math/exp2.h"
-#include "common_constants.h" // Lookup tables EXP2_MID1 and EXP_M2.
-#include "src/__support/CPP/bit.h"
-#include "src/__support/CPP/optional.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/double_double.h"
-#include "src/__support/FPUtil/dyadic_float.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/FPUtil/triple_double.h"
-#include "src/__support/common.h"
-#include "src/__support/integer_literals.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "src/__support/math/exp_utils.h" // ziv_test_denorm.
+#include "src/__support/math/exp2.h"
namespace LIBC_NAMESPACE_DECL {
-using fputil::DoubleDouble;
-using fputil::TripleDouble;
-using Float128 = typename fputil::DyadicFloat<128>;
-
-using LIBC_NAMESPACE::operator""_u128;
-
-// Error bounds:
-// Errors when using double precision.
-#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
-constexpr double ERR_D = 0x1.0p-63;
-#else
-constexpr double ERR_D = 0x1.8p-63;
-#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-// Errors when using double-double precision.
-constexpr double ERR_DD = 0x1.0p-100;
-#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-namespace {
-
-// Polynomial approximations with double precision. Generated by Sollya with:
-// > P = fpminimax((2^x - 1)/x, 3, [|D...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]);
-// > P;
-// Error bounds:
-// | output - (2^dx - 1) / dx | < 1.5 * 2^-52.
-LIBC_INLINE double poly_approx_d(double dx) {
- // dx^2
- double dx2 = dx * dx;
- double c0 =
- fputil::multiply_add(dx, 0x1.ebfbdff82c58ep-3, 0x1.62e42fefa39efp-1);
- double c1 =
- fputil::multiply_add(dx, 0x1.3b2aba7a95a89p-7, 0x1.c6b08e8fc0c0ep-5);
- double p = fputil::multiply_add(dx2, c1, c0);
- return p;
-}
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-// Polynomial approximation with double-double precision. Generated by Solya
-// with:
-// > P = fpminimax((2^x - 1)/x, 5, [|DD...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]);
-// Error bounds:
-// | output - 2^(dx) | < 2^-101
-DoubleDouble poly_approx_dd(const DoubleDouble &dx) {
- // Taylor polynomial.
- constexpr DoubleDouble COEFFS[] = {
- {0, 0x1p0},
- {0x1.abc9e3b39824p-56, 0x1.62e42fefa39efp-1},
- {-0x1.5e43a53e4527bp-57, 0x1.ebfbdff82c58fp-3},
- {-0x1.d37963a9444eep-59, 0x1.c6b08d704a0cp-5},
- {0x1.4eda1a81133dap-62, 0x1.3b2ab6fba4e77p-7},
- {-0x1.c53fd1ba85d14p-64, 0x1.5d87fe7a265a5p-10},
- {0x1.d89250b013eb8p-70, 0x1.430912f86cb8ep-13},
- };
-
- DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2],
- COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]);
- return p;
-}
-
-// Polynomial approximation with 128-bit precision:
-// Return exp(dx) ~ 1 + a0 * dx + a1 * dx^2 + ... + a6 * dx^7
-// For |dx| < 2^-13 + 2^-30:
-// | output - exp(dx) | < 2^-126.
-Float128 poly_approx_f128(const Float128 &dx) {
- constexpr Float128 COEFFS_128[]{
- {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0
- {Sign::POS, -128, 0xb17217f7'd1cf79ab'c9e3b398'03f2f6af_u128},
- {Sign::POS, -128, 0x3d7f7bff'058b1d50'de2d60dd'9c9a1d9f_u128},
- {Sign::POS, -132, 0xe35846b8'2505fc59'9d3b15d9'e7fb6897_u128},
- {Sign::POS, -134, 0x9d955b7d'd273b94e'184462f6'bcd2b9e7_u128},
- {Sign::POS, -137, 0xaec3ff3c'53398883'39ea1bb9'64c51a89_u128},
- {Sign::POS, -138, 0x2861225f'345c396a'842c5341'8fa8ae61_u128},
- {Sign::POS, -144, 0xffe5fe2d'109a319d'7abeb5ab'd5ad2079_u128},
- };
-
- Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2],
- COEFFS_128[3], COEFFS_128[4], COEFFS_128[5],
- COEFFS_128[6], COEFFS_128[7]);
- return p;
-}
-
-// Compute 2^(x) using 128-bit precision.
-// TODO(lntue): investigate triple-double precision implementation for this
-// step.
-Float128 exp2_f128(double x, int hi, int idx1, int idx2) {
- Float128 dx = Float128(x);
-
- // TODO: Skip recalculating exp_mid1 and exp_mid2.
- Float128 exp_mid1 =
- fputil::quick_add(Float128(EXP2_MID1[idx1].hi),
- fputil::quick_add(Float128(EXP2_MID1[idx1].mid),
- Float128(EXP2_MID1[idx1].lo)));
-
- Float128 exp_mid2 =
- fputil::quick_add(Float128(EXP2_MID2[idx2].hi),
- fputil::quick_add(Float128(EXP2_MID2[idx2].mid),
- Float128(EXP2_MID2[idx2].lo)));
-
- Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2);
-
- Float128 p = poly_approx_f128(dx);
-
- Float128 r = fputil::quick_mul(exp_mid, p);
-
- r.exponent += hi;
-
- return r;
-}
-
-// Compute 2^x with double-double precision.
-DoubleDouble exp2_double_double(double x, const DoubleDouble &exp_mid) {
- DoubleDouble dx({0, x});
-
- // Degree-6 polynomial approximation in double-double precision.
- // | p - 2^x | < 2^-103.
- DoubleDouble p = poly_approx_dd(dx);
-
- // Error bounds: 2^-102.
- DoubleDouble r = fputil::quick_mult(exp_mid, p);
-
- return r;
-}
-#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-// When output is denormal.
-double exp2_denorm(double x) {
- // Range reduction.
- int k =
- static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19);
- double kd = static_cast<double>(k);
-
- uint32_t idx1 = (k >> 6) & 0x3f;
- uint32_t idx2 = k & 0x3f;
-
- int hi = k >> 12;
-
- DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi};
- DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi};
- DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2);
-
- // |dx| < 2^-13 + 2^-30.
- double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact
-
- double mid_lo = dx * exp_mid.hi;
-
- // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4.
- double p = poly_approx_d(dx);
-
- double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo);
-
-#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- return ziv_test_denorm</*SKIP_ZIV_TEST=*/true>(hi, exp_mid.hi, lo, ERR_D)
- .value();
-#else
- if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D);
- LIBC_LIKELY(r.has_value()))
- return r.value();
-
- // Use double-double
- DoubleDouble r_dd = exp2_double_double(dx, exp_mid);
-
- if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD);
- LIBC_LIKELY(r.has_value()))
- return r.value();
-
- // Use 128-bit precision
- Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2);
-
- return static_cast<double>(r_f128);
-#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-}
-
-// Check for exceptional cases when:
-// * log2(1 - 2^-54) < x < log2(1 + 2^-53)
-// * x >= 1024
-// * x <= -1022
-// * x is inf or nan
-double set_exceptional(double x) {
- using FPBits = typename fputil::FPBits<double>;
- FPBits xbits(x);
-
- uint64_t x_u = xbits.uintval();
- uint64_t x_abs = xbits.abs().uintval();
-
- // |x| < log2(1 + 2^-53)
- if (x_abs <= 0x3ca71547652b82fd) {
- // 2^(x) ~ 1 + x/2
- return fputil::multiply_add(x, 0.5, 1.0);
- }
-
- // x <= -1022 || x >= 1024 or inf/nan.
- if (x_u > 0xc08ff00000000000) {
- // x <= -1075 or -inf/nan
- if (x_u >= 0xc090cc0000000000) {
- // exp(-Inf) = 0
- if (xbits.is_inf())
- return 0.0;
-
- // exp(nan) = nan
- if (xbits.is_nan())
- return x;
-
- if (fputil::quick_get_round() == FE_UPWARD)
- return FPBits::min_subnormal().get_val();
- fputil::set_errno_if_required(ERANGE);
- fputil::raise_except_if_required(FE_UNDERFLOW);
- return 0.0;
- }
-
- return exp2_denorm(x);
- }
-
- // x >= 1024 or +inf/nan
- // x is finite
- if (x_u < 0x7ff0'0000'0000'0000ULL) {
- int rounding = fputil::quick_get_round();
- if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
- return FPBits::max_normal().get_val();
-
- fputil::set_errno_if_required(ERANGE);
- fputil::raise_except_if_required(FE_OVERFLOW);
- }
- // x is +inf or nan
- return x + FPBits::inf().get_val();
-}
-
-} // namespace
-
-LLVM_LIBC_FUNCTION(double, exp2, (double x)) {
- using FPBits = typename fputil::FPBits<double>;
- FPBits xbits(x);
-
- uint64_t x_u = xbits.uintval();
-
- // x < -1022 or x >= 1024 or log2(1 - 2^-54) < x < log2(1 + 2^-53).
- if (LIBC_UNLIKELY(x_u > 0xc08ff00000000000 ||
- (x_u <= 0xbc971547652b82fe && x_u >= 0x4090000000000000) ||
- x_u <= 0x3ca71547652b82fd)) {
- return set_exceptional(x);
- }
-
- // Now -1075 < x <= log2(1 - 2^-54) or log2(1 + 2^-53) < x < 1024
-
- // Range reduction:
- // Let x = (hi + mid1 + mid2) + lo
- // in which:
- // hi is an integer
- // mid1 * 2^6 is an integer
- // mid2 * 2^12 is an integer
- // then:
- // 2^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 2^(lo).
- // With this formula:
- // - multiplying by 2^hi is exact and cheap, simply by adding the exponent
- // field.
- // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables.
- // - 2^(lo) ~ 1 + a0*lo + a1 * lo^2 + ...
- //
- // We compute (hi + mid1 + mid2) together by perform the rounding on x * 2^12.
- // Since |x| < |-1075)| < 2^11,
- // |x * 2^12| < 2^11 * 2^12 < 2^23,
- // So we can fit the rounded result round(x * 2^12) in int32_t.
- // Thus, the goal is to be able to use an additional addition and fixed width
- // shift to get an int32_t representing round(x * 2^12).
- //
- // Assuming int32_t using 2-complement representation, since the mantissa part
- // of a double precision is unsigned with the leading bit hidden, if we add an
- // extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 2^25 to the product, the
- // part that are < 2^e2 in resulted mantissa of (x*2^12*L2E + C) can be
- // considered as a proper 2-complement representations of x*2^12.
- //
- // One small problem with this approach is that the sum (x*2^12 + C) in
- // double precision is rounded to the least significant bit of the dorminant
- // factor C. In order to minimize the rounding errors from this addition, we
- // want to minimize e1. Another constraint that we want is that after
- // shifting the mantissa so that the least significant bit of int32_t
- // corresponds to the unit bit of (x*2^12*L2E), the sign is correct without
- // any adjustment. So combining these 2 requirements, we can choose
- // C = 2^33 + 2^32, so that the sign bit corresponds to 2^31 bit, and hence
- // after right shifting the mantissa, the resulting int32_t has correct sign.
- // With this choice of C, the number of mantissa bits we need to shift to the
- // right is: 52 - 33 = 19.
- //
- // Moreover, since the integer right shifts are equivalent to rounding down,
- // we can add an extra 0.5 so that it will become round-to-nearest, tie-to-
- // +infinity. So in particular, we can compute:
- // hmm = x * 2^12 + C,
- // where C = 2^33 + 2^32 + 2^-1, then if
- // k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19),
- // the reduced argument:
- // lo = x - 2^-12 * k is bounded by:
- // |lo| <= 2^-13 + 2^-12*2^-19
- // = 2^-13 + 2^-31.
- //
- // Finally, notice that k only uses the mantissa of x * 2^12, so the
- // exponent 2^12 is not needed. So we can simply define
- // C = 2^(33 - 12) + 2^(32 - 12) + 2^(-13 - 12), and
- // k = int32_t(lower 51 bits of double(x + C) >> 19).
-
- // Rounding errors <= 2^-31.
- int k =
- static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19);
- double kd = static_cast<double>(k);
-
- uint32_t idx1 = (k >> 6) & 0x3f;
- uint32_t idx2 = k & 0x3f;
-
- int hi = k >> 12;
-
- DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi};
- DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi};
- DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2);
-
- // |dx| < 2^-13 + 2^-30.
- double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact
-
- // We use the degree-4 polynomial to approximate 2^(lo):
- // 2^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4 = 1 + lo * P(lo)
- // So that the errors are bounded by:
- // |P(lo) - (2^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58
- // Let P_ be an evaluation of P where all intermediate computations are in
- // double precision. Using either Horner's or Estrin's schemes, the evaluated
- // errors can be bounded by:
- // |P_(lo) - P(lo)| < 2^-51
- // => |lo * P_(lo) - (2^lo - 1) | < 2^-64
- // => 2^(mid1 + mid2) * |lo * P_(lo) - expm1(lo)| < 2^-63.
- // Since we approximate
- // 2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo,
- // We use the expression:
- // (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~
- // ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo)
- // with errors bounded by 2^-63.
-
- double mid_lo = dx * exp_mid.hi;
-
- // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4.
- double p = poly_approx_d(dx);
-
- double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo);
-
-#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- // To multiply by 2^hi, a fast way is to simply add hi to the exponent
- // field.
- int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
- double r =
- cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(exp_mid.hi + lo));
- return r;
-#else
- double upper = exp_mid.hi + (lo + ERR_D);
- double lower = exp_mid.hi + (lo - ERR_D);
-
- if (LIBC_LIKELY(upper == lower)) {
- // To multiply by 2^hi, a fast way is to simply add hi to the exponent
- // field.
- int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
- double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper));
- return r;
- }
-
- // Use double-double
- DoubleDouble r_dd = exp2_double_double(dx, exp_mid);
-
- double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD);
- double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD);
-
- if (LIBC_LIKELY(upper_dd == lower_dd)) {
- // To multiply by 2^hi, a fast way is to simply add hi to the exponent
- // field.
- int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
- double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper_dd));
- return r;
- }
-
- // Use 128-bit precision
- Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2);
-
- return static_cast<double>(r_f128);
-#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-}
+LLVM_LIBC_FUNCTION(double, exp2, (double x)) { return math::exp2(x); }
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp
index c360554..a3d0c1a 100644
--- a/libc/src/math/generic/expm1.cpp
+++ b/libc/src/math/generic/expm1.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "src/math/expm1.h"
-#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
#include "src/__support/CPP/bit.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
@@ -22,6 +21,8 @@
#include "src/__support/integer_literals.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
+#include "src/__support/math/exp_constants.h"
#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0)
#define LIBC_MATH_EXPM1_SKIP_ACCURATE_PASS
@@ -59,6 +60,8 @@ constexpr double MLOG_2_EXP2_M12_LO = 0x1.b0e2633fe0685p-79;
namespace {
+using namespace common_constants_internal;
+
// Polynomial approximations with double precision:
// Return expm1(dx) / x ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24.
// For |dx| < 2^-13 + 2^-30:
diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
index b2967e2..72c8aa3 100644
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "src/math/expm1f.h"
-#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
#include "src/__support/FPUtil/BasicOperations.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FMA.h"
@@ -20,10 +19,12 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
+#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
+ using namespace common_constants_internal;
using FPBits = typename fputil::FPBits<float>;
FPBits xbits(x);
diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp
index 0cd4424..66ce059 100644
--- a/libc/src/math/generic/log.cpp
+++ b/libc/src/math/generic/log.cpp
@@ -18,8 +18,8 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "common_constants.h"
#include "log_range_reduction.h"
+#include "src/__support/math/common_constants.h"
namespace LIBC_NAMESPACE_DECL {
@@ -30,6 +30,8 @@ using LIBC_NAMESPACE::operator""_u128;
namespace {
+using namespace common_constants_internal;
+
#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
// A simple upper bound for the error of e_x * log(2) - log(r).
constexpr double HI_ERR = 0x1.0p-85;
diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp
index 1c4e559..95f24fa 100644
--- a/libc/src/math/generic/log10.cpp
+++ b/libc/src/math/generic/log10.cpp
@@ -18,8 +18,8 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "common_constants.h"
#include "log_range_reduction.h"
+#include "src/__support/math/common_constants.h"
namespace LIBC_NAMESPACE_DECL {
@@ -30,6 +30,8 @@ using LIBC_NAMESPACE::operator""_u128;
namespace {
+using namespace common_constants_internal;
+
constexpr fputil::DoubleDouble LOG10_E = {0x1.95355baaafad3p-57,
0x1.bcb7b1526e50ep-2};
@@ -739,6 +741,7 @@ double log10_accurate(int e_x, int index, double m_x) {
} // namespace
LLVM_LIBC_FUNCTION(double, log10, (double x)) {
+ using namespace common_constants_internal;
using FPBits_t = typename fputil::FPBits<double>;
FPBits_t xbits(x);
diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp
index 81e7cdb..6b9cc5d 100644
--- a/libc/src/math/generic/log10f.cpp
+++ b/libc/src/math/generic/log10f.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "src/math/log10f.h"
-#include "common_constants.h" // Lookup table for (1/f)
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FMA.h"
#include "src/__support/FPUtil/FPBits.h"
@@ -18,6 +17,7 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/__support/macros/properties/cpu_features.h"
+#include "src/__support/math/common_constants.h" // Lookup table for (1/f)
// This is an algorithm for log10(x) in single precision which is
// correctly rounded for all rounding modes, based on the implementation of
@@ -104,6 +104,7 @@ static constexpr double LOG10_R[128] = {
0x1.30cb3a7bb3625p-2, 0x1.34413509f79ffp-2};
LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
+ using namespace common_constants_internal;
constexpr double LOG10_2 = 0x1.34413509f79ffp-2;
using FPBits = typename fputil::FPBits<float>;
diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp
index 09f465a..1595981 100644
--- a/libc/src/math/generic/log1p.cpp
+++ b/libc/src/math/generic/log1p.cpp
@@ -18,7 +18,7 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "common_constants.h"
+#include "src/__support/math/common_constants.h"
namespace LIBC_NAMESPACE_DECL {
@@ -29,6 +29,8 @@ using LIBC_NAMESPACE::operator""_u128;
namespace {
+using namespace common_constants_internal;
+
// R1[i] = 2^-8 * nearestint( 2^8 / (1 + i * 2^-7) )
constexpr double R1[129] = {
0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.eap-1,
diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp
index 16b1b34..f0289c2 100644
--- a/libc/src/math/generic/log1pf.cpp
+++ b/libc/src/math/generic/log1pf.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "src/math/log1pf.h"
-#include "common_constants.h" // Lookup table for (1/f) and log(f)
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FMA.h"
#include "src/__support/FPUtil/FPBits.h"
@@ -18,6 +17,8 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/__support/macros/properties/cpu_features.h"
+#include "src/__support/math/acosh_float_constants.h"
+#include "src/__support/math/common_constants.h" // Lookup table for (1/f) and log(f)
// This is an algorithm for log10(x) in single precision which is
// correctly rounded for all rounding modes.
@@ -38,6 +39,7 @@ namespace internal {
// We don't need to treat denormal and 0
LIBC_INLINE float log(double x) {
using namespace acoshf_internal;
+ using namespace common_constants_internal;
constexpr double LOG_2 = 0x1.62e42fefa39efp-1;
using FPBits = typename fputil::FPBits<double>;
diff --git a/libc/src/math/generic/log2.cpp b/libc/src/math/generic/log2.cpp
index 27ca2fc..f0c0ae3 100644
--- a/libc/src/math/generic/log2.cpp
+++ b/libc/src/math/generic/log2.cpp
@@ -18,8 +18,8 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "common_constants.h"
#include "log_range_reduction.h"
+#include "src/__support/math/common_constants.h"
namespace LIBC_NAMESPACE_DECL {
@@ -30,6 +30,8 @@ using LIBC_NAMESPACE::operator""_u128;
namespace {
+using namespace common_constants_internal;
+
constexpr fputil::DoubleDouble LOG2_E = {0x1.777d0ffda0d24p-56,
0x1.71547652b82fep0};
@@ -859,6 +861,7 @@ double log2_accurate(int e_x, int index, double m_x) {
} // namespace
LLVM_LIBC_FUNCTION(double, log2, (double x)) {
+ using namespace common_constants_internal;
using FPBits_t = typename fputil::FPBits<double>;
FPBits_t xbits(x);
diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp
index cff718e..7353f03 100644
--- a/libc/src/math/generic/log2f.cpp
+++ b/libc/src/math/generic/log2f.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "src/math/log2f.h"
-#include "common_constants.h" // Lookup table for (1/f)
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/PolyEval.h"
@@ -15,7 +14,8 @@
#include "src/__support/FPUtil/multiply_add.h"
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/math/common_constants.h" // Lookup table for (1/f)
// This is a correctly-rounded algorithm for log2(x) in single precision with
// round-to-nearest, tie-to-even mode from the RLIBM project at:
@@ -55,6 +55,7 @@
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(float, log2f, (float x)) {
+ using namespace common_constants_internal;
using FPBits = typename fputil::FPBits<float>;
FPBits xbits(x);
diff --git a/libc/src/math/generic/log_range_reduction.h b/libc/src/math/generic/log_range_reduction.h
index 8c94230..7484506 100644
--- a/libc/src/math/generic/log_range_reduction.h
+++ b/libc/src/math/generic/log_range_reduction.h
@@ -9,9 +9,9 @@
#ifndef LLVM_LIBC_SRC_MATH_GENERIC_LOG_RANGE_REDUCTION_H
#define LLVM_LIBC_SRC_MATH_GENERIC_LOG_RANGE_REDUCTION_H
-#include "common_constants.h"
#include "src/__support/FPUtil/dyadic_float.h"
#include "src/__support/macros/config.h"
+#include "src/__support/math/common_constants.h"
#include "src/__support/uint128.h"
namespace LIBC_NAMESPACE_DECL {
@@ -36,6 +36,7 @@ struct LogRR {
LIBC_INLINE fputil::DyadicFloat<128>
log_range_reduction(double m_x, const LogRR &log_table,
fputil::DyadicFloat<128> &sum) {
+ using namespace common_constants_internal;
using Float128 = typename fputil::DyadicFloat<128>;
using MType = typename Float128::MantissaType;
diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp
index e8d2ba2..4d2947d 100644
--- a/libc/src/math/generic/logf.cpp
+++ b/libc/src/math/generic/logf.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "src/math/logf.h"
-#include "common_constants.h" // Lookup table for (1/f) and log(f)
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/PolyEval.h"
@@ -17,6 +16,7 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/__support/macros/properties/cpu_features.h"
+#include "src/__support/math/common_constants.h" // Lookup table for (1/f) and log(f)
// This is an algorithm for log(x) in single precision which is correctly
// rounded for all rounding modes, based on the implementation of log(x) from
@@ -53,6 +53,7 @@
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(float, logf, (float x)) {
+ using namespace common_constants_internal;
constexpr double LOG_2 = 0x1.62e42fefa39efp-1;
using FPBits = typename fputil::FPBits<float>;
diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp
index 43e99a7..c9f685b 100644
--- a/libc/src/math/generic/pow.cpp
+++ b/libc/src/math/generic/pow.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "src/math/pow.h"
-#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
#include "hdr/errno_macros.h"
#include "hdr/fenv_macros.h"
#include "src/__support/CPP/bit.h"
@@ -21,6 +20,8 @@
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
+#include "src/__support/math/exp_constants.h" // Lookup tables EXP_M1 and EXP_M2.
namespace LIBC_NAMESPACE_DECL {
@@ -28,6 +29,8 @@ using fputil::DoubleDouble;
namespace {
+using namespace common_constants_internal;
+
// Constants for log2(x) range reduction, generated by Sollya with:
// > for i from 0 to 127 do {
// r = 2^-8 * ceil( 2^8 * (1 - 2^(-8)) / (1 + i*2^-7) );
diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp
index a45ef51..12246e9 100644
--- a/libc/src/math/generic/powf.cpp
+++ b/libc/src/math/generic/powf.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "src/math/powf.h"
-#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
#include "src/__support/CPP/bit.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/PolyEval.h"
@@ -15,10 +14,13 @@
#include "src/__support/FPUtil/multiply_add.h"
#include "src/__support/FPUtil/nearest_integer.h"
#include "src/__support/FPUtil/sqrt.h" // Speedup for powf(x, 1/2) = sqrtf(x)
+#include "src/__support/FPUtil/triple_double.h"
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
#include "src/__support/math/exp10f.h" // Speedup for powf(10, y) = exp10f(y)
+#include "src/__support/math/exp_constants.h"
#include "exp2f_impl.h" // Speedup for powf(2, y) = exp2f(y)
@@ -29,6 +31,8 @@ using fputil::TripleDouble;
namespace {
+using namespace common_constants_internal;
+
#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
alignas(16) constexpr DoubleDouble LOG2_R_DD[128] = {
{0.0, 0.0},
diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt
index ea4634c..040f635 100644
--- a/libc/test/shared/CMakeLists.txt
+++ b/libc/test/shared/CMakeLists.txt
@@ -40,6 +40,7 @@ add_fp_unittest(
libc.src.__support.math.exp10m1f16
libc.src.__support.math.erff
libc.src.__support.math.exp
+ libc.src.__support.math.exp2
libc.src.__support.math.exp10
libc.src.__support.math.exp10f
libc.src.__support.math.exp10f16
diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp
index 1722193..ef2e7b8 100644
--- a/libc/test/shared/shared_math_test.cpp
+++ b/libc/test/shared/shared_math_test.cpp
@@ -80,6 +80,7 @@ TEST(LlvmLibcSharedMathTest, AllDouble) {
EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::cos(0.0));
EXPECT_FP_EQ(0x0p+0, LIBC_NAMESPACE::shared::dsqrtl(0.0));
EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp(0.0));
+ EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp2(0.0));
EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp10(0.0));
}
diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst
index eae693b1..f86c736 100644
--- a/llvm/docs/GitHub.rst
+++ b/llvm/docs/GitHub.rst
@@ -567,8 +567,11 @@ branch. Just make sure to add the release milestone to the pull request.
Getting admin access to CI infrastructure
=========================================
-Any individual who is responsible for setting up and/or maintaining CI infrastructure for a LLVM project can
-request to be granted the CI/CD role to the LLVM organization admins. The request can be made by creating
-`a Github issue <https://github.com/llvm/llvm-project/issues/new>`_ and using the ``infrastructure`` label.
-Applicants must include a justification for why the role is being requested. Applications are reviewed on a
-case-by-case basis by the LLVM admins and the role can be revoked at any point as the LLVM admins see fit.
+Any individual who is responsible for setting up and/or maintaining CI
+infrastructure for a LLVM project can request to be granted the CI/CD role by
+the LLVM infrastructure area team. The request can be made by creating `a
+Github issue <https://github.com/llvm/llvm-project/issues/new>`_ and using the
+``infrastructure`` label. Applicants must include a justification for why the
+role is being requested. Applications are reviewed on a case-by-case basis by
+the LLVM infrastructure area team and the role can be revoked at any point as
+the area team sees fit.
diff --git a/llvm/include/llvm/ADT/TypeSwitch.h b/llvm/include/llvm/ADT/TypeSwitch.h
index 5bbbdf2..5657303 100644
--- a/llvm/include/llvm/ADT/TypeSwitch.h
+++ b/llvm/include/llvm/ADT/TypeSwitch.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include <optional>
namespace llvm {
@@ -117,11 +118,16 @@ public:
return defaultResult;
}
- [[nodiscard]] operator ResultT() {
- assert(result && "Fell off the end of a type-switch");
- return std::move(*result);
+ /// Declare default as unreachable, making sure that all cases were handled.
+ [[nodiscard]] ResultT DefaultUnreachable(
+ const char *message = "Fell off the end of a type-switch") {
+ if (result)
+ return std::move(*result);
+ llvm_unreachable(message);
}
+ [[nodiscard]] operator ResultT() { return DefaultUnreachable(); }
+
private:
/// The pointer to the result of this switch statement, once known,
/// null before that.
@@ -158,6 +164,13 @@ public:
defaultFn(this->value);
}
+ /// Declare default as unreachable, making sure that all cases were handled.
+ void DefaultUnreachable(
+ const char *message = "Fell off the end of a type-switch") {
+ if (!foundMatch)
+ llvm_unreachable(message);
+ }
+
private:
/// A flag detailing if we have already found a match.
bool foundMatch = false;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 7a4abe9..5d3b233 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1551,12 +1551,6 @@ public:
OperandValueInfo OpdInfo = {OK_AnyValue, OP_None},
const Instruction *I = nullptr) const;
- /// \return The cost of VP Load and Store instructions.
- LLVM_ABI InstructionCost getVPMemoryOpCost(
- unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
- const Instruction *I = nullptr) const;
-
/// \return The cost of masked Load and Store instructions.
LLVM_ABI InstructionCost getMaskedMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 566e1cf..4cd607c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -841,14 +841,6 @@ public:
return 1;
}
- virtual InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src,
- Align Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I) const {
- return 1;
- }
-
virtual InstructionCost
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index 6c12cd3..7a2e9ad 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -464,10 +464,10 @@ public:
const char *getData64bitsDirective() const { return Data64bitsDirective; }
bool supportsSignedData() const { return SupportsSignedData; }
- /// Targets can implement this method to specify a section to switch to if the
- /// translation unit doesn't have any trampolines that require an executable
- /// stack.
- virtual MCSection *getNonexecutableStackSection(MCContext &Ctx) const {
+ /// Targets can implement this method to specify a section to switch to
+ /// depending on whether the translation unit has any trampolines that require
+ /// an executable stack.
+ virtual MCSection *getStackSection(MCContext &Ctx, bool Exec) const {
return nullptr;
}
diff --git a/llvm/include/llvm/MC/MCAsmInfoELF.h b/llvm/include/llvm/MC/MCAsmInfoELF.h
index e067888..095ee4d 100644
--- a/llvm/include/llvm/MC/MCAsmInfoELF.h
+++ b/llvm/include/llvm/MC/MCAsmInfoELF.h
@@ -15,7 +15,7 @@ namespace llvm {
class MCAsmInfoELF : public MCAsmInfo {
virtual void anchor();
- MCSection *getNonexecutableStackSection(MCContext &Ctx) const override;
+ MCSection *getStackSection(MCContext &Ctx, bool Exec) const override;
void printSwitchToSection(const MCSection &, uint32_t, const Triple &,
raw_ostream &) const final;
bool useCodeAlign(const MCSection &Sec) const final;
diff --git a/llvm/include/llvm/MC/TargetRegistry.h b/llvm/include/llvm/MC/TargetRegistry.h
index 019ee60..570d4c0 100644
--- a/llvm/include/llvm/MC/TargetRegistry.h
+++ b/llvm/include/llvm/MC/TargetRegistry.h
@@ -389,6 +389,7 @@ public:
/// @name Feature Constructors
/// @{
+ // TODO(boomanaiden154): Remove this function after LLVM 22 branches.
[[deprecated("Use overload accepting Triple instead")]]
MCAsmInfo *createMCAsmInfo(const MCRegisterInfo &MRI, StringRef TheTriple,
const MCTargetOptions &Options) const {
@@ -440,6 +441,7 @@ public:
return MCInstrAnalysisCtorFn(Info);
}
+ // TODO(boomanaiden154): Remove this function after LLVM 22 branches.
[[deprecated("Use overload accepting Triple instead")]]
MCRegisterInfo *createMCRegInfo(StringRef TT) const {
if (!MCRegInfoCtorFn)
@@ -454,6 +456,7 @@ public:
return MCRegInfoCtorFn(TT);
}
+ // TODO(boomanaiden154): Remove this function after LLVM 22 branches.
[[deprecated("Use overload accepting Triple instead")]]
MCSubtargetInfo *createMCSubtargetInfo(StringRef TheTriple, StringRef CPU,
StringRef Features) const {
@@ -496,16 +499,6 @@ public:
JIT);
}
- [[deprecated("Use overload accepting Triple instead")]]
- TargetMachine *createTargetMachine(
- StringRef TT, StringRef CPU, StringRef Features,
- const TargetOptions &Options, std::optional<Reloc::Model> RM,
- std::optional<CodeModel::Model> CM = std::nullopt,
- CodeGenOptLevel OL = CodeGenOptLevel::Default, bool JIT = false) const {
- return createTargetMachine(Triple(TT), CPU, Features, Options, RM, CM, OL,
- JIT);
- }
-
/// createMCAsmBackend - Create a target specific assembly parser.
MCAsmBackend *createMCAsmBackend(const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
@@ -599,6 +592,7 @@ public:
return nullptr;
}
+ // TODO(boomanaiden154): Remove this function after LLVM 22 branches.
[[deprecated("Use overload accepting Triple instead")]]
MCRelocationInfo *createMCRelocationInfo(StringRef TT, MCContext &Ctx) const {
return createMCRelocationInfo(Triple(TT), Ctx);
@@ -616,6 +610,7 @@ public:
return Fn(TT, Ctx);
}
+ // TODO(boomanaiden154): Remove this function after LLVM 22 branches.
[[deprecated("Use overload accepting Triple instead")]]
MCSymbolizer *
createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 07f4a8e..0d978d4 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -4164,6 +4164,10 @@ static Value *simplifyFCmpInst(CmpPredicate Pred, Value *LHS, Value *RHS,
return ConstantInt::get(RetTy, Pred == CmpInst::FCMP_UNO);
}
+ if (std::optional<bool> Res =
+ isImpliedByDomCondition(Pred, LHS, RHS, Q.CxtI, Q.DL))
+ return ConstantInt::getBool(RetTy, *Res);
+
const APFloat *C = nullptr;
match(RHS, m_APFloatAllowPoison(C));
std::optional<KnownFPClass> FullKnownClassLHS;
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 1eda7a7..a42c061 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -39,6 +39,7 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantFPRange.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -9474,6 +9475,69 @@ isImpliedCondICmps(CmpPredicate LPred, const Value *L0, const Value *L1,
return std::nullopt;
}
+/// Return true if LHS implies RHS (expanded to its components as "R0 RPred R1")
+/// is true. Return false if LHS implies RHS is false. Otherwise, return
+/// std::nullopt if we can't infer anything.
+static std::optional<bool>
+isImpliedCondFCmps(FCmpInst::Predicate LPred, const Value *L0, const Value *L1,
+ FCmpInst::Predicate RPred, const Value *R0, const Value *R1,
+ const DataLayout &DL, bool LHSIsTrue) {
+ // The rest of the logic assumes the LHS condition is true. If that's not the
+ // case, invert the predicate to make it so.
+ if (!LHSIsTrue)
+ LPred = FCmpInst::getInversePredicate(LPred);
+
+ // We can have non-canonical operands, so try to normalize any common operand
+ // to L0/R0.
+ if (L0 == R1) {
+ std::swap(R0, R1);
+ RPred = FCmpInst::getSwappedPredicate(RPred);
+ }
+ if (R0 == L1) {
+ std::swap(L0, L1);
+ LPred = FCmpInst::getSwappedPredicate(LPred);
+ }
+ if (L1 == R1) {
+ // If we have L0 == R0 and L1 == R1, then make L1/R1 the constants.
+ if (L0 != R0 || match(L0, m_ImmConstant())) {
+ std::swap(L0, L1);
+ LPred = ICmpInst::getSwappedCmpPredicate(LPred);
+ std::swap(R0, R1);
+ RPred = ICmpInst::getSwappedCmpPredicate(RPred);
+ }
+ }
+
+ // Can we infer anything when the two compares have matching operands?
+ if (L0 == R0 && L1 == R1) {
+ if ((LPred & RPred) == LPred)
+ return true;
+ if ((LPred & ~RPred) == LPred)
+ return false;
+ }
+
+ // See if we can infer anything if operand-0 matches and we have at least one
+ // constant.
+ const APFloat *L1C, *R1C;
+ if (L0 == R0 && match(L1, m_APFloat(L1C)) && match(R1, m_APFloat(R1C))) {
+ if (std::optional<ConstantFPRange> DomCR =
+ ConstantFPRange::makeExactFCmpRegion(LPred, *L1C)) {
+ if (std::optional<ConstantFPRange> ImpliedCR =
+ ConstantFPRange::makeExactFCmpRegion(RPred, *R1C)) {
+ if (ImpliedCR->contains(*DomCR))
+ return true;
+ }
+ if (std::optional<ConstantFPRange> ImpliedCR =
+ ConstantFPRange::makeExactFCmpRegion(
+ FCmpInst::getInversePredicate(RPred), *R1C)) {
+ if (ImpliedCR->contains(*DomCR))
+ return false;
+ }
+ }
+ }
+
+ return std::nullopt;
+}
+
/// Return true if LHS implies RHS is true. Return false if LHS implies RHS is
/// false. Otherwise, return std::nullopt if we can't infer anything. We
/// expect the RHS to be an icmp and the LHS to be an 'and', 'or', or a 'select'
@@ -9529,15 +9593,24 @@ llvm::isImpliedCondition(const Value *LHS, CmpPredicate RHSPred,
LHSIsTrue = !LHSIsTrue;
// Both LHS and RHS are icmps.
- if (const auto *LHSCmp = dyn_cast<ICmpInst>(LHS))
- return isImpliedCondICmps(LHSCmp->getCmpPredicate(), LHSCmp->getOperand(0),
- LHSCmp->getOperand(1), RHSPred, RHSOp0, RHSOp1,
- DL, LHSIsTrue);
- const Value *V;
- if (match(LHS, m_NUWTrunc(m_Value(V))))
- return isImpliedCondICmps(CmpInst::ICMP_NE, V,
- ConstantInt::get(V->getType(), 0), RHSPred,
- RHSOp0, RHSOp1, DL, LHSIsTrue);
+ if (RHSOp0->getType()->getScalarType()->isIntOrPtrTy()) {
+ if (const auto *LHSCmp = dyn_cast<ICmpInst>(LHS))
+ return isImpliedCondICmps(LHSCmp->getCmpPredicate(),
+ LHSCmp->getOperand(0), LHSCmp->getOperand(1),
+ RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue);
+ const Value *V;
+ if (match(LHS, m_NUWTrunc(m_Value(V))))
+ return isImpliedCondICmps(CmpInst::ICMP_NE, V,
+ ConstantInt::get(V->getType(), 0), RHSPred,
+ RHSOp0, RHSOp1, DL, LHSIsTrue);
+ } else {
+ assert(RHSOp0->getType()->isFPOrFPVectorTy() &&
+ "Expected floating point type only!");
+ if (const auto *LHSCmp = dyn_cast<FCmpInst>(LHS))
+ return isImpliedCondFCmps(LHSCmp->getPredicate(), LHSCmp->getOperand(0),
+ LHSCmp->getOperand(1), RHSPred, RHSOp0, RHSOp1,
+ DL, LHSIsTrue);
+ }
/// The LHS should be an 'or', 'and', or a 'select' instruction. We expect
/// the RHS to be an icmp.
@@ -9574,6 +9647,13 @@ std::optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
return InvertRHS ? !*Implied : *Implied;
return std::nullopt;
}
+ if (const FCmpInst *RHSCmp = dyn_cast<FCmpInst>(RHS)) {
+ if (auto Implied = isImpliedCondition(
+ LHS, RHSCmp->getPredicate(), RHSCmp->getOperand(0),
+ RHSCmp->getOperand(1), DL, LHSIsTrue, Depth))
+ return InvertRHS ? !*Implied : *Implied;
+ return std::nullopt;
+ }
const Value *V;
if (match(RHS, m_NUWTrunc(m_Value(V)))) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 11efe49..10df9c1 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2866,9 +2866,11 @@ bool AsmPrinter::doFinalization(Module &M) {
// If we don't have any trampolines, then we don't require stack memory
// to be executable. Some targets have a directive to declare this.
Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
- if (!InitTrampolineIntrinsic || InitTrampolineIntrinsic->use_empty())
- if (MCSection *S = MAI->getNonexecutableStackSection(OutContext))
- OutStreamer->switchSection(S);
+ bool HasTrampolineUses =
+ InitTrampolineIntrinsic && !InitTrampolineIntrinsic->use_empty();
+ MCSection *S = MAI->getStackSection(OutContext, /*Exec=*/HasTrampolineUses);
+ if (S)
+ OutStreamer->switchSection(S);
if (TM.Options.EmitAddrsig) {
// Emit address-significance attributes for all globals.
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 2f3b7a2..3c41bbe 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1657,12 +1657,6 @@ bool TargetInstrInfo::isReMaterializableImpl(
// same virtual register, though.
if (MO.isDef() && Reg != DefReg)
return false;
-
- // Don't allow any virtual-register uses. Rematting an instruction with
- // virtual register uses would length the live ranges of the uses, which
- // is not necessarily a good idea, certainly not "trivial".
- if (MO.isUse())
- return false;
}
// Everything checked out.
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 8b5965b..df0c85b 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -2994,6 +2994,8 @@ LLVMValueRef LLVMIsATerminatorInst(LLVMValueRef Inst) {
LLVMDbgRecordRef LLVMGetFirstDbgRecord(LLVMValueRef Inst) {
Instruction *Instr = unwrap<Instruction>(Inst);
+ if (!Instr->DebugMarker)
+ return nullptr;
auto I = Instr->DebugMarker->StoredDbgRecords.begin();
if (I == Instr->DebugMarker->StoredDbgRecords.end())
return nullptr;
@@ -3002,6 +3004,8 @@ LLVMDbgRecordRef LLVMGetFirstDbgRecord(LLVMValueRef Inst) {
LLVMDbgRecordRef LLVMGetLastDbgRecord(LLVMValueRef Inst) {
Instruction *Instr = unwrap<Instruction>(Inst);
+ if (!Instr->DebugMarker)
+ return nullptr;
auto I = Instr->DebugMarker->StoredDbgRecords.rbegin();
if (I == Instr->DebugMarker->StoredDbgRecords.rend())
return nullptr;
diff --git a/llvm/lib/MC/MCAsmInfoELF.cpp b/llvm/lib/MC/MCAsmInfoELF.cpp
index cdae9d7..98090d3 100644
--- a/llvm/lib/MC/MCAsmInfoELF.cpp
+++ b/llvm/lib/MC/MCAsmInfoELF.cpp
@@ -27,12 +27,13 @@ using namespace llvm;
void MCAsmInfoELF::anchor() {}
-MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const {
+MCSection *MCAsmInfoELF::getStackSection(MCContext &Ctx, bool Exec) const {
// Solaris doesn't know/doesn't care about .note.GNU-stack sections, so
// don't emit them.
if (Ctx.getTargetTriple().isOSSolaris())
return nullptr;
- return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0);
+ return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS,
+ Exec ? ELF::SHF_EXECINSTR : 0U);
}
bool MCAsmInfoELF::useCodeAlign(const MCSection &Sec) const {
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index 2881d7c..1bc1b92 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -54,7 +54,7 @@ void MCELFStreamer::initSections(bool NoExecStack, const MCSubtargetInfo &STI) {
&STI);
if (NoExecStack)
- switchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx));
+ switchSection(Ctx.getAsmInfo()->getStackSection(Ctx, /*Exec=*/false));
}
void MCELFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) {
diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td
index 54e27a6..f4326de 100644
--- a/llvm/lib/Target/ARM/ARMInstrCDE.td
+++ b/llvm/lib/Target/ARM/ARMInstrCDE.td
@@ -268,6 +268,7 @@ class CDE_Vec_Instr<bit acc, dag oops, dag iops, string asm, string cstr,
!con(iops, (ins vpred:$vp)), asm,
!strconcat(cstr, vpred.vpred_constraint)>,
CDE_RequiresQReg {
+ bits<0> vp;
}
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 9dffd94..e244134 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -409,6 +409,7 @@ class MVE_p<dag oops, dag iops, InstrItinClass itin, string iname,
!strconcat(iname, "${vp}",
!if(!eq(suffix, ""), "", !strconcat(".", suffix))),
ops, !strconcat(cstr, vpred.vpred_constraint), vecsize, pattern> {
+ bits<0> vp;
let Inst{31-29} = 0b111;
let Inst{27-26} = 0b11;
}
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index d358913..e67db8e 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -639,6 +639,43 @@ static DecodeStatus DecodeCCOutOperand(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeVpredNOperand(MCInst &Inst,
+ const MCDisassembler *Decoder) {
+ const auto *D = static_cast<const ARMDisassembler *>(Decoder);
+ unsigned VCC = D->VPTBlock.getVPTPred();
+ MCRegister CondReg = VCC == ARMVCC::None ? ARM::NoRegister : ARM::P0;
+
+ Inst.addOperand(MCOperand::createImm(VCC)); // $cond
+ Inst.addOperand(MCOperand::createReg(CondReg)); // $cond_reg
+ Inst.addOperand(MCOperand::createReg(ARM::NoRegister)); // $tp_reg
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeVpredROperand(MCInst &Inst,
+ const MCDisassembler *Decoder) {
+ const auto *D = static_cast<const ARMDisassembler *>(Decoder);
+ unsigned VCC = D->VPTBlock.getVPTPred();
+ MCRegister CondReg = VCC == ARMVCC::None ? ARM::NoRegister : ARM::P0;
+
+ Inst.addOperand(MCOperand::createImm(VCC)); // $cond
+ Inst.addOperand(MCOperand::createReg(CondReg)); // $cond_reg
+ Inst.addOperand(MCOperand::createReg(ARM::NoRegister)); // $tp_reg
+
+ // The last sub-operand ($inactive) is tied to an output operand.
+ // The output operand has already been decoded, so just copy it.
+ const MCInstrDesc &MCID = D->MCII->get(Inst.getOpcode());
+ unsigned InactiveOpIdx = Inst.getNumOperands();
+ int TiedOpIdx = MCID.getOperandConstraint(InactiveOpIdx, MCOI::TIED_TO);
+ assert(TiedOpIdx >= 0 &&
+ "Inactive register in vpred_r is not tied to an output!");
+
+ // Make a copy of the operand to ensure it is not invalidated when MI grows.
+ Inst.addOperand(MCOperand(Inst.getOperand(TiedOpIdx))); // $inactive
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -2777,6 +2814,7 @@ static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst, unsigned Insn,
Inst.addOperand(MCOperand::createImm(imm));
+ Check(S, DecodeVpredROperand(Inst, Decoder));
return S;
}
@@ -2802,6 +2840,7 @@ static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn,
if (!fieldFromInstruction(Insn, 12, 1)) // I bit clear => need input FPSCR
Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV));
+ Check(S, DecodeVpredROperand(Inst, Decoder));
return S;
}
@@ -5466,30 +5505,6 @@ static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val,
return S;
}
-static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- // The vpred_r operand type includes an MQPR register field derived
- // from the encoding. But we don't actually want to add an operand
- // to the MCInst at this stage, because AddThumbPredicate will do it
- // later, and will infer the register number from the TIED_TO
- // constraint. So this is a deliberately empty decoder method that
- // will inhibit the auto-generated disassembly code from adding an
- // operand at all.
- return MCDisassembler::Success;
-}
-
-[[maybe_unused]] static DecodeStatus
-DecodeVpredNOperand(MCInst &Inst, unsigned RegNo, uint64_t Address,
- const MCDisassembler *Decoder) {
- // Similar to above, we want to ensure that no operands are added for the
- // vpred operands. (This is marked "maybe_unused" for the moment; because
- // DecoderEmitter currently (wrongly) omits operands with no instruction bits,
- // the decoder doesn't actually call it yet. That will be addressed in a
- // future change.)
- return MCDisassembler::Success;
-}
-
static DecodeStatus
DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address,
const MCDisassembler *Decoder) {
@@ -5668,6 +5683,7 @@ DecodeMVE_MEM_pre(MCInst &Inst, unsigned Val, uint64_t Address,
if (!Check(S, AddrDecoder(Inst, addr, Address, Decoder)))
return MCDisassembler::Fail;
+ Check(S, DecodeVpredNOperand(Inst, Decoder));
return S;
}
@@ -5871,7 +5887,7 @@ static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn,
return MCDisassembler::Fail;
if (!Check(S, DecodeVCVTImmOperand(Inst, imm6, Address, Decoder)))
return MCDisassembler::Fail;
-
+ Check(S, DecodeVpredROperand(Inst, Decoder));
return S;
}
@@ -5906,6 +5922,7 @@ static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address,
if (!Check(S, predicate_decoder(Inst, fc, Address, Decoder)))
return MCDisassembler::Fail;
+ Check(S, DecodeVpredNOperand(Inst, Decoder));
return S;
}
@@ -5916,6 +5933,7 @@ static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address,
unsigned Rn = fieldFromInstruction(Insn, 16, 4);
if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
return MCDisassembler::Fail;
+ Check(S, DecodeVpredNOperand(Inst, Decoder));
return S;
}
@@ -5925,6 +5943,7 @@ static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn,
DecodeStatus S = MCDisassembler::Success;
Inst.addOperand(MCOperand::createReg(ARM::VPR));
Inst.addOperand(MCOperand::createReg(ARM::VPR));
+ Check(S, DecodeVpredNOperand(Inst, Decoder));
return S;
}
@@ -6199,15 +6218,13 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const {
(isVectorPredicable(MI) && ITBlock.instrInITBlock()))
S = SoftFail;
- // If we're in an IT/VPT block, base the predicate on that. Otherwise,
+ // If we're in an IT block, base the predicate on that. Otherwise,
// assume a predicate of AL.
unsigned CC = ARMCC::AL;
- unsigned VCC = ARMVCC::None;
if (ITBlock.instrInITBlock()) {
CC = ITBlock.getITCC();
ITBlock.advanceITState();
} else if (VPTBlock.instrInVPTBlock()) {
- VCC = VPTBlock.getVPTPred();
VPTBlock.advanceVPTState();
}
@@ -6230,34 +6247,6 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const {
Check(S, SoftFail);
}
- MCInst::iterator VCCI = MI.begin();
- unsigned VCCPos;
- for (VCCPos = 0; VCCPos < MCID.NumOperands; ++VCCPos, ++VCCI) {
- if (ARM::isVpred(MCID.operands()[VCCPos].OperandType) || VCCI == MI.end())
- break;
- }
-
- if (isVectorPredicable(MI)) {
- VCCI = MI.insert(VCCI, MCOperand::createImm(VCC));
- ++VCCI;
- if (VCC == ARMVCC::None)
- VCCI = MI.insert(VCCI, MCOperand::createReg(0));
- else
- VCCI = MI.insert(VCCI, MCOperand::createReg(ARM::P0));
- ++VCCI;
- VCCI = MI.insert(VCCI, MCOperand::createReg(0));
- ++VCCI;
- if (MCID.operands()[VCCPos].OperandType == ARM::OPERAND_VPRED_R) {
- int TiedOp = MCID.getOperandConstraint(VCCPos + 3, MCOI::TIED_TO);
- assert(TiedOp >= 0 &&
- "Inactive register in vpred_r is not tied to an output!");
- // Copy the operand to ensure it's not invalidated when MI grows.
- MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp)));
- }
- } else if (VCC != ARMVCC::None) {
- Check(S, SoftFail);
- }
-
return S;
}
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index dfd896f..8d8066a 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -49,7 +49,7 @@ public:
DwarfUsesRelocationsAcrossSections = enable;
}
- MCSection *getNonexecutableStackSection(MCContext &Ctx) const override {
+ MCSection *getStackSection(MCContext &Ctx, bool Exec) const override {
return nullptr;
}
};
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 3df448d..8f60e50 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/OverflowInstAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -42,6 +43,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <cassert>
+#include <optional>
#include <utility>
#define DEBUG_TYPE "instcombine"
@@ -1451,10 +1453,16 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
return nullptr;
};
- if (Instruction *R = ReplaceOldOpWithNewOp(CmpLHS, CmpRHS))
- return R;
- if (Instruction *R = ReplaceOldOpWithNewOp(CmpRHS, CmpLHS))
- return R;
+ bool CanReplaceCmpLHSWithRHS = canReplacePointersIfEqual(CmpLHS, CmpRHS, DL);
+ if (CanReplaceCmpLHSWithRHS) {
+ if (Instruction *R = ReplaceOldOpWithNewOp(CmpLHS, CmpRHS))
+ return R;
+ }
+ bool CanReplaceCmpRHSWithLHS = canReplacePointersIfEqual(CmpRHS, CmpLHS, DL);
+ if (CanReplaceCmpRHSWithLHS) {
+ if (Instruction *R = ReplaceOldOpWithNewOp(CmpRHS, CmpLHS))
+ return R;
+ }
auto *FalseInst = dyn_cast<Instruction>(FalseVal);
if (!FalseInst)
@@ -1469,12 +1477,14 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
// Example:
// (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1
SmallVector<Instruction *> DropFlags;
- if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, SQ,
- /* AllowRefinement */ false,
- &DropFlags) == TrueVal ||
- simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, SQ,
- /* AllowRefinement */ false,
- &DropFlags) == TrueVal) {
+ if ((CanReplaceCmpLHSWithRHS &&
+ simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, SQ,
+ /* AllowRefinement */ false,
+ &DropFlags) == TrueVal) ||
+ (CanReplaceCmpRHSWithLHS &&
+ simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, SQ,
+ /* AllowRefinement */ false,
+ &DropFlags) == TrueVal)) {
for (Instruction *I : DropFlags) {
I->dropPoisonGeneratingAnnotations();
Worklist.add(I);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
index ed68723..41f7ab8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
@@ -1219,14 +1219,14 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
;
; GISEL-LABEL: test_shl_i1024:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: sub sp, sp, #416
-; GISEL-NEXT: stp x28, x27, [sp, #320] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x26, x25, [sp, #336] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x24, x23, [sp, #352] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x22, x21, [sp, #368] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x20, x19, [sp, #384] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x29, x30, [sp, #400] ; 16-byte Folded Spill
-; GISEL-NEXT: .cfi_def_cfa_offset 416
+; GISEL-NEXT: sub sp, sp, #432
+; GISEL-NEXT: stp x28, x27, [sp, #336] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x26, x25, [sp, #352] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x24, x23, [sp, #368] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x22, x21, [sp, #384] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x20, x19, [sp, #400] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x29, x30, [sp, #416] ; 16-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 432
; GISEL-NEXT: .cfi_offset w30, -8
; GISEL-NEXT: .cfi_offset w29, -16
; GISEL-NEXT: .cfi_offset w19, -24
@@ -1242,38 +1242,44 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: ldp x10, x11, [x1]
; GISEL-NEXT: mov w8, w2
; GISEL-NEXT: lsr x9, x8, #6
-; GISEL-NEXT: and x16, x8, #0x3f
+; GISEL-NEXT: and x12, x8, #0x3f
+; GISEL-NEXT: str x0, [sp, #144] ; 8-byte Folded Spill
+; GISEL-NEXT: and x14, x8, #0x3f
; GISEL-NEXT: mov w13, #64 ; =0x40
-; GISEL-NEXT: sub x21, x13, x16
-; GISEL-NEXT: str x0, [sp, #112] ; 8-byte Folded Spill
-; GISEL-NEXT: mov x24, x16
-; GISEL-NEXT: lsl x25, x10, x16
+; GISEL-NEXT: and x16, x8, #0x3f
+; GISEL-NEXT: lsl x0, x10, x12
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: lsr x26, x10, x21
-; GISEL-NEXT: lsl x2, x11, x16
-; GISEL-NEXT: lsr x23, x11, x21
-; GISEL-NEXT: mov x22, x21
-; GISEL-NEXT: csel x12, x25, xzr, eq
+; GISEL-NEXT: sub x2, x13, x14
+; GISEL-NEXT: lsr x3, x10, x2
+; GISEL-NEXT: lsl x6, x11, x14
+; GISEL-NEXT: and x14, x8, #0x3f
+; GISEL-NEXT: csel x12, x0, xzr, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x1, [sp, #312] ; 8-byte Folded Spill
+; GISEL-NEXT: lsr x20, x11, x2
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: str x23, [sp, #208] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x24, x0
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: stp x24, x22, [sp, #40] ; 16-byte Folded Spill
+; GISEL-NEXT: mov x7, x3
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #4
+; GISEL-NEXT: mov x28, x1
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #5
+; GISEL-NEXT: and x21, x8, #0x3f
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #6
+; GISEL-NEXT: str x6, [sp, #24] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #7
+; GISEL-NEXT: str x28, [sp, #304] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #8
+; GISEL-NEXT: str x7, [sp, #272] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #9
+; GISEL-NEXT: str x20, [sp, #112] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #10
; GISEL-NEXT: csel x12, xzr, x12, eq
@@ -1290,13 +1296,13 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x10, x10, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #192] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x10, xzr, x26, eq
+; GISEL-NEXT: str x10, [sp, #232] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x10, xzr, x3, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x10, x2, x10
+; GISEL-NEXT: orr x10, x6, x10
; GISEL-NEXT: csel x10, x10, xzr, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: csel x10, x25, x10, eq
+; GISEL-NEXT: csel x10, x0, x10, eq
; GISEL-NEXT: cmp x9, #2
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #3
@@ -1327,25 +1333,24 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x13, xzr, x13, eq
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: lsl x20, x12, x16
+; GISEL-NEXT: lsl x26, x12, x14
; GISEL-NEXT: csel x11, x11, x13, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x11, xzr, x23, eq
+; GISEL-NEXT: str x11, [sp, #224] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x11, xzr, x20, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x20, x11
-; GISEL-NEXT: lsr x15, x12, x21
-; GISEL-NEXT: lsl x14, x10, x16
+; GISEL-NEXT: orr x11, x26, x11
+; GISEL-NEXT: lsr x15, x12, x2
+; GISEL-NEXT: lsl x30, x10, x16
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsr x17, x10, x21
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: lsr x17, x10, x2
+; GISEL-NEXT: csel x13, xzr, x3, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x20, [sp, #8] ; 8-byte Folded Spill
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x0, x11, eq
; GISEL-NEXT: cmp x9, #3
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #4
@@ -1375,23 +1380,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #176] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #216] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, xzr, x15, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x14, x11
+; GISEL-NEXT: orr x11, x30, x11
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x12, x20, x12
+; GISEL-NEXT: orr x12, x26, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x3, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: orr x12, x6, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x0, x11, eq
; GISEL-NEXT: cmp x9, #4
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #5
@@ -1421,33 +1426,33 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: lsl x0, x12, x16
; GISEL-NEXT: csel x10, x10, x13, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #168] ; 8-byte Folded Spill
+; GISEL-NEXT: str x10, [sp, #208] ; 8-byte Folded Spill
; GISEL-NEXT: csel x10, xzr, x17, eq
; GISEL-NEXT: cmp x9, #0
; GISEL-NEXT: orr x10, x0, x10
-; GISEL-NEXT: lsr x27, x12, x21
+; GISEL-NEXT: lsr x4, x12, x2
; GISEL-NEXT: lsl x19, x11, x16
; GISEL-NEXT: csel x10, x10, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsr x3, x11, x21
+; GISEL-NEXT: mov x16, x15
; GISEL-NEXT: csel x13, xzr, x15, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: stp x27, x0, [sp, #240] ; 16-byte Folded Spill
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: mov x7, x3
+; GISEL-NEXT: str x4, [sp, #248] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: str x0, [sp, #48] ; 8-byte Folded Spill
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x3, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: csel x10, x25, x10, eq
+; GISEL-NEXT: csel x10, x24, x10, eq
; GISEL-NEXT: cmp x9, #5
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #6
@@ -1473,8 +1478,8 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x10, x12, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #160] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x10, xzr, x27, eq
+; GISEL-NEXT: str x10, [sp, #200] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x10, xzr, x4, eq
; GISEL-NEXT: cmp x9, #0
; GISEL-NEXT: orr x10, x19, x10
; GISEL-NEXT: csel x10, x10, xzr, eq
@@ -1486,20 +1491,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x12, xzr, x15, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: and x15, x8, #0x3f
+; GISEL-NEXT: orr x12, x30, x12
; GISEL-NEXT: csel x10, x12, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x12, x20, x12
+; GISEL-NEXT: orr x12, x26, x12
; GISEL-NEXT: csel x10, x12, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x3, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: lsr x3, x11, x2
+; GISEL-NEXT: orr x12, x6, x12
; GISEL-NEXT: csel x10, x12, x10, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: csel x10, x25, x10, eq
+; GISEL-NEXT: csel x10, x24, x10, eq
; GISEL-NEXT: cmp x9, #6
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #7
@@ -1522,21 +1529,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x13, xzr, x13, eq
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: lsl x4, x12, x16
+; GISEL-NEXT: lsl x22, x12, x15
; GISEL-NEXT: csel x11, x11, x13, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #192] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, xzr, x3, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x4, x11
-; GISEL-NEXT: lsl x30, x10, x16
-; GISEL-NEXT: lsr x28, x10, x21
+; GISEL-NEXT: orr x11, x22, x11
+; GISEL-NEXT: lsl x5, x10, x15
+; GISEL-NEXT: lsr x27, x10, x2
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x30, [sp, #200] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x25, x27
; GISEL-NEXT: orr x13, x19, x13
+; GISEL-NEXT: mov x14, x5
+; GISEL-NEXT: str x27, [sp, #328] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x13, xzr, x17, eq
@@ -1544,30 +1553,29 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: orr x13, x0, x13
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: orr x13, x30, x13
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x7, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: lsr x13, x12, x21
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: lsr x13, x12, x2
+; GISEL-NEXT: csel x11, x24, x11, eq
; GISEL-NEXT: cmp x9, #7
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: mov x6, x13
+; GISEL-NEXT: mov x15, x13
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: str x6, [sp, #256] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #10
; GISEL-NEXT: csel x11, xzr, x11, eq
@@ -1584,18 +1592,18 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #144] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, xzr, x13, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x30, x11
+; GISEL-NEXT: orr x11, x5, x11
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x12, xzr, x3, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x12, x4, x12
+; GISEL-NEXT: orr x12, x22, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x27, eq
+; GISEL-NEXT: csel x12, xzr, x4, eq
; GISEL-NEXT: cmp x9, #2
; GISEL-NEXT: orr x12, x19, x12
; GISEL-NEXT: csel x11, x12, x11, eq
@@ -1605,22 +1613,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: orr x12, x0, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: orr x12, x30, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x12, x20, x12
+; GISEL-NEXT: orr x12, x26, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x7, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: orr x12, x6, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x24, x11, eq
; GISEL-NEXT: cmp x9, #8
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #9
@@ -1635,39 +1643,34 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #14
; GISEL-NEXT: csel x12, xzr, x11, eq
-; GISEL-NEXT: ldp x11, x5, [x1, #64]
+; GISEL-NEXT: ldp x11, x1, [x1, #64]
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x12, x10, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsl x21, x11, x16
-; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x12, xzr, x28, eq
+; GISEL-NEXT: lsl x23, x11, x21
+; GISEL-NEXT: str x12, [sp, #176] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x27, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: lsr x10, x11, x22
-; GISEL-NEXT: mov x16, x19
+; GISEL-NEXT: orr x12, x23, x12
+; GISEL-NEXT: lsr x21, x11, x2
+; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, x12, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: mov x1, x16
; GISEL-NEXT: csel x13, xzr, x13, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x16, [sp, #304] ; 8-byte Folded Spill
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: orr x13, x5, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x13, xzr, x3, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: lsl x3, x5, x24
-; GISEL-NEXT: orr x13, x4, x13
+; GISEL-NEXT: orr x13, x22, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: stp x21, x3, [sp, #216] ; 16-byte Folded Spill
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
; GISEL-NEXT: cmp x9, #3
; GISEL-NEXT: orr x13, x19, x13
-; GISEL-NEXT: mov x19, x28
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x13, xzr, x17, eq
@@ -1675,27 +1678,30 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: orr x13, x0, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: orr x13, x30, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x7, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: csel x12, x25, x12, eq
+; GISEL-NEXT: and x13, x8, #0x3f
+; GISEL-NEXT: csel x12, x24, x12, eq
; GISEL-NEXT: cmp x9, #9
+; GISEL-NEXT: lsl x10, x1, x13
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #10
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #11
+; GISEL-NEXT: stp x10, x15, [sp, #312] ; 16-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #12
; GISEL-NEXT: csel x12, xzr, x12, eq
@@ -1708,69 +1714,69 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x11, x11, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #128] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x11, xzr, x10, eq
+; GISEL-NEXT: str x11, [sp, #168] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x11, xzr, x21, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x10, x11
+; GISEL-NEXT: mov x10, x23
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x28, eq
+; GISEL-NEXT: csel x12, xzr, x27, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: mov x28, x4
-; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: str x28, [sp, #32] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x27, x24
+; GISEL-NEXT: orr x12, x23, x12
+; GISEL-NEXT: mov x23, x15
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x6, eq
+; GISEL-NEXT: csel x12, xzr, x15, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x12, x30, x12
+; GISEL-NEXT: mov x15, x22
+; GISEL-NEXT: orr x12, x5, x12
+; GISEL-NEXT: mov x5, x3
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x7, eq
+; GISEL-NEXT: stp x14, x5, [sp, #256] ; 16-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x3, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x12, x4, x12
-; GISEL-NEXT: mov x4, x20
+; GISEL-NEXT: mov x5, x4
+; GISEL-NEXT: orr x12, x22, x12
+; GISEL-NEXT: lsr x22, x1, x2
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x27, eq
+; GISEL-NEXT: csel x12, xzr, x4, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x27, x2
-; GISEL-NEXT: orr x12, x16, x12
-; GISEL-NEXT: mov x16, x17
+; GISEL-NEXT: str x22, [sp, #240] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x19, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x12, xzr, x17, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: mov x17, x15
; GISEL-NEXT: orr x12, x0, x12
-; GISEL-NEXT: lsr x0, x5, x22
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x12, x14, x12
-; GISEL-NEXT: str x0, [sp, #280] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x30, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: mov x23, x25
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x26, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x7, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x12, x2, x12
-; GISEL-NEXT: mov x2, x3
+; GISEL-NEXT: mov x7, x14
+; GISEL-NEXT: orr x12, x6, x12
+; GISEL-NEXT: mov x6, x28
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x24, x11, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: mov x25, x26
+; GISEL-NEXT: ldr x24, [x6, #88]
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #11
+; GISEL-NEXT: ldr x6, [sp, #272] ; 8-byte Folded Reload
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #12
; GISEL-NEXT: csel x11, xzr, x11, eq
@@ -1780,80 +1786,84 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x12, xzr, x11, eq
+; GISEL-NEXT: ldr x11, [x28, #80]
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x12, x5, x12, eq
-; GISEL-NEXT: ldp x11, x5, [x15, #80]
+; GISEL-NEXT: csel x12, x1, x12, eq
+; GISEL-NEXT: mov x28, x2
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x12, [sp, #120] ; 8-byte Folded Spill
-; GISEL-NEXT: mov x15, x7
-; GISEL-NEXT: csel x12, xzr, x0, eq
+; GISEL-NEXT: lsl x2, x11, x13
+; GISEL-NEXT: str x12, [sp, #160] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x22, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: str x15, [sp, #24] ; 8-byte Folded Spill
-; GISEL-NEXT: lsl x20, x11, x24
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: str x20, [sp, #232] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: str x28, [sp, #16] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: str x2, [sp, #280] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, x12, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x10, eq
+; GISEL-NEXT: csel x13, xzr, x21, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x13, x3, x13
-; GISEL-NEXT: lsl x3, x5, x24
+; GISEL-NEXT: orr x13, x1, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x19, eq
+; GISEL-NEXT: csel x13, xzr, x25, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: stp x19, x3, [sp, #264] ; 16-byte Folded Spill
-; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: mov x25, x16
+; GISEL-NEXT: orr x13, x10, x13
+; GISEL-NEXT: mov x10, x30
+; GISEL-NEXT: str x25, [sp, #80] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x6, eq
+; GISEL-NEXT: csel x13, xzr, x23, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: mov x23, x3
+; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: mov x14, x17
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
-; GISEL-NEXT: ldp x7, x30, [sp, #240] ; 16-byte Folded Reload
+; GISEL-NEXT: stp x19, x14, [sp, #64] ; 16-byte Folded Spill
+; GISEL-NEXT: csel x13, xzr, x3, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x13, x28, x13
+; GISEL-NEXT: mov x3, x21
+; GISEL-NEXT: orr x13, x15, x13
+; GISEL-NEXT: str x3, [sp, #32] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x1, x13
-; GISEL-NEXT: mov x1, x14
+; GISEL-NEXT: mov x4, x0
+; GISEL-NEXT: orr x13, x19, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x16, eq
+; GISEL-NEXT: csel x13, xzr, x17, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: mov x17, x27
+; GISEL-NEXT: orr x13, x0, x13
+; GISEL-NEXT: ldr x0, [sp, #24] ; 8-byte Folded Reload
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x17, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: ldr x14, [sp, #208] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: ldp x30, x16, [sp, #320] ; 16-byte Folded Reload
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x14, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x13, x4, x13
-; GISEL-NEXT: mov x4, x10
+; GISEL-NEXT: orr x13, x26, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x6, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: mov x26, x27
-; GISEL-NEXT: orr x13, x27, x13
-; GISEL-NEXT: lsr x27, x11, x22
+; GISEL-NEXT: orr x13, x0, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: mov x13, x23
-; GISEL-NEXT: csel x12, x23, x12, eq
+; GISEL-NEXT: lsr x13, x11, x28
+; GISEL-NEXT: csel x12, x27, x12, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: str x27, [sp, #64] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: mov x23, x20
+; GISEL-NEXT: str x13, [sp, #96] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #13
; GISEL-NEXT: csel x12, xzr, x12, eq
@@ -1864,71 +1874,77 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x11, x11, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #104] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x11, xzr, x27, eq
+; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill
+; GISEL-NEXT: and x11, x8, #0x3f
+; GISEL-NEXT: lsl x27, x24, x11
+; GISEL-NEXT: csel x11, xzr, x13, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x27, x11
+; GISEL-NEXT: str x27, [sp, #56] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x0, eq
+; GISEL-NEXT: csel x12, xzr, x22, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: mov x0, x7
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: mov x20, x16
+; GISEL-NEXT: mov x22, x2
+; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: mov x2, x14
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x10, eq
+; GISEL-NEXT: csel x12, xzr, x21, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: ldr x10, [sp, #312] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x12, x2, x12
-; GISEL-NEXT: ldr x2, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x21, [sp, #288] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x12, x1, x12
+; GISEL-NEXT: mov x1, x27
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x19, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
; GISEL-NEXT: cmp x9, #3
; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: ldr x21, [sp, #200] ; 8-byte Folded Reload
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x6, eq
+; GISEL-NEXT: csel x12, xzr, x30, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x21, x12
+; GISEL-NEXT: orr x12, x7, x12
+; GISEL-NEXT: mov x7, x15
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: str x7, [sp, #40] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x23, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x12, x28, x12
+; GISEL-NEXT: orr x12, x15, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x7, eq
+; GISEL-NEXT: csel x12, xzr, x5, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: mov x7, x17
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: mov x5, x19
+; GISEL-NEXT: orr x12, x19, x12
+; GISEL-NEXT: mov x19, x7
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x16, eq
+; GISEL-NEXT: csel x12, xzr, x14, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x12, x30, x12
+; GISEL-NEXT: lsr x14, x24, x28
+; GISEL-NEXT: orr x12, x4, x12
+; GISEL-NEXT: mov x4, x10
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x17, eq
+; GISEL-NEXT: csel x12, xzr, x25, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: mov x17, x24
-; GISEL-NEXT: orr x12, x1, x12
+; GISEL-NEXT: orr x12, x10, x12
+; GISEL-NEXT: ldr x10, [sp, #304] ; 8-byte Folded Reload
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x14, eq
-; GISEL-NEXT: ldr x14, [sp, #8] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x12, xzr, x20, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: orr x12, x26, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x25, eq
+; GISEL-NEXT: csel x12, xzr, x6, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x12, x26, x12
+; GISEL-NEXT: orr x12, x0, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: csel x11, x13, x11, eq
+; GISEL-NEXT: csel x11, x17, x11, eq
; GISEL-NEXT: cmp x9, #12
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #13
@@ -1937,393 +1953,395 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x12, xzr, x11, eq
+; GISEL-NEXT: ldp x11, x6, [x10, #96]
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: ldp x11, x10, [x10, #96]
-; GISEL-NEXT: csel x12, x5, x12, eq
-; GISEL-NEXT: str x12, [sp, #96] ; 8-byte Folded Spill
-; GISEL-NEXT: mov x12, x22
-; GISEL-NEXT: lsr x22, x5, x22
-; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: mov x5, x27
-; GISEL-NEXT: lsl x24, x11, x24
-; GISEL-NEXT: str x10, [sp, #296] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x10, xzr, x22, eq
+; GISEL-NEXT: and x10, x8, #0x3f
+; GISEL-NEXT: csel x12, x24, x12, eq
+; GISEL-NEXT: tst x8, #0x3f
+; GISEL-NEXT: ldr x24, [sp, #248] ; 8-byte Folded Reload
+; GISEL-NEXT: lsl x15, x11, x10
+; GISEL-NEXT: csel x10, xzr, x14, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: str x22, [sp, #16] ; 8-byte Folded Spill
-; GISEL-NEXT: orr x10, x24, x10
+; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x12, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x10, x15, x10
+; GISEL-NEXT: str x15, [sp, #296] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x15, x13
; GISEL-NEXT: csel x10, x10, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x13, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: ldr x27, [sp, #280] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x13, x3, x13
-; GISEL-NEXT: mov x3, x26
+; GISEL-NEXT: orr x13, x27, x13
+; GISEL-NEXT: ldr x27, [sp, #240] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x13, xzr, x27, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x13, x23, x13
-; GISEL-NEXT: mov x23, x4
+; GISEL-NEXT: orr x13, x22, x13
+; GISEL-NEXT: ldr x22, [sp, #272] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x4, eq
-; GISEL-NEXT: ldp x4, x16, [sp, #216] ; 16-byte Folded Reload
+; GISEL-NEXT: csel x13, xzr, x3, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x16, x13
+; GISEL-NEXT: orr x13, x12, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x19, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x19, x1
-; GISEL-NEXT: orr x13, x4, x13
+; GISEL-NEXT: mov x16, x17
+; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: ldp x23, x21, [sp, #256] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x6, eq
+; GISEL-NEXT: csel x13, xzr, x30, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: mov x6, x14
-; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: mov x30, x0
+; GISEL-NEXT: orr x13, x23, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x21, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x28, x13
+; GISEL-NEXT: orr x13, x7, x13
+; GISEL-NEXT: mov x7, x14
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x0, eq
+; GISEL-NEXT: csel x13, xzr, x24, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: mov x0, x23
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x5, x13
+; GISEL-NEXT: ldr x5, [sp, #48] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x20, eq
+; GISEL-NEXT: csel x13, xzr, x2, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x13, x30, x13
-; GISEL-NEXT: ldr x30, [sp, #208] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x2, [sp, #296] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x5, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
+; GISEL-NEXT: csel x13, xzr, x25, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x13, x1, x13
+; GISEL-NEXT: mov x25, x6
+; GISEL-NEXT: orr x13, x4, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x30, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: ldp x14, x2, [sp, #264] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x13, x26, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x25, eq
+; GISEL-NEXT: csel x13, xzr, x22, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x13, x26, x13
-; GISEL-NEXT: ldr x26, [sp, #288] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x0, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: lsr x13, x11, x12
-; GISEL-NEXT: csel x10, x26, x10, eq
+; GISEL-NEXT: lsr x13, x11, x28
+; GISEL-NEXT: csel x10, x17, x10, eq
; GISEL-NEXT: cmp x9, #13
+; GISEL-NEXT: ldr x17, [sp, #80] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #14
-; GISEL-NEXT: str x13, [sp, #72] ; 8-byte Folded Spill
+; GISEL-NEXT: str x13, [sp, #104] ; 8-byte Folded Spill
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #88] ; 8-byte Folded Spill
-; GISEL-NEXT: ldr x10, [sp, #296] ; 8-byte Folded Reload
-; GISEL-NEXT: lsl x11, x10, x17
+; GISEL-NEXT: str x10, [sp, #128] ; 8-byte Folded Spill
+; GISEL-NEXT: and x10, x8, #0x3f
+; GISEL-NEXT: lsl x11, x6, x10
; GISEL-NEXT: csel x10, xzr, x13, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: ldr x17, [sp, #232] ; 8-byte Folded Reload
-; GISEL-NEXT: ldr x13, [sp, #256] ; 8-byte Folded Reload
+; GISEL-NEXT: ldp x0, x13, [sp, #280] ; 16-byte Folded Reload
+; GISEL-NEXT: mov x6, x16
; GISEL-NEXT: orr x10, x11, x10
-; GISEL-NEXT: str x11, [sp, #56] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #88] ; 8-byte Folded Spill
; GISEL-NEXT: csel x10, x10, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x22, eq
+; GISEL-NEXT: csel x11, xzr, x14, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x11, x24, x11
+; GISEL-NEXT: orr x11, x2, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x5, eq
+; GISEL-NEXT: csel x11, xzr, x15, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x11, x2, x11
-; GISEL-NEXT: ldp x12, x5, [sp, #240] ; 16-byte Folded Reload
+; GISEL-NEXT: mov x15, x3
+; GISEL-NEXT: orr x11, x1, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x11, xzr, x27, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: mov x27, x30
-; GISEL-NEXT: orr x11, x17, x11
+; GISEL-NEXT: orr x11, x0, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x23, eq
+; GISEL-NEXT: csel x11, xzr, x3, eq
+; GISEL-NEXT: ldp x14, x3, [sp, #320] ; 16-byte Folded Reload
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x23, x20
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x12, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x14, eq
+; GISEL-NEXT: csel x11, xzr, x3, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x11, x4, x11
+; GISEL-NEXT: orr x11, x13, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x13, eq
+; GISEL-NEXT: csel x11, xzr, x14, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x11, x21, x11
-; GISEL-NEXT: ldr x21, [sp, #296] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x23, x11
+; GISEL-NEXT: mov x23, x5
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x15, eq
+; GISEL-NEXT: csel x11, xzr, x21, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x11, x28, x11
+; GISEL-NEXT: mov x21, x4
+; GISEL-NEXT: orr x11, x19, x11
+; GISEL-NEXT: ldp x12, x19, [sp, #64] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x12, eq
+; GISEL-NEXT: csel x11, xzr, x24, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x11, x16, x11
+; GISEL-NEXT: orr x11, x12, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x20, eq
+; GISEL-NEXT: csel x11, xzr, x19, eq
; GISEL-NEXT: cmp x9, #9
; GISEL-NEXT: orr x11, x5, x11
+; GISEL-NEXT: mov x5, x30
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x7, eq
+; GISEL-NEXT: csel x11, xzr, x17, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x11, x1, x11
-; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x4, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x30, eq
+; GISEL-NEXT: csel x11, xzr, x20, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x11, x6, x11
+; GISEL-NEXT: orr x11, x26, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x25, eq
+; GISEL-NEXT: csel x11, xzr, x22, eq
; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x30, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: cmp x9, #13
-; GISEL-NEXT: csel x10, x26, x10, eq
+; GISEL-NEXT: csel x10, x16, x10, eq
; GISEL-NEXT: cmp x9, #14
+; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x11, xzr, x10, eq
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x11, x21, x11, eq
-; GISEL-NEXT: ldp x10, x20, [x1, #112]
-; GISEL-NEXT: str x11, [sp, #80] ; 8-byte Folded Spill
-; GISEL-NEXT: ldp x11, x4, [sp, #40] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x10, x4, [x16, #112]
+; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: str x11, [sp, #120] ; 8-byte Folded Spill
+; GISEL-NEXT: lsr x11, x25, x28
+; GISEL-NEXT: and x16, x8, #0x3f
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsr x21, x21, x4
-; GISEL-NEXT: lsl x28, x10, x11
-; GISEL-NEXT: csel x1, xzr, x21, eq
-; GISEL-NEXT: str x21, [sp, #296] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x25, [sp, #88] ; 8-byte Folded Reload
+; GISEL-NEXT: lsl x24, x10, x16
+; GISEL-NEXT: csel x1, xzr, x11, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x1, x28, x1
-; GISEL-NEXT: ldr x21, [sp, #72] ; 8-byte Folded Reload
-; GISEL-NEXT: str x28, [sp, #312] ; 8-byte Folded Spill
+; GISEL-NEXT: ldp x16, x28, [sp, #96] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x1, x24, x1
; GISEL-NEXT: csel x1, x1, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: ldr x28, [sp, #56] ; 8-byte Folded Reload
-; GISEL-NEXT: csel x30, xzr, x21, eq
+; GISEL-NEXT: csel x30, xzr, x28, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x30, x28, x30
+; GISEL-NEXT: orr x30, x25, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x22, eq
+; GISEL-NEXT: csel x30, xzr, x7, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: ldr x22, [sp, #64] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x30, x24, x30
+; GISEL-NEXT: orr x30, x2, x30
+; GISEL-NEXT: ldr x2, [sp, #56] ; 8-byte Folded Reload
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x22, eq
+; GISEL-NEXT: csel x30, xzr, x16, eq
; GISEL-NEXT: cmp x9, #3
; GISEL-NEXT: orr x30, x2, x30
-; GISEL-NEXT: ldr x2, [sp, #280] ; 8-byte Folded Reload
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x2, eq
+; GISEL-NEXT: csel x30, xzr, x27, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x30, x17, x30
-; GISEL-NEXT: ldr x17, [sp, #224] ; 8-byte Folded Reload
+; GISEL-NEXT: mov x27, x13
+; GISEL-NEXT: orr x30, x0, x30
+; GISEL-NEXT: ldr x0, [sp, #248] ; 8-byte Folded Reload
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x0, eq
+; GISEL-NEXT: csel x30, xzr, x15, eq
+; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x30, x17, x30
+; GISEL-NEXT: orr x30, x15, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x14, eq
-; GISEL-NEXT: ldr x14, [sp, #216] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x3, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x30, x14, x30
+; GISEL-NEXT: ldr x3, [sp, #40] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x30, x13, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x13, eq
-; GISEL-NEXT: ldr x13, [sp, #200] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x14, eq
+; GISEL-NEXT: ldp x13, x14, [sp, #256] ; 16-byte Folded Reload
; GISEL-NEXT: cmp x9, #7
; GISEL-NEXT: orr x30, x13, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x15, eq
-; GISEL-NEXT: ldr x15, [sp, #32] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x14, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x30, x15, x30
+; GISEL-NEXT: orr x30, x3, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x12, eq
+; GISEL-NEXT: csel x30, xzr, x0, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x30, x16, x30
+; GISEL-NEXT: orr x30, x12, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x23, eq
+; GISEL-NEXT: csel x30, xzr, x19, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x30, x5, x30
+; GISEL-NEXT: orr x30, x23, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x7, eq
+; GISEL-NEXT: csel x30, xzr, x17, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x30, x19, x30
+; GISEL-NEXT: orr x30, x21, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x27, eq
+; GISEL-NEXT: csel x30, xzr, x20, eq
; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: orr x30, x6, x30
+; GISEL-NEXT: mov x20, x26
+; GISEL-NEXT: orr x30, x26, x30
+; GISEL-NEXT: mov x26, x5
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x25, eq
+; GISEL-NEXT: csel x30, xzr, x22, eq
; GISEL-NEXT: cmp x9, #13
-; GISEL-NEXT: orr x30, x3, x30
+; GISEL-NEXT: orr x30, x5, x30
+; GISEL-NEXT: ldr x5, [sp, #16] ; 8-byte Folded Reload
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: cmp x9, #14
-; GISEL-NEXT: lsr x30, x10, x4
-; GISEL-NEXT: csel x1, x26, x1, eq
+; GISEL-NEXT: csel x1, x6, x1, eq
; GISEL-NEXT: cmp x9, #15
+; GISEL-NEXT: lsr x30, x10, x5
; GISEL-NEXT: csel x1, xzr, x1, eq
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x26, x10, x1, eq
-; GISEL-NEXT: lsl x10, x20, x11
+; GISEL-NEXT: csel x5, x10, x1, eq
+; GISEL-NEXT: and x10, x8, #0x3f
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x16, xzr, x30, eq
+; GISEL-NEXT: lsl x10, x4, x10
+; GISEL-NEXT: csel x1, xzr, x30, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: ldr x11, [sp, #296] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x10, x10, x16
-; GISEL-NEXT: ldr x16, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: ldp x29, x30, [sp, #416] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x10, x10, x1
+; GISEL-NEXT: ldr x1, [sp, #296] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, x10, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #272] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x24, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x21, eq
+; GISEL-NEXT: csel x11, xzr, x28, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x11, x28, x11
-; GISEL-NEXT: ldp x29, x30, [sp, #400] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x11, x25, x11
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x11, eq
+; GISEL-NEXT: csel x11, xzr, x7, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x11, x24, x11
+; GISEL-NEXT: orr x11, x1, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x22, eq
+; GISEL-NEXT: csel x11, xzr, x16, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #232] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x16, [sp, #280] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x2, x11
; GISEL-NEXT: csel x10, x11, x10, eq
+; GISEL-NEXT: ldr x11, [sp, #240] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x2, eq
+; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #5
; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldp x22, x21, [sp, #368] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
+; GISEL-NEXT: ldr x11, [sp, #32] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x0, eq
+; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x11, x17, x11
+; GISEL-NEXT: orr x11, x15, x11
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #264] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #328] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x11, x14, x11
+; GISEL-NEXT: orr x11, x27, x11
+; GISEL-NEXT: ldp x28, x27, [sp, #336] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #256] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #320] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #8
; GISEL-NEXT: orr x11, x13, x11
-; GISEL-NEXT: ldr x13, [sp, #112] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x13, [sp, #144] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #24] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x11, eq
+; GISEL-NEXT: csel x11, xzr, x14, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x11, x15, x11
+; GISEL-NEXT: orr x11, x3, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x12, eq
-; GISEL-NEXT: ldr x12, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x11, xzr, x0, eq
; GISEL-NEXT: cmp x9, #10
; GISEL-NEXT: orr x11, x12, x11
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #192] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #232] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: str x11, [x13]
-; GISEL-NEXT: ldp x12, x11, [sp, #176] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x12, x11, [sp, #216] ; 16-byte Folded Reload
; GISEL-NEXT: stp x11, x12, [x13, #8]
-; GISEL-NEXT: csel x11, xzr, x23, eq
+; GISEL-NEXT: csel x11, xzr, x19, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x11, x5, x11
-; GISEL-NEXT: ldp x24, x23, [sp, #352] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x11, x23, x11
+; GISEL-NEXT: ldp x24, x23, [sp, #368] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #168] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #208] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: str x11, [x13, #24]
-; GISEL-NEXT: ldp x12, x11, [sp, #152] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x12, x11, [sp, #192] ; 16-byte Folded Reload
; GISEL-NEXT: stp x11, x12, [x13, #32]
-; GISEL-NEXT: csel x11, xzr, x7, eq
+; GISEL-NEXT: csel x11, xzr, x17, eq
; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: orr x11, x19, x11
+; GISEL-NEXT: orr x11, x21, x11
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #144] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #184] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: str x11, [x13, #48]
-; GISEL-NEXT: ldp x12, x11, [sp, #128] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x12, x11, [sp, #168] ; 16-byte Folded Reload
; GISEL-NEXT: stp x11, x12, [x13, #56]
-; GISEL-NEXT: csel x11, xzr, x27, eq
+; GISEL-NEXT: ldr x11, [sp, #112] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x12, [sp, #136] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #13
-; GISEL-NEXT: orr x11, x6, x11
-; GISEL-NEXT: ldp x28, x27, [sp, #320] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x11, x20, x11
+; GISEL-NEXT: ldp x20, x19, [sp, #400] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #120] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #160] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: str x11, [x13, #72]
-; GISEL-NEXT: ldp x12, x11, [sp, #96] ; 16-byte Folded Reload
-; GISEL-NEXT: stp x11, x12, [x13, #80]
-; GISEL-NEXT: csel x11, xzr, x25, eq
+; GISEL-NEXT: ldr x11, [sp, #152] ; 8-byte Folded Reload
+; GISEL-NEXT: str x11, [x13, #80]
+; GISEL-NEXT: csel x11, xzr, x22, eq
; GISEL-NEXT: cmp x9, #14
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x26, x11
+; GISEL-NEXT: ldp x22, x21, [sp, #384] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: cmp x9, #15
-; GISEL-NEXT: ldr x9, [sp, #288] ; 8-byte Folded Reload
-; GISEL-NEXT: ldr x11, [sp, #88] ; 8-byte Folded Reload
-; GISEL-NEXT: csel x9, x9, x10, eq
+; GISEL-NEXT: ldr x9, [sp, #128] ; 8-byte Folded Reload
+; GISEL-NEXT: ldp x26, x25, [sp, #352] ; 16-byte Folded Reload
+; GISEL-NEXT: stp x12, x9, [x13, #88]
+; GISEL-NEXT: csel x9, x6, x10, eq
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: ldr x8, [sp, #80] ; 8-byte Folded Reload
-; GISEL-NEXT: stp x11, x8, [x13, #96]
-; GISEL-NEXT: csel x8, x20, x9, eq
-; GISEL-NEXT: stp x26, x8, [x13, #112]
-; GISEL-NEXT: ldp x20, x19, [sp, #384] ; 16-byte Folded Reload
-; GISEL-NEXT: ldp x26, x25, [sp, #336] ; 16-byte Folded Reload
-; GISEL-NEXT: add sp, sp, #416
+; GISEL-NEXT: ldr x8, [sp, #120] ; 8-byte Folded Reload
+; GISEL-NEXT: stp x8, x5, [x13, #104]
+; GISEL-NEXT: csel x8, x4, x9, eq
+; GISEL-NEXT: str x8, [x13, #120]
+; GISEL-NEXT: add sp, sp, #432
; GISEL-NEXT: ret
entry:
%input_val = load i1024, ptr %input, align 128
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 63c08dd..b215c51 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -267,7 +267,7 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-SD-NEXT: and x9, x8, #0xfffffff0
; CHECK-SD-NEXT: add x10, x2, #32
; CHECK-SD-NEXT: add x11, x0, #16
-; CHECK-SD-NEXT: mov x12, x9
+; CHECK-SD-NEXT: and x12, x8, #0xfffffff0
; CHECK-SD-NEXT: .LBB3_4: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16]
@@ -313,7 +313,7 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-GI-NEXT: and x10, x9, #0xfffffff0
; CHECK-GI-NEXT: add x11, x2, #32
; CHECK-GI-NEXT: add x12, x0, #16
-; CHECK-GI-NEXT: mov x13, x10
+; CHECK-GI-NEXT: and x13, x9, #0xfffffff0
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
; CHECK-GI-NEXT: .LBB3_3: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
@@ -428,7 +428,7 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-SD-NEXT: and x9, x8, #0xfffffff0
; CHECK-SD-NEXT: add x10, x2, #32
; CHECK-SD-NEXT: add x11, x0, #16
-; CHECK-SD-NEXT: mov x12, x9
+; CHECK-SD-NEXT: and x12, x8, #0xfffffff0
; CHECK-SD-NEXT: .LBB4_4: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16]
@@ -472,7 +472,7 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-GI-NEXT: and x8, x9, #0xfffffff0
; CHECK-GI-NEXT: add x10, x2, #32
; CHECK-GI-NEXT: add x11, x0, #16
-; CHECK-GI-NEXT: mov x12, x8
+; CHECK-GI-NEXT: and x12, x9, #0xfffffff0
; CHECK-GI-NEXT: .LBB4_3: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: and w13, w1, #0xffff
@@ -596,7 +596,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-SD-NEXT: and x11, x10, #0xfffffff0
; CHECK-SD-NEXT: fmov s2, w9
; CHECK-SD-NEXT: add x8, x0, #8
-; CHECK-SD-NEXT: mov x12, x11
+; CHECK-SD-NEXT: and x12, x10, #0xfffffff0
; CHECK-SD-NEXT: .LBB5_5: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldp d3, d4, [x8, #-8]
@@ -646,10 +646,10 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-NEXT: add x10, x0, #8
+; CHECK-GI-NEXT: and x11, x8, #0xfffffff0
; CHECK-GI-NEXT: sbfx w9, w9, #8, #8
; CHECK-GI-NEXT: dup v2.8h, w9
; CHECK-GI-NEXT: and x9, x8, #0xfffffff0
-; CHECK-GI-NEXT: mov x11, x9
; CHECK-GI-NEXT: .LBB5_5: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldp d3, d4, [x10, #-8]
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
index 4c8e589..c23e4e1 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
@@ -17,7 +17,7 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
; CHECK-NEXT: and x9, x8, #0xfffffff0
; CHECK-NEXT: add x10, x1, #16
; CHECK-NEXT: add x11, x0, #16
-; CHECK-NEXT: mov x12, x9
+; CHECK-NEXT: and x12, x8, #0xfffffff0
; CHECK-NEXT: .LBB0_4: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp q1, q4, [x10, #-16]
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
index f6bbdf5..1770bb9 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
@@ -14,7 +14,6 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: and x12, x10, #0xfffffff0
; CHECK-NEXT: add x13, x1, #32
-; CHECK-NEXT: add x14, x2, #16
; CHECK-NEXT: b .LBB0_3
; CHECK-NEXT: .LBB0_2: // %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
@@ -27,52 +26,52 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
; CHECK-NEXT: // =>This Loop Header: Depth=1
; CHECK-NEXT: // Child Loop BB0_6 Depth 2
; CHECK-NEXT: // Child Loop BB0_9 Depth 2
-; CHECK-NEXT: ldrsh w15, [x2, x9, lsl #1]
+; CHECK-NEXT: ldrsh w14, [x2, x9, lsl #1]
; CHECK-NEXT: cmp w0, #16
; CHECK-NEXT: b.hs .LBB0_5
; CHECK-NEXT: // %bb.4: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: mov x18, xzr
+; CHECK-NEXT: mov x17, xzr
; CHECK-NEXT: b .LBB0_8
; CHECK-NEXT: .LBB0_5: // %vector.ph
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: dup v0.8h, w15
-; CHECK-NEXT: mov x16, x14
-; CHECK-NEXT: mov x17, x13
-; CHECK-NEXT: mov x18, x12
+; CHECK-NEXT: dup v0.8h, w14
+; CHECK-NEXT: add x15, x2, #16
+; CHECK-NEXT: mov x16, x13
+; CHECK-NEXT: and x17, x10, #0xfffffff0
; CHECK-NEXT: .LBB0_6: // %vector.body
; CHECK-NEXT: // Parent Loop BB0_3 Depth=1
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldp q1, q4, [x16, #-16]
-; CHECK-NEXT: subs x18, x18, #16
-; CHECK-NEXT: ldp q3, q2, [x17, #-32]
-; CHECK-NEXT: add x16, x16, #32
-; CHECK-NEXT: ldp q6, q5, [x17]
+; CHECK-NEXT: ldp q1, q4, [x15, #-16]
+; CHECK-NEXT: subs x17, x17, #16
+; CHECK-NEXT: ldp q3, q2, [x16, #-32]
+; CHECK-NEXT: add x15, x15, #32
+; CHECK-NEXT: ldp q6, q5, [x16]
; CHECK-NEXT: smlal2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h
; CHECK-NEXT: smlal2 v5.4s, v0.8h, v4.8h
; CHECK-NEXT: smlal v6.4s, v0.4h, v4.4h
-; CHECK-NEXT: stp q3, q2, [x17, #-32]
-; CHECK-NEXT: stp q6, q5, [x17], #64
+; CHECK-NEXT: stp q3, q2, [x16, #-32]
+; CHECK-NEXT: stp q6, q5, [x16], #64
; CHECK-NEXT: b.ne .LBB0_6
; CHECK-NEXT: // %bb.7: // %middle.block
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: cmp x12, x10
-; CHECK-NEXT: mov x18, x12
+; CHECK-NEXT: and x17, x10, #0xfffffff0
; CHECK-NEXT: b.eq .LBB0_2
; CHECK-NEXT: .LBB0_8: // %for.body4.us.preheader
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: add x16, x18, x8
-; CHECK-NEXT: add x17, x2, x18, lsl #1
-; CHECK-NEXT: sub x18, x10, x18
-; CHECK-NEXT: add x16, x1, x16, lsl #2
+; CHECK-NEXT: add x15, x17, x8
+; CHECK-NEXT: add x16, x2, x17, lsl #1
+; CHECK-NEXT: sub x17, x10, x17
+; CHECK-NEXT: add x15, x1, x15, lsl #2
; CHECK-NEXT: .LBB0_9: // %for.body4.us
; CHECK-NEXT: // Parent Loop BB0_3 Depth=1
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrsh w3, [x17], #2
-; CHECK-NEXT: ldr w4, [x16]
-; CHECK-NEXT: subs x18, x18, #1
-; CHECK-NEXT: madd w3, w3, w15, w4
-; CHECK-NEXT: str w3, [x16], #4
+; CHECK-NEXT: ldrsh w18, [x16], #2
+; CHECK-NEXT: ldr w3, [x15]
+; CHECK-NEXT: subs x17, x17, #1
+; CHECK-NEXT: madd w18, w18, w14, w3
+; CHECK-NEXT: str w18, [x15], #4
; CHECK-NEXT: b.ne .LBB0_9
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
index 3caac1d..74b0e69 100644
--- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -278,9 +278,9 @@ define i64 @test_and_4(i64 %x, i64 %y) {
; CHECK-GI-NEXT: .cfi_offset w19, -8
; CHECK-GI-NEXT: .cfi_offset w20, -16
; CHECK-GI-NEXT: .cfi_offset w30, -32
-; CHECK-GI-NEXT: and x20, x0, #0x3
; CHECK-GI-NEXT: mov x19, x0
-; CHECK-GI-NEXT: mov x0, x20
+; CHECK-GI-NEXT: and x20, x0, #0x3
+; CHECK-GI-NEXT: and x0, x0, #0x3
; CHECK-GI-NEXT: bl callee
; CHECK-GI-NEXT: tst x19, #0x3
; CHECK-GI-NEXT: csel x0, x20, x0, eq
diff --git a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
index e0f2155..58c01db 100644
--- a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
+++ b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
@@ -7,20 +7,16 @@
define void @foo(i64 %v1, i64 %v2, ptr %ptr) {
; CHECK-LABEL: foo:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: add x3, x0, x1
-; CHECK-NEXT: str x3, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: str x3, [x2, #8]
; CHECK-NEXT: ldr x3, [x2, #16]
; CHECK-NEXT: add x3, x0, x3
; CHECK-NEXT: sub x3, x3, x1
; CHECK-NEXT: str x3, [x2, #16]
-; CHECK-NEXT: ldr x3, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: add x3, x0, x1
; CHECK-NEXT: str x3, [x2, #24]
; CHECK-NEXT: str x0, [x2, #32]
; CHECK-NEXT: str x1, [x2, #40]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%v3 = add i64 %v1, %v2
%p1 = getelementptr i64, ptr %ptr, i64 1
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 5fc996a..0f62997 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -23,7 +23,7 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: add x13, x1, #16
; CHECK-NEXT: add x8, x1, x10, lsl #2
; CHECK-NEXT: add x9, x0, x10
-; CHECK-NEXT: mov x14, x10
+; CHECK-NEXT: and x14, x11, #0x1fffffff8
; CHECK-NEXT: .LBB0_4: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp q1, q2, [x13, #-16]
@@ -194,9 +194,9 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: mov w8, #1132396544 // =0x437f0000
; CHECK-NEXT: and x10, x11, #0x1fffffffc
; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
; CHECK-NEXT: add x8, x1, x10, lsl #3
; CHECK-NEXT: add x9, x0, x10, lsl #1
-; CHECK-NEXT: mov x12, x10
; CHECK-NEXT: .LBB1_9: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld2 { v1.4s, v2.4s }, [x1], #32
@@ -341,7 +341,7 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: dup v0.4s, w8
; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI2_0]
; CHECK-NEXT: add x9, x10, x10, lsl #1
-; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
; CHECK-NEXT: add x8, x1, x9, lsl #2
; CHECK-NEXT: add x9, x0, x9
; CHECK-NEXT: .LBB2_4: // %vector.body
@@ -597,7 +597,7 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI3_0]
; CHECK-NEXT: add x8, x1, x10, lsl #4
; CHECK-NEXT: add x9, x0, x10, lsl #2
-; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
; CHECK-NEXT: .LBB3_9: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64
diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll
index 0e68270..3e933fa 100644
--- a/llvm/test/CodeGen/AArch64/trampoline.ll
+++ b/llvm/test/CodeGen/AArch64/trampoline.ll
@@ -263,3 +263,9 @@ define i64 @func2() {
%fp = call ptr @llvm.adjust.trampoline(ptr @trampg)
ret i64 0
}
+
+; Check for the explicitly emitted .note.GNU-stack section (ELF only) in the
+; presence of trampolines.
+; UTC_ARGS: --disable
+; CHECK-LINUX: .section ".note.GNU-stack","x",@progbits
+; UTC_ARGS: --enable
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
index c82b341..5bc9cdb 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -256,7 +256,7 @@ endif: ; preds = %else, %if
define amdgpu_kernel void @copy1(ptr addrspace(1) %out, ptr addrspace(1) %in0) {
entry:
%tmp = load float, ptr addrspace(1) %in0
- %tmp1 = fcmp oeq float %tmp, 0.000000e+00
+ %tmp1 = fcmp one float %tmp, 0.000000e+00
br i1 %tmp1, label %if0, label %endif
if0: ; preds = %entry
diff --git a/llvm/test/CodeGen/ARM/combine-movc-sub.ll b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
index ca5d089..8ca4c43 100644
--- a/llvm/test/CodeGen/ARM/combine-movc-sub.ll
+++ b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
@@ -27,11 +27,11 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: sub.w r7, r2, #32
-; CHECK-NEXT: mov r8, r0
+; CHECK-NEXT: sub.w r8, r2, #32
+; CHECK-NEXT: mov r6, r0
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: add.w r6, r0, r7, lsr #5
+; CHECK-NEXT: add.w r7, r0, r8, lsr #5
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: b .LBB0_2
@@ -44,16 +44,16 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
; CHECK-NEXT: mov r2, r4
; CHECK-NEXT: cmp r4, #31
; CHECK-NEXT: ldr r0, [r1, #16]
-; CHECK-NEXT: add.w r0, r0, r6, lsl #2
+; CHECK-NEXT: add.w r0, r0, r7, lsl #2
; CHECK-NEXT: ldr r0, [r0, #40]
; CHECK-NEXT: it hi
-; CHECK-NEXT: andhi r2, r7, #31
+; CHECK-NEXT: andhi r2, r8, #31
; CHECK-NEXT: lsrs r0, r2
; CHECK-NEXT: lsls r0, r0, #31
; CHECK-NEXT: beq .LBB0_1
; CHECK-NEXT: @ %bb.3: @ %if.then
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT: mov r0, r8
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: bl foo
; CHECK-NEXT: str.w r9, [r5, #4]
; CHECK-NEXT: b .LBB0_1
diff --git a/llvm/test/CodeGen/ARM/extract-bits.ll b/llvm/test/CodeGen/ARM/extract-bits.ll
index 77deaa5..d717806 100644
--- a/llvm/test/CodeGen/ARM/extract-bits.ll
+++ b/llvm/test/CodeGen/ARM/extract-bits.ll
@@ -316,28 +316,28 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
;
; V7A-LABEL: bextr64_a0:
; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r4
; V7A-NEXT: lsr r1, r1, r2
; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: and r0, r4, r0
-; V7A-NEXT: and r1, r12, r1
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: and r0, r5, r0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r4, r5, r11, pc}
;
; V7A-T-LABEL: bextr64_a0:
; V7A-T: @ %bb.0:
@@ -434,28 +434,28 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n
;
; V7A-LABEL: bextr64_a0_arithmetic:
; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
; V7A-NEXT: asr r2, r1, r2
-; V7A-NEXT: asrpl r0, r1, r3
; V7A-NEXT: asrpl r2, r1, #31
-; V7A-NEXT: and r0, r4, r0
-; V7A-NEXT: and r1, r12, r2
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: asrpl r0, r1, r4
+; V7A-NEXT: and r1, r3, r2
+; V7A-NEXT: and r0, r5, r0
+; V7A-NEXT: pop {r4, r5, r11, pc}
;
; V7A-T-LABEL: bextr64_a0_arithmetic:
; V7A-T: @ %bb.0:
@@ -911,28 +911,28 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
;
; V7A-LABEL: bextr64_a4_commutative:
; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r4
; V7A-NEXT: lsr r1, r1, r2
; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: and r0, r0, r4
-; V7A-NEXT: and r1, r1, r12
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: and r0, r0, r5
+; V7A-NEXT: and r1, r1, r3
+; V7A-NEXT: pop {r4, r5, r11, pc}
;
; V7A-T-LABEL: bextr64_a4_commutative:
; V7A-T: @ %bb.0:
@@ -3456,22 +3456,22 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
; V7M-NEXT: uxtb r2, r2
; V7M-NEXT: it pl
; V7M-NEXT: movpl r1, #0
-; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: rsb.w r3, r2, #32
; V7M-NEXT: lsls r1, r2
-; V7M-NEXT: sub.w r3, r2, #32
-; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: sub.w r12, r2, #32
+; V7M-NEXT: lsr.w r4, r0, r3
; V7M-NEXT: orrs r1, r4
-; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: cmp.w r12, #0
; V7M-NEXT: it pl
-; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lslpl.w r1, r0, r12
; V7M-NEXT: lsl.w r0, r0, r2
-; V7M-NEXT: lsl.w r4, r1, r12
+; V7M-NEXT: lsl.w r3, r1, r3
; V7M-NEXT: it pl
; V7M-NEXT: movpl r0, #0
; V7M-NEXT: lsr.w r0, r0, r2
-; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: orr.w r0, r0, r3
; V7M-NEXT: it pl
-; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsrpl.w r0, r1, r12
; V7M-NEXT: lsr.w r1, r1, r2
; V7M-NEXT: it pl
; V7M-NEXT: movpl r1, #0
@@ -3715,26 +3715,26 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
; V7M-NEXT: uxtb r2, r2
; V7M-NEXT: lsl.w r0, lr, r0
; V7M-NEXT: orr.w r0, r0, r12
-; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: sub.w r12, r2, #32
; V7M-NEXT: it pl
; V7M-NEXT: lsrpl.w r0, lr, r3
; V7M-NEXT: it pl
; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsb.w r3, r2, #32
; V7M-NEXT: lsls r1, r2
-; V7M-NEXT: sub.w r3, r2, #32
-; V7M-NEXT: lsr.w r4, r0, r12
-; V7M-NEXT: orrs r1, r4
-; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: cmp.w r12, #0
+; V7M-NEXT: lsr.w r4, r0, r3
+; V7M-NEXT: orr.w r1, r1, r4
; V7M-NEXT: it pl
-; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lslpl.w r1, r0, r12
; V7M-NEXT: lsl.w r0, r0, r2
-; V7M-NEXT: lsl.w r4, r1, r12
; V7M-NEXT: it pl
; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r3, r1, r3
; V7M-NEXT: lsr.w r0, r0, r2
-; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: orr.w r0, r0, r3
; V7M-NEXT: it pl
-; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsrpl.w r0, r1, r12
; V7M-NEXT: lsr.w r1, r1, r2
; V7M-NEXT: it pl
; V7M-NEXT: movpl r1, #0
diff --git a/llvm/test/CodeGen/ARM/extract-lowbits.ll b/llvm/test/CodeGen/ARM/extract-lowbits.ll
index b483793..373d998 100644
--- a/llvm/test/CodeGen/ARM/extract-lowbits.ll
+++ b/llvm/test/CodeGen/ARM/extract-lowbits.ll
@@ -243,15 +243,15 @@ define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind {
; V7A: @ %bb.0:
; V7A-NEXT: .save {r11, lr}
; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
; V7A-NEXT: and r0, r2, r0
; V7A-NEXT: and r1, r3, r1
; V7A-NEXT: pop {r11, pc}
@@ -323,15 +323,15 @@ define i64 @bzhi64_a0_masked(i64 %val, i64 %numlowbits) nounwind {
; V7A-NEXT: .save {r11, lr}
; V7A-NEXT: push {r11, lr}
; V7A-NEXT: and r2, r2, #63
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: rsb r12, r2, #32
; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
; V7A-NEXT: and r0, r2, r0
; V7A-NEXT: and r1, r3, r1
; V7A-NEXT: pop {r11, pc}
@@ -404,15 +404,15 @@ define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
; V7A: @ %bb.0:
; V7A-NEXT: .save {r11, lr}
; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
; V7A-NEXT: and r0, r2, r0
; V7A-NEXT: and r1, r3, r1
; V7A-NEXT: pop {r11, pc}
@@ -644,15 +644,15 @@ define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind {
; V7A: @ %bb.0:
; V7A-NEXT: .save {r11, lr}
; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
; V7A-NEXT: and r0, r0, r2
; V7A-NEXT: and r1, r1, r3
; V7A-NEXT: pop {r11, pc}
@@ -2144,23 +2144,23 @@ define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind {
;
; V7A-LABEL: bzhi64_d2_load:
; V7A: @ %bb.0:
-; V7A-NEXT: .save {r5, r7, r11, lr}
-; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: .save {r5, lr}
+; V7A-NEXT: push {r5, lr}
; V7A-NEXT: rsb r3, r2, #64
-; V7A-NEXT: ldm r0, {r0, r7}
-; V7A-NEXT: rsb r1, r3, #32
+; V7A-NEXT: ldm r0, {r0, r5}
+; V7A-NEXT: rsb r12, r3, #32
; V7A-NEXT: rsbs r2, r2, #32
-; V7A-NEXT: lsr r5, r0, r1
-; V7A-NEXT: orr r7, r5, r7, lsl r3
-; V7A-NEXT: lslpl r7, r0, r2
+; V7A-NEXT: lsr r1, r0, r12
+; V7A-NEXT: orr r1, r1, r5, lsl r3
+; V7A-NEXT: lslpl r1, r0, r2
; V7A-NEXT: lsl r0, r0, r3
; V7A-NEXT: movwpl r0, #0
; V7A-NEXT: lsr r0, r0, r3
-; V7A-NEXT: orr r0, r0, r7, lsl r1
-; V7A-NEXT: lsr r1, r7, r3
-; V7A-NEXT: lsrpl r0, r7, r2
+; V7A-NEXT: orr r0, r0, r1, lsl r12
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: lsr r1, r1, r3
; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: pop {r5, r7, r11, pc}
+; V7A-NEXT: pop {r5, pc}
;
; V7A-T-LABEL: bzhi64_d2_load:
; V7A-T: @ %bb.0:
@@ -2237,26 +2237,26 @@ define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
;
; V7A-LABEL: bzhi64_d3_load_indexzext:
; V7A: @ %bb.0:
-; V7A-NEXT: .save {r5, r7, r11, lr}
-; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: .save {r5, lr}
+; V7A-NEXT: push {r5, lr}
; V7A-NEXT: rsb r1, r1, #64
-; V7A-NEXT: ldm r0, {r0, r7}
+; V7A-NEXT: ldm r0, {r0, r5}
; V7A-NEXT: uxtb r2, r1
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: lsr r5, r0, r3
-; V7A-NEXT: orr r7, r5, r7, lsl r2
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: lsr r3, r0, r12
+; V7A-NEXT: orr r3, r3, r5, lsl r2
; V7A-NEXT: mvn r5, #31
; V7A-NEXT: uxtab r1, r5, r1
; V7A-NEXT: cmp r1, #0
-; V7A-NEXT: lslpl r7, r0, r1
+; V7A-NEXT: lslpl r3, r0, r1
; V7A-NEXT: lsl r0, r0, r2
; V7A-NEXT: movwpl r0, #0
; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: orr r0, r0, r7, lsl r3
-; V7A-NEXT: lsrpl r0, r7, r1
-; V7A-NEXT: lsr r1, r7, r2
+; V7A-NEXT: orr r0, r0, r3, lsl r12
+; V7A-NEXT: lsrpl r0, r3, r1
+; V7A-NEXT: lsr r1, r3, r2
; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: pop {r5, r7, r11, pc}
+; V7A-NEXT: pop {r5, pc}
;
; V7A-T-LABEL: bzhi64_d3_load_indexzext:
; V7A-T: @ %bb.0:
diff --git a/llvm/test/CodeGen/ARM/llround-conv.ll b/llvm/test/CodeGen/ARM/llround-conv.ll
index 0f57e4a..f734db8 100644
--- a/llvm/test/CodeGen/ARM/llround-conv.ll
+++ b/llvm/test/CodeGen/ARM/llround-conv.ll
@@ -1,25 +1,71 @@
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
+; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
+; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
+
+define i64 @testmsxh_builtin(half %x) {
+; CHECK-SOFT-LABEL: testmsxh_builtin:
+; CHECK-SOFT: @ %bb.0: @ %entry
+; CHECK-SOFT-NEXT: .save {r11, lr}
+; CHECK-SOFT-NEXT: push {r11, lr}
+; CHECK-SOFT-NEXT: bl __aeabi_h2f
+; CHECK-SOFT-NEXT: bl llroundf
+; CHECK-SOFT-NEXT: pop {r11, pc}
+;
+; CHECK-NOFP16-LABEL: testmsxh_builtin:
+; CHECK-NOFP16: @ %bb.0: @ %entry
+; CHECK-NOFP16-NEXT: .save {r11, lr}
+; CHECK-NOFP16-NEXT: push {r11, lr}
+; CHECK-NOFP16-NEXT: vmov r0, s0
+; CHECK-NOFP16-NEXT: bl __aeabi_h2f
+; CHECK-NOFP16-NEXT: vmov s0, r0
+; CHECK-NOFP16-NEXT: bl llroundf
+; CHECK-NOFP16-NEXT: pop {r11, pc}
+;
+; CHECK-FP16-LABEL: testmsxh_builtin:
+; CHECK-FP16: @ %bb.0: @ %entry
+; CHECK-FP16-NEXT: .save {r11, lr}
+; CHECK-FP16-NEXT: push {r11, lr}
+; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT: bl llroundf
+; CHECK-FP16-NEXT: pop {r11, pc}
+entry:
+ %0 = tail call i64 @llvm.llround.i64.f16(half %x)
+ ret i64 %0
+}
-; SOFTFP-LABEL: testmsxs_builtin:
-; SOFTFP: bl llroundf
-; HARDFP-LABEL: testmsxs_builtin:
-; HARDFP: bl llroundf
define i64 @testmsxs_builtin(float %x) {
+; CHECK-LABEL: testmsxs_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl llroundf
+; CHECK-NEXT: pop {r11, pc}
entry:
- %0 = tail call i64 @llvm.llround.f32(float %x)
+ %0 = tail call i64 @llvm.llround.i64.f32(float %x)
ret i64 %0
}
-; SOFTFP-LABEL: testmsxd_builtin:
-; SOFTFP: bl llround
-; HARDFP-LABEL: testmsxd_builtin:
-; HARDFP: bl llround
define i64 @testmsxd_builtin(double %x) {
+; CHECK-LABEL: testmsxd_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl llround
+; CHECK-NEXT: pop {r11, pc}
entry:
- %0 = tail call i64 @llvm.llround.f64(double %x)
+ %0 = tail call i64 @llvm.llround.i64.f64(double %x)
ret i64 %0
}
-declare i64 @llvm.llround.f32(float) nounwind readnone
-declare i64 @llvm.llround.f64(double) nounwind readnone
+define i64 @testmsxq_builtin(fp128 %x) {
+; CHECK-LABEL: testmsxq_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl llroundl
+; CHECK-NEXT: pop {r11, pc}
+entry:
+ %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x)
+ ret i64 %0
+}
diff --git a/llvm/test/CodeGen/ARM/lround-conv.ll b/llvm/test/CodeGen/ARM/lround-conv.ll
index 3aaed74..03f7a0d 100644
--- a/llvm/test/CodeGen/ARM/lround-conv.ll
+++ b/llvm/test/CodeGen/ARM/lround-conv.ll
@@ -1,25 +1,47 @@
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
+; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
+; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FPv8
+; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
+
+;define i32 @testmswh_builtin(half %x) {
+;entry:
+; %0 = tail call i32 @llvm.lround.i32.f16(half %x)
+; ret i32 %0
+;}
-; SOFTFP-LABEL: testmsws_builtin:
-; SOFTFP: bl lroundf
-; HARDFP-LABEL: testmsws_builtin:
-; HARDFP: bl lroundf
define i32 @testmsws_builtin(float %x) {
+; CHECK-LABEL: testmsws_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: b lroundf
entry:
%0 = tail call i32 @llvm.lround.i32.f32(float %x)
ret i32 %0
}
-; SOFTFP-LABEL: testmswd_builtin:
-; SOFTFP: bl lround
-; HARDFP-LABEL: testmswd_builtin:
-; HARDFP: bl lround
define i32 @testmswd_builtin(double %x) {
+; CHECK-LABEL: testmswd_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: b lround
entry:
%0 = tail call i32 @llvm.lround.i32.f64(double %x)
ret i32 %0
}
-declare i32 @llvm.lround.i32.f32(float) nounwind readnone
-declare i32 @llvm.lround.i32.f64(double) nounwind readnone
+define i32 @testmswq_builtin(fp128 %x) {
+; CHECK-LABEL: testmswq_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl lroundl
+; CHECK-NEXT: pop {r11, pc}
+entry:
+ %0 = tail call i32 @llvm.lround.i32.f128(fp128 %x)
+ ret i32 %0
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-FP16: {{.*}}
+; CHECK-FPv8: {{.*}}
+; CHECK-NOFP16: {{.*}}
+; CHECK-SOFT: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index e761d3a..33b89a4 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -39,119 +39,118 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: slli a2, a2, 1
; NOREMAT-NEXT: sub sp, sp, a2
; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb
-; NOREMAT-NEXT: mv a7, a0
-; NOREMAT-NEXT: li a0, 32
-; NOREMAT-NEXT: addi a5, a7, 512
-; NOREMAT-NEXT: addi a4, a7, 1024
-; NOREMAT-NEXT: addi a6, a7, 1536
-; NOREMAT-NEXT: li t4, 1
-; NOREMAT-NEXT: li a2, 5
+; NOREMAT-NEXT: li a7, 32
+; NOREMAT-NEXT: addi s10, a0, 512
+; NOREMAT-NEXT: addi a4, a0, 1024
+; NOREMAT-NEXT: addi a6, a0, 1536
+; NOREMAT-NEXT: li t0, 1
+; NOREMAT-NEXT: li a3, 5
; NOREMAT-NEXT: li t1, 3
-; NOREMAT-NEXT: li t0, 7
-; NOREMAT-NEXT: lui t5, 1
+; NOREMAT-NEXT: li a2, 7
+; NOREMAT-NEXT: lui t2, 1
; NOREMAT-NEXT: li s4, 9
; NOREMAT-NEXT: li s6, 11
; NOREMAT-NEXT: li s9, 13
; NOREMAT-NEXT: li ra, 15
-; NOREMAT-NEXT: lui t2, 2
+; NOREMAT-NEXT: lui a5, 2
; NOREMAT-NEXT: lui s1, 3
; NOREMAT-NEXT: lui t3, 4
; NOREMAT-NEXT: lui s0, 5
; NOREMAT-NEXT: lui s3, 6
; NOREMAT-NEXT: lui s7, 7
-; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOREMAT-NEXT: slli t4, t4, 11
-; NOREMAT-NEXT: sd t4, 512(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: slli a3, a2, 9
-; NOREMAT-NEXT: sd a3, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: vsetvli zero, a7, e32, m2, ta, ma
+; NOREMAT-NEXT: slli t0, t0, 11
+; NOREMAT-NEXT: sd t0, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: slli t4, a3, 9
+; NOREMAT-NEXT: sd t4, 504(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: slli t6, t1, 10
-; NOREMAT-NEXT: slli s2, t0, 9
-; NOREMAT-NEXT: add a0, a7, t5
+; NOREMAT-NEXT: slli s2, a2, 9
+; NOREMAT-NEXT: add a7, a0, t2
; NOREMAT-NEXT: lui s11, 1
; NOREMAT-NEXT: slli s4, s4, 9
-; NOREMAT-NEXT: slli s5, a2, 10
+; NOREMAT-NEXT: slli s5, a3, 10
; NOREMAT-NEXT: slli s6, s6, 9
; NOREMAT-NEXT: slli s8, t1, 11
-; NOREMAT-NEXT: vle32.v v8, (a5)
+; NOREMAT-NEXT: vle32.v v8, (s10)
; NOREMAT-NEXT: slli s9, s9, 9
; NOREMAT-NEXT: li t5, 13
; NOREMAT-NEXT: vle32.v v10, (a4)
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: slli s10, t0, 10
+; NOREMAT-NEXT: slli s10, a2, 10
; NOREMAT-NEXT: vle32.v v0, (a6)
; NOREMAT-NEXT: vle32.v v12, (a6)
; NOREMAT-NEXT: slli ra, ra, 9
-; NOREMAT-NEXT: vle32.v v4, (a0)
-; NOREMAT-NEXT: vle32.v v20, (a0)
-; NOREMAT-NEXT: add a4, a7, t2
+; NOREMAT-NEXT: vle32.v v4, (a7)
+; NOREMAT-NEXT: vle32.v v20, (a7)
+; NOREMAT-NEXT: add a4, a0, a5
; NOREMAT-NEXT: vle32.v v6, (a4)
; NOREMAT-NEXT: vle32.v v30, (a4)
-; NOREMAT-NEXT: add a4, a7, s1
+; NOREMAT-NEXT: add a4, a0, s1
; NOREMAT-NEXT: vle32.v v28, (a4)
; NOREMAT-NEXT: vle32.v v26, (a4)
-; NOREMAT-NEXT: add a4, a7, t3
+; NOREMAT-NEXT: add a4, a0, t3
; NOREMAT-NEXT: vle32.v v24, (a4)
; NOREMAT-NEXT: vle32.v v22, (a4)
-; NOREMAT-NEXT: add a4, a7, s0
-; NOREMAT-NEXT: vle32.v v14, (a7)
+; NOREMAT-NEXT: add a4, a0, s0
+; NOREMAT-NEXT: vle32.v v14, (a0)
; NOREMAT-NEXT: vle32.v v18, (a4)
; NOREMAT-NEXT: vle32.v v16, (a4)
-; NOREMAT-NEXT: add a4, a7, s3
+; NOREMAT-NEXT: add a4, a0, s3
; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8
; NOREMAT-NEXT: vle32.v v14, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: addi a0, sp, 640
-; NOREMAT-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, t4
+; NOREMAT-NEXT: addi a4, sp, 640
+; NOREMAT-NEXT: vs2r.v v8, (a4) # vscale x 16-byte Folded Spill
+; NOREMAT-NEXT: add a4, a0, t0
; NOREMAT-NEXT: vle32.v v10, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: add a4, a7, a3
+; NOREMAT-NEXT: add a4, a0, t4
; NOREMAT-NEXT: vle32.v v0, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10
; NOREMAT-NEXT: vle32.v v10, (a4)
-; NOREMAT-NEXT: add a4, a7, t6
+; NOREMAT-NEXT: add a4, a0, t6
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: add a4, a7, s2
+; NOREMAT-NEXT: add a4, a0, s2
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s7
+; NOREMAT-NEXT: add a4, a0, s7
; NOREMAT-NEXT: vle32.v v0, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8
; NOREMAT-NEXT: vle32.v v10, (a4)
-; NOREMAT-NEXT: add a4, a7, s4
+; NOREMAT-NEXT: add a4, a0, s4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s5
+; NOREMAT-NEXT: add a4, a0, s5
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a7, s6
+; NOREMAT-NEXT: add a4, a0, s6
; NOREMAT-NEXT: vle32.v v20, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s8
+; NOREMAT-NEXT: add a4, a0, s8
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a7, s9
+; NOREMAT-NEXT: add a4, a0, s9
; NOREMAT-NEXT: vle32.v v20, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s10
+; NOREMAT-NEXT: add a4, a0, s10
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a7, ra
+; NOREMAT-NEXT: add a4, a0, ra
; NOREMAT-NEXT: vle32.v v2, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: lui t4, 8
-; NOREMAT-NEXT: add a5, a7, t4
+; NOREMAT-NEXT: add a5, a0, t4
; NOREMAT-NEXT: vle32.v v20, (a5)
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2
@@ -159,14 +158,14 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: slli a4, a4, 9
; NOREMAT-NEXT: li s1, 17
; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: add a4, a0, a4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6
; NOREMAT-NEXT: li a5, 9
; NOREMAT-NEXT: slli a4, a5, 10
; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: add a4, a0, a4
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: vle32.v v6, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
@@ -174,256 +173,257 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: slli a4, a4, 9
; NOREMAT-NEXT: li t2, 19
; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: add a4, a0, a4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: vle32.v v30, (a4)
-; NOREMAT-NEXT: slli a3, a2, 11
+; NOREMAT-NEXT: slli a3, a3, 11
; NOREMAT-NEXT: sd a3, 600(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: li s7, 21
; NOREMAT-NEXT: slli a3, s7, 9
; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v6, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12
; NOREMAT-NEXT: li a6, 11
; NOREMAT-NEXT: slli a3, a6, 10
; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v30, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8
; NOREMAT-NEXT: li s3, 23
; NOREMAT-NEXT: slli a3, s3, 9
; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12
; NOREMAT-NEXT: li s0, 25
; NOREMAT-NEXT: slli a3, s0, 9
; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v6, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
; NOREMAT-NEXT: slli a3, t5, 10
; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v30, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28
; NOREMAT-NEXT: li t3, 27
; NOREMAT-NEXT: slli a3, t3, 9
; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v28, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
-; NOREMAT-NEXT: slli a2, t0, 11
+; NOREMAT-NEXT: slli a2, a2, 11
; NOREMAT-NEXT: sd a2, 544(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: li t0, 29
; NOREMAT-NEXT: slli a2, t0, 9
; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28
-; NOREMAT-NEXT: li a3, 15
-; NOREMAT-NEXT: slli a2, a3, 10
+; NOREMAT-NEXT: li a7, 15
+; NOREMAT-NEXT: slli a2, a7, 10
; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12
; NOREMAT-NEXT: li t1, 31
; NOREMAT-NEXT: slli a2, t1, 9
; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8
-; NOREMAT-NEXT: lui a4, 4
-; NOREMAT-NEXT: addi a0, a4, 512
-; NOREMAT-NEXT: sd a0, 496(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a0, a7, a0
-; NOREMAT-NEXT: vle32.v v8, (a0)
-; NOREMAT-NEXT: vle32.v v26, (a0)
+; NOREMAT-NEXT: lui a3, 4
+; NOREMAT-NEXT: addi a2, a3, 512
+; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: vle32.v v8, (a2)
+; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28
; NOREMAT-NEXT: slli a2, s1, 10
; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT: addi a2, a4, 1536
+; NOREMAT-NEXT: addi a2, a3, 1536
; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: lui a4, 4
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: slli a2, a5, 11
; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8
; NOREMAT-NEXT: lui a5, 5
; NOREMAT-NEXT: addi a2, a5, -1536
; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28
; NOREMAT-NEXT: slli a2, t2, 10
; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: li t2, 19
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: li a3, 19
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12
; NOREMAT-NEXT: addi a2, a5, -512
; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24
; NOREMAT-NEXT: addi a2, a5, 512
; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: slli a2, s7, 10
; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26
; NOREMAT-NEXT: addi a2, a5, 1536
; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: slli a2, a6, 11
; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18
; NOREMAT-NEXT: lui a6, 6
; NOREMAT-NEXT: addi a2, a6, -1536
; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: slli a2, s3, 10
; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
; NOREMAT-NEXT: addi a2, a6, -512
; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22
; NOREMAT-NEXT: addi a2, a6, 512
; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: slli a2, s0, 10
; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v2, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18
; NOREMAT-NEXT: addi a2, a6, 1536
; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: slli a2, t5, 11
; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8
; NOREMAT-NEXT: lui s0, 7
; NOREMAT-NEXT: addi a2, s0, -1536
; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: slli a2, t3, 10
; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v14, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
-; NOREMAT-NEXT: addi a0, sp, 640
-; NOREMAT-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; NOREMAT-NEXT: addi a2, sp, 640
+; NOREMAT-NEXT: vl2r.v v12, (a2) # vscale x 16-byte Folded Reload
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22
; NOREMAT-NEXT: addi a2, s0, -512
; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26
; NOREMAT-NEXT: addi a2, s0, 512
; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui t3, 7
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: slli a2, t0, 10
; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v2, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16
; NOREMAT-NEXT: addi a2, t3, 1536
; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
-; NOREMAT-NEXT: slli a2, a3, 11
+; NOREMAT-NEXT: slli a2, a7, 11
; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14
; NOREMAT-NEXT: addi a2, t4, -1536
; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v14, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: slli a2, t1, 10
; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
-; NOREMAT-NEXT: addi a0, t4, -512
-; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a0, a7, a0
+; NOREMAT-NEXT: addi a2, t4, -512
+; NOREMAT-NEXT: sd a2, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a0, a0, a2
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0
; NOREMAT-NEXT: vle32.v v12, (a0)
; NOREMAT-NEXT: vle32.v v0, (a0)
@@ -476,7 +476,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: addi s11, a0, 512
; NOREMAT-NEXT: addi s7, a0, 1024
; NOREMAT-NEXT: addi s3, a0, 1536
-; NOREMAT-NEXT: slli s1, t2, 11
+; NOREMAT-NEXT: slli s1, a3, 11
; NOREMAT-NEXT: lui a0, 10
; NOREMAT-NEXT: addi t2, a0, -1536
; NOREMAT-NEXT: addi a7, a0, -1024
diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
index 34d4657..c68fa59 100644
--- a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
@@ -78,3 +78,10 @@ define i64 @test0(i64 %n, ptr %p) nounwind {
ret i64 %ret
}
+
+; Check for the explicitly emitted .note.GNU-stack section (ELF only) in the
+; presence of trampolines.
+; UTC_ARGS: --disable
+; RV64-LINUX: .section ".note.GNU-stack","x",@progbits
+; RV64: .section ".note.GNU-stack","x",@progbits
+; UTC_ARGS: --enable
diff --git a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll
index 9798077..e3ed31f 100644
--- a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll
+++ b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll
@@ -163,9 +163,9 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 {
; LINUX-NEXT: ld %f10, 8(%r3)
; LINUX-NEXT: ld %f0, 16(%r3)
; LINUX-NEXT: ld %f2, 24(%r3)
-; LINUX-NEXT: la %r3, 16(%r2)
-; LINUX-NEXT: la %r4, 48(%r2)
; LINUX-NEXT: la %r2, 176(%r15)
+; LINUX-NEXT: la %r3, 16(%r13)
+; LINUX-NEXT: la %r4, 48(%r13)
; LINUX-NEXT: std %f0, 176(%r15)
; LINUX-NEXT: std %f2, 184(%r15)
; LINUX-NEXT: brasl %r14, sincosl@PLT
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
index 6f986ce..c418038 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -541,11 +541,11 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(ptr nocaptur
; CHECK-NEXT: cbz r2, .LBB7_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: adds r3, r2, #7
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: bic r3, r3, #7
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: subs r3, #8
-; CHECK-NEXT: vmov q3, q1
+; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: add.w r12, r4, r3, lsr #3
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov r4, r1
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
index 4020709..fe06601 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
@@ -16,39 +16,40 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture
; CHECK-NEXT: sub sp, #64
; CHECK-NEXT: ldrsh.w r7, [r2]
; CHECK-NEXT: cmp r7, #1
-; CHECK-NEXT: blt.w .LBB0_6
+; CHECK-NEXT: blt .LBB0_6
; CHECK-NEXT: @ %bb.2: @ %for.cond3.preheader.us.preheader
-; CHECK-NEXT: movs r2, #252
; CHECK-NEXT: ldr r4, [sp, #152]
+; CHECK-NEXT: movs r2, #252
; CHECK-NEXT: and.w r6, r2, r3, lsr #3
; CHECK-NEXT: movs r2, #120
; CHECK-NEXT: and.w r5, r2, r3, lsr #9
; CHECK-NEXT: lsls r3, r3, #3
-; CHECK-NEXT: uxtb r3, r3
; CHECK-NEXT: muls r6, r4, r6
+; CHECK-NEXT: uxtb r3, r3
; CHECK-NEXT: rsb.w r2, r4, #256
-; CHECK-NEXT: vmov.i16 q2, #0xfc
+; CHECK-NEXT: vmov.i16 q1, #0xfc
+; CHECK-NEXT: vdup.16 q0, r6
; CHECK-NEXT: mul lr, r5, r4
-; CHECK-NEXT: vdup.16 q4, r6
; CHECK-NEXT: mov.w r6, #2016
-; CHECK-NEXT: vmov.i16 q6, #0xf8
+; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: mul r5, r3, r4
; CHECK-NEXT: adds r3, r7, #7
+; CHECK-NEXT: vdup.16 q0, r6
; CHECK-NEXT: bic r3, r3, #7
-; CHECK-NEXT: vdup.16 q3, lr
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vdup.16 q0, r5
+; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vdup.16 q0, lr
; CHECK-NEXT: subs r3, #8
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: vdup.16 q0, r5
-; CHECK-NEXT: lsls r1, r1, #1
+; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: add.w r3, r4, r3, lsr #3
-; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q0, #0xf800
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT: lsls r1, r1, #1
; CHECK-NEXT: movs r4, #0
-; CHECK-NEXT: vdup.16 q5, r6
-; CHECK-NEXT: vmov.i16 q7, #0x78
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vmov.i16 q4, #0xf8
; CHECK-NEXT: .LBB0_3: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_4 Depth 2
@@ -59,37 +60,31 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture
; CHECK-NEXT: @ Parent Loop BB0_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.16 r6
-; CHECK-NEXT: subs r6, #8
+; CHECK-NEXT: vmov.i16 q5, #0xf800
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrht.u16 q0, [r5]
-; CHECK-NEXT: vshr.u16 q1, q0, #3
-; CHECK-NEXT: vand q1, q1, q2
-; CHECK-NEXT: vmov q2, q4
-; CHECK-NEXT: vmla.i16 q2, q1, r2
-; CHECK-NEXT: vshr.u16 q1, q2, #5
-; CHECK-NEXT: vshl.i16 q2, q0, #3
-; CHECK-NEXT: vand q3, q1, q5
-; CHECK-NEXT: vmov q1, q7
-; CHECK-NEXT: vand q2, q2, q6
-; CHECK-NEXT: vmov q7, q6
-; CHECK-NEXT: vmov q6, q5
-; CHECK-NEXT: vmov q5, q4
-; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: subs r6, #8
+; CHECK-NEXT: vshr.u16 q3, q0, #3
+; CHECK-NEXT: vand q3, q3, q1
+; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vmla.i16 q1, q3, r2
+; CHECK-NEXT: vshl.i16 q3, q0, #3
+; CHECK-NEXT: vand q3, q3, q4
+; CHECK-NEXT: vmov q4, q6
+; CHECK-NEXT: vshr.u16 q1, q1, #5
+; CHECK-NEXT: vmla.i16 q4, q3, r2
+; CHECK-NEXT: vshr.u16 q3, q4, #11
+; CHECK-NEXT: vand q1, q1, q7
+; CHECK-NEXT: vorr q1, q1, q3
; CHECK-NEXT: vshr.u16 q0, q0, #9
-; CHECK-NEXT: vmla.i16 q4, q2, r2
-; CHECK-NEXT: vshr.u16 q2, q4, #11
-; CHECK-NEXT: vmov q4, q5
-; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vmov q6, q7
-; CHECK-NEXT: vmov q7, q1
-; CHECK-NEXT: vorr q1, q3, q2
-; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vand q0, q0, q7
-; CHECK-NEXT: vmla.i16 q2, q0, r2
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: vand q0, q2, q0
-; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.i16 q3, #0x78
+; CHECK-NEXT: vmov.i16 q4, #0xf8
+; CHECK-NEXT: vand q0, q0, q3
+; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vmla.i16 q3, q0, r2
+; CHECK-NEXT: vand q0, q3, q5
; CHECK-NEXT: vorr q0, q1, q0
+; CHECK-NEXT: vmov.i16 q1, #0xfc
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r5], #16
; CHECK-NEXT: le lr, .LBB0_4
@@ -190,7 +185,7 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrsh.w r12, [r2, #2]
; CHECK-NEXT: cmp.w r12, #1
-; CHECK-NEXT: blt.w .LBB1_7
+; CHECK-NEXT: blt .LBB1_7
; CHECK-NEXT: @ %bb.1: @ %for.cond3.preheader.lr.ph
; CHECK-NEXT: ldrsh.w r2, [r2]
; CHECK-NEXT: cmp r2, #1
@@ -200,71 +195,70 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: sub sp, #80
-; CHECK-NEXT: ldr r7, [sp, #168]
+; CHECK-NEXT: ldr r7, [sp, #88]
; CHECK-NEXT: movs r5, #120
; CHECK-NEXT: lsls r6, r3, #3
; CHECK-NEXT: movs r4, #252
; CHECK-NEXT: and.w r5, r5, r3, lsr #9
; CHECK-NEXT: uxtb r6, r6
; CHECK-NEXT: and.w r3, r4, r3, lsr #3
+; CHECK-NEXT: adds r4, r2, #7
; CHECK-NEXT: muls r6, r7, r6
+; CHECK-NEXT: bic r4, r4, #7
; CHECK-NEXT: mul lr, r3, r7
-; CHECK-NEXT: vdup.16 q0, r6
-; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vdup.16 q0, lr
; CHECK-NEXT: muls r5, r7, r5
-; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q0, #0xfc
-; CHECK-NEXT: mov.w r6, #2016
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vdup.16 q0, r5
; CHECK-NEXT: rsb.w r3, r7, #256
; CHECK-NEXT: lsls r7, r1, #1
-; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vdup.16 q0, r6
+; CHECK-NEXT: sub.w r1, r4, #8
+; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: vmov.i16 q2, #0xf8
-; CHECK-NEXT: vmov.i16 q5, #0x78
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q6, #0xf800
+; CHECK-NEXT: add.w r1, r4, r1, lsr #3
+; CHECK-NEXT: vdup.16 q6, r6
+; CHECK-NEXT: mov.w r6, #2016
; CHECK-NEXT: movs r4, #0
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vdup.16 q3, lr
+; CHECK-NEXT: vdup.16 q5, r5
+; CHECK-NEXT: vdup.16 q7, r6
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB1_3: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_4 Depth 2
; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: dlstp.16 lr, r2
+; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB1_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vldrh.u16 q0, [r5]
+; CHECK-NEXT: vctp.16 r6
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrht.u16 q0, [r5]
; CHECK-NEXT: vshl.i16 q1, q0, #3
-; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: subs r6, #8
; CHECK-NEXT: vand q1, q1, q2
-; CHECK-NEXT: vmla.i16 q3, q1, r3
-; CHECK-NEXT: vmov.f64 d8, d4
-; CHECK-NEXT: vmov.f64 d9, d5
-; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vshr.u16 q2, q0, #9
+; CHECK-NEXT: vmov.i16 q2, #0x78
+; CHECK-NEXT: vshr.u16 q4, q0, #9
+; CHECK-NEXT: vand q4, q4, q2
+; CHECK-NEXT: vmov q2, q6
+; CHECK-NEXT: vmla.i16 q2, q1, r3
; CHECK-NEXT: vshr.u16 q0, q0, #3
+; CHECK-NEXT: vmov.i16 q1, #0xfc
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: vmla.i16 q1, q0, r3
-; CHECK-NEXT: vand q2, q2, q5
-; CHECK-NEXT: vshr.u16 q0, q3, #11
-; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vshr.u16 q0, q2, #11
+; CHECK-NEXT: vmov q2, q5
+; CHECK-NEXT: vmla.i16 q2, q4, r3
; CHECK-NEXT: vshr.u16 q1, q1, #5
-; CHECK-NEXT: vmla.i16 q3, q2, r3
+; CHECK-NEXT: vmov.i16 q4, #0xf800
; CHECK-NEXT: vand q1, q1, q7
; CHECK-NEXT: vorr q0, q1, q0
-; CHECK-NEXT: vand q1, q3, q6
+; CHECK-NEXT: vand q1, q2, q4
+; CHECK-NEXT: vmov.i16 q2, #0xf8
; CHECK-NEXT: vorr q0, q0, q1
-; CHECK-NEXT: vstrh.16 q0, [r5], #16
-; CHECK-NEXT: vmov.f64 d4, d8
-; CHECK-NEXT: vmov.f64 d5, d9
-; CHECK-NEXT: letp lr, .LBB1_4
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q0, [r5], #16
+; CHECK-NEXT: le lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1
; CHECK-NEXT: adds r4, #1
@@ -272,7 +266,6 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc
; CHECK-NEXT: cmp r4, r12
; CHECK-NEXT: bne .LBB1_3
; CHECK-NEXT: @ %bb.6:
-; CHECK-NEXT: add sp, #80
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, lr}
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 07c06e1..1769c5d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -17,17 +17,16 @@
define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr {
; ENABLED-LABEL: varying_outer_2d_reduction:
; ENABLED: @ %bb.0: @ %entry
-; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; ENABLED-NEXT: sub sp, #4
; ENABLED-NEXT: cmp r3, #1
-; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill
-; ENABLED-NEXT: blt .LBB0_8
-; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph
-; ENABLED-NEXT: ldr r0, [sp, #36]
-; ENABLED-NEXT: add.w r12, r2, #3
-; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload
-; ENABLED-NEXT: mov.w r8, #0
-; ENABLED-NEXT: mov r9, r12
+; ENABLED-NEXT: it lt
+; ENABLED-NEXT: bxlt lr
+; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph
+; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT: mov r11, r0
+; ENABLED-NEXT: ldr r0, [sp, #32]
+; ENABLED-NEXT: add.w r9, r2, #3
+; ENABLED-NEXT: mov.w r12, #0
+; ENABLED-NEXT: mov r10, r11
; ENABLED-NEXT: uxth r0, r0
; ENABLED-NEXT: rsbs r5, r0, #0
; ENABLED-NEXT: b .LBB0_4
@@ -37,31 +36,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: lsrs r0, r0, #16
; ENABLED-NEXT: sub.w r9, r9, #1
-; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1]
-; ENABLED-NEXT: add.w r8, r8, #1
+; ENABLED-NEXT: strh.w r0, [r1, r12, lsl #1]
+; ENABLED-NEXT: add.w r12, r12, #1
; ENABLED-NEXT: add.w r10, r10, #2
-; ENABLED-NEXT: cmp r8, r3
+; ENABLED-NEXT: cmp r12, r3
; ENABLED-NEXT: beq .LBB0_8
; ENABLED-NEXT: .LBB0_4: @ %for.body
; ENABLED-NEXT: @ =>This Loop Header: Depth=1
; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2
-; ENABLED-NEXT: cmp r2, r8
+; ENABLED-NEXT: cmp r2, r12
; ENABLED-NEXT: ble .LBB0_2
; ENABLED-NEXT: @ %bb.5: @ %vector.ph
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: bic r0, r9, #3
; ENABLED-NEXT: movs r7, #1
; ENABLED-NEXT: subs r0, #4
-; ENABLED-NEXT: sub.w r4, r2, r8
+; ENABLED-NEXT: sub.w r4, r2, r12
; ENABLED-NEXT: vmov.i32 q1, #0x0
; ENABLED-NEXT: add.w r6, r7, r0, lsr #2
-; ENABLED-NEXT: sub.w r0, r12, r8
+; ENABLED-NEXT: adds r0, r2, #3
+; ENABLED-NEXT: sub.w r0, r0, r12
; ENABLED-NEXT: bic r0, r0, #3
; ENABLED-NEXT: subs r0, #4
; ENABLED-NEXT: add.w r0, r7, r0, lsr #2
; ENABLED-NEXT: mov r7, r10
; ENABLED-NEXT: dls lr, r0
-; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload
+; ENABLED-NEXT: mov r0, r11
; ENABLED-NEXT: .LBB0_6: @ %vector.body
; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1
; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2
@@ -82,23 +82,22 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: vpsel q0, q1, q0
; ENABLED-NEXT: vaddv.u32 r0, q0
; ENABLED-NEXT: b .LBB0_3
-; ENABLED-NEXT: .LBB0_8: @ %for.end17
-; ENABLED-NEXT: add sp, #4
-; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; ENABLED-NEXT: .LBB0_8:
+; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT: bx lr
;
; NOREDUCTIONS-LABEL: varying_outer_2d_reduction:
; NOREDUCTIONS: @ %bb.0: @ %entry
-; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; NOREDUCTIONS-NEXT: sub sp, #4
; NOREDUCTIONS-NEXT: cmp r3, #1
-; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill
-; NOREDUCTIONS-NEXT: blt .LBB0_8
-; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph
-; NOREDUCTIONS-NEXT: ldr r0, [sp, #36]
-; NOREDUCTIONS-NEXT: add.w r12, r2, #3
-; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload
-; NOREDUCTIONS-NEXT: mov.w r8, #0
-; NOREDUCTIONS-NEXT: mov r9, r12
+; NOREDUCTIONS-NEXT: it lt
+; NOREDUCTIONS-NEXT: bxlt lr
+; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph
+; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: mov r11, r0
+; NOREDUCTIONS-NEXT: ldr r0, [sp, #32]
+; NOREDUCTIONS-NEXT: add.w r9, r2, #3
+; NOREDUCTIONS-NEXT: mov.w r12, #0
+; NOREDUCTIONS-NEXT: mov r10, r11
; NOREDUCTIONS-NEXT: uxth r0, r0
; NOREDUCTIONS-NEXT: rsbs r5, r0, #0
; NOREDUCTIONS-NEXT: b .LBB0_4
@@ -108,31 +107,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: lsrs r0, r0, #16
; NOREDUCTIONS-NEXT: sub.w r9, r9, #1
-; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1]
-; NOREDUCTIONS-NEXT: add.w r8, r8, #1
+; NOREDUCTIONS-NEXT: strh.w r0, [r1, r12, lsl #1]
+; NOREDUCTIONS-NEXT: add.w r12, r12, #1
; NOREDUCTIONS-NEXT: add.w r10, r10, #2
-; NOREDUCTIONS-NEXT: cmp r8, r3
+; NOREDUCTIONS-NEXT: cmp r12, r3
; NOREDUCTIONS-NEXT: beq .LBB0_8
; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body
; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1
; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2
-; NOREDUCTIONS-NEXT: cmp r2, r8
+; NOREDUCTIONS-NEXT: cmp r2, r12
; NOREDUCTIONS-NEXT: ble .LBB0_2
; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: bic r0, r9, #3
; NOREDUCTIONS-NEXT: movs r7, #1
; NOREDUCTIONS-NEXT: subs r0, #4
-; NOREDUCTIONS-NEXT: sub.w r4, r2, r8
+; NOREDUCTIONS-NEXT: sub.w r4, r2, r12
; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0
; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2
-; NOREDUCTIONS-NEXT: sub.w r0, r12, r8
+; NOREDUCTIONS-NEXT: adds r0, r2, #3
+; NOREDUCTIONS-NEXT: sub.w r0, r0, r12
; NOREDUCTIONS-NEXT: bic r0, r0, #3
; NOREDUCTIONS-NEXT: subs r0, #4
; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2
; NOREDUCTIONS-NEXT: mov r7, r10
; NOREDUCTIONS-NEXT: dls lr, r0
-; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload
+; NOREDUCTIONS-NEXT: mov r0, r11
; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body
; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1
; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2
@@ -153,9 +153,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: vpsel q0, q1, q0
; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0
; NOREDUCTIONS-NEXT: b .LBB0_3
-; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17
-; NOREDUCTIONS-NEXT: add sp, #4
-; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; NOREDUCTIONS-NEXT: .LBB0_8:
+; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: bx lr
entry:
%conv = sext i16 %N to i32
%cmp36 = icmp sgt i16 %N, 0
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
index e0a61b1..78dc35b 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
@@ -49,18 +49,17 @@ define i32 @vcmp_new_vpst_combination(i32 %len, ptr nocapture readonly %arr) {
; CHECK-NEXT: cmp r0, #1
; CHECK-NEXT: blt .LBB1_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vmov.i32 q1, #0x1
+; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.32 lr, r0
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vcmp.i32 ne, q2, zr
-; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
+; CHECK-NEXT: vcmp.i32 ne, q1, zr
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmovt q2, q1
-; CHECK-NEXT: vaddva.u32 r2, q2
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vaddva.u32 r2, q1
; CHECK-NEXT: letp lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index c8dd949..a904347 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -993,10 +993,10 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #24
-; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: .pad #20
+; CHECK-NEXT: sub sp, #20
; CHECK-NEXT: cmp r3, #8
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: blo.w .LBB16_12
; CHECK-NEXT: @ %bb.1: @ %if.then
; CHECK-NEXT: lsrs.w r12, r3, #2
@@ -1016,50 +1016,48 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: subs r1, r7, #2
; CHECK-NEXT: rsbs r7, r4, #0
-; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: add.w r7, r3, #16
-; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_6
; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: add.w r5, r5, r0, lsl #1
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add.w r6, r6, r0, lsl #1
; CHECK-NEXT: b .LBB16_5
; CHECK-NEXT: .LBB16_4: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: wls lr, r0, .LBB16_5
; CHECK-NEXT: b .LBB16_10
; CHECK-NEXT: .LBB16_5: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #8
-; CHECK-NEXT: add.w r0, r5, r0, lsl #1
+; CHECK-NEXT: add.w r0, r6, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: .LBB16_6: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: ldrh.w lr, [r3, #14]
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
-; CHECK-NEXT: ldrh.w r8, [r3, #12]
+; CHECK-NEXT: ldrh.w r10, [r3, #12]
; CHECK-NEXT: ldrh r7, [r3, #10]
; CHECK-NEXT: ldrh r4, [r3, #8]
; CHECK-NEXT: ldrh r6, [r3, #6]
; CHECK-NEXT: ldrh.w r9, [r3, #4]
; CHECK-NEXT: ldrh.w r11, [r3, #2]
-; CHECK-NEXT: ldrh.w r10, [r3]
+; CHECK-NEXT: ldrh.w r8, [r3]
; CHECK-NEXT: vstrb.8 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q0, [r5]
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: adds r0, r5, #2
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmul.f16 q0, q0, r10
+; CHECK-NEXT: vmul.f16 q0, q0, r8
; CHECK-NEXT: adds r0, r5, #6
; CHECK-NEXT: vfma.f16 q0, q1, r11
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
@@ -1068,73 +1066,73 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: add.w r0, r5, #10
; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT: add.w r6, r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: add.w r0, r5, #14
; CHECK-NEXT: vfma.f16 q0, q1, r7
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
-; CHECK-NEXT: adds r5, #16
-; CHECK-NEXT: vfma.f16 q0, q1, r8
+; CHECK-NEXT: vfma.f16 q0, q1, r10
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vfma.f16 q0, q1, lr
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: blo .LBB16_9
; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r5, r3, #16
; CHECK-NEXT: dls lr, r0
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r0, [r6], #16
-; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: adds r4, r5, #2
+; CHECK-NEXT: ldrh r0, [r5], #16
+; CHECK-NEXT: vldrw.u32 q1, [r6]
+; CHECK-NEXT: adds r4, r6, #2
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-14]
-; CHECK-NEXT: adds r4, r5, #6
+; CHECK-NEXT: ldrh r0, [r5, #-14]
+; CHECK-NEXT: adds r4, r6, #6
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-12]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
+; CHECK-NEXT: ldrh r0, [r5, #-12]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-10]
-; CHECK-NEXT: add.w r4, r5, #10
+; CHECK-NEXT: ldrh r0, [r5, #-10]
+; CHECK-NEXT: add.w r4, r6, #10
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-8]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT: ldrh r0, [r5, #-8]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #8]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-6]
-; CHECK-NEXT: ldrh r4, [r6, #-2]
+; CHECK-NEXT: ldrh r0, [r5, #-6]
+; CHECK-NEXT: ldrh r4, [r5, #-2]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-4]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
+; CHECK-NEXT: ldrh r0, [r5, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #12]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: add.w r0, r5, #14
+; CHECK-NEXT: add.w r0, r6, #14
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: adds r5, #16
+; CHECK-NEXT: adds r6, #16
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add.w r5, r3, #16
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: .LBB16_11: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r4, [r6], #2
+; CHECK-NEXT: ldrh r4, [r5], #2
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_11
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_12: @ %if.end
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #20
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 28166e4..f7b4548 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -995,46 +995,44 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: ldrh r6, [r0]
-; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: ldrd r4, r10, [r0, #4]
+; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: ldrd r7, r10, [r0, #4]
; CHECK-NEXT: sub.w r0, r6, #8
; CHECK-NEXT: add.w r3, r0, r0, lsr #29
; CHECK-NEXT: and r0, r0, #7
-; CHECK-NEXT: asrs r7, r3, #3
-; CHECK-NEXT: cmp r7, #1
+; CHECK-NEXT: asrs r5, r3, #3
+; CHECK-NEXT: cmp r5, #1
; CHECK-NEXT: it gt
-; CHECK-NEXT: asrgt r5, r3, #3
-; CHECK-NEXT: add.w r3, r4, r6, lsl #2
+; CHECK-NEXT: asrgt r4, r3, #3
+; CHECK-NEXT: add.w r3, r7, r6, lsl #2
; CHECK-NEXT: sub.w r9, r3, #4
; CHECK-NEXT: rsbs r3, r6, #0
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: add.w r3, r10, #32
-; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r4, [sp] @ 4-byte Spill
+; CHECK-NEXT: str r6, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_6
; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add.w r4, r4, r0, lsl #2
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add.w r7, r7, r0, lsl #2
; CHECK-NEXT: b .LBB16_5
; CHECK-NEXT: .LBB16_4: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldrd r0, r9, [sp, #12] @ 8-byte Folded Reload
; CHECK-NEXT: wls lr, r0, .LBB16_5
; CHECK-NEXT: b .LBB16_10
; CHECK-NEXT: .LBB16_5: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
-; CHECK-NEXT: add.w r0, r4, r0, lsl #2
-; CHECK-NEXT: add.w r4, r0, #16
+; CHECK-NEXT: add.w r0, r7, r0, lsl #2
+; CHECK-NEXT: add.w r7, r0, #16
; CHECK-NEXT: beq .LBB16_12
; CHECK-NEXT: .LBB16_6: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -1042,76 +1040,76 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
; CHECK-NEXT: add.w lr, r10, #8
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
-; CHECK-NEXT: ldrd r3, r7, [r10]
+; CHECK-NEXT: ldrd r3, r4, [r10]
; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr}
; CHECK-NEXT: ldrd r11, r8, [r10, #24]
; CHECK-NEXT: vstrb.8 q0, [r9], #16
-; CHECK-NEXT: vldrw.u32 q0, [r4], #32
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: str.w r9, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
+; CHECK-NEXT: vldrw.u32 q0, [r7], #32
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str.w r9, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-28]
; CHECK-NEXT: vmul.f32 q0, q0, r3
-; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
-; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
-; CHECK-NEXT: vfma.f32 q0, q1, r7
-; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
+; CHECK-NEXT: vldrw.u32 q6, [r7, #-24]
+; CHECK-NEXT: vldrw.u32 q4, [r7, #-20]
+; CHECK-NEXT: vfma.f32 q0, q1, r4
+; CHECK-NEXT: vldrw.u32 q5, [r7, #-16]
; CHECK-NEXT: vfma.f32 q0, q6, r0
-; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT: vldrw.u32 q2, [r7, #-12]
; CHECK-NEXT: vfma.f32 q0, q4, r5
-; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
+; CHECK-NEXT: vldrw.u32 q3, [r7, #-8]
; CHECK-NEXT: vfma.f32 q0, q5, r6
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-4]
; CHECK-NEXT: vfma.f32 q0, q2, lr
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vfma.f32 q0, q3, r11
; CHECK-NEXT: vfma.f32 q0, q1, r8
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: blo .LBB16_9
; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r10, #32
; CHECK-NEXT: dls lr, r0
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11}
-; CHECK-NEXT: vldrw.u32 q1, [r4], #32
-; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
-; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
+; CHECK-NEXT: ldm.w r4, {r0, r3, r5, r6, r8, r11}
+; CHECK-NEXT: vldrw.u32 q1, [r7], #32
+; CHECK-NEXT: vldrw.u32 q6, [r7, #-24]
+; CHECK-NEXT: vldrw.u32 q4, [r7, #-20]
; CHECK-NEXT: vfma.f32 q0, q1, r0
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
-; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
-; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-28]
+; CHECK-NEXT: vldrw.u32 q5, [r7, #-16]
+; CHECK-NEXT: vldrw.u32 q2, [r7, #-12]
; CHECK-NEXT: vfma.f32 q0, q1, r3
-; CHECK-NEXT: ldrd r9, r1, [r7, #24]
+; CHECK-NEXT: ldrd r9, r1, [r4, #24]
; CHECK-NEXT: vfma.f32 q0, q6, r5
-; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
+; CHECK-NEXT: vldrw.u32 q3, [r7, #-8]
; CHECK-NEXT: vfma.f32 q0, q4, r6
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-4]
; CHECK-NEXT: vfma.f32 q0, q5, r8
-; CHECK-NEXT: adds r7, #32
+; CHECK-NEXT: adds r4, #32
; CHECK-NEXT: vfma.f32 q0, q2, r11
; CHECK-NEXT: vfma.f32 q0, q3, r9
; CHECK-NEXT: vfma.f32 q0, q1, r1
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r10, #32
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: mov r3, r4
+; CHECK-NEXT: mov r3, r7
; CHECK-NEXT: .LBB16_11: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldr r0, [r7], #4
+; CHECK-NEXT: ldr r0, [r4], #4
; CHECK-NEXT: vldrw.u32 q1, [r3], #4
; CHECK-NEXT: vfma.f32 q0, q1, r0
; CHECK-NEXT: le lr, .LBB16_11
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_12:
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index e8b49c1..0d86f22 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -711,8 +711,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #136
-; CHECK-NEXT: sub sp, #136
+; CHECK-NEXT: .pad #120
+; CHECK-NEXT: sub sp, #120
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill
; CHECK-NEXT: blt.w .LBB14_5
@@ -725,22 +725,20 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: subs r1, #8
; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q2, #0x18
; CHECK-NEXT: add.w r1, r2, r1, lsr #3
; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill
; CHECK-NEXT: adr r1, .LCPI14_0
; CHECK-NEXT: adr r2, .LCPI14_1
; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r2]
-; CHECK-NEXT: add r2, sp, #120
+; CHECK-NEXT: add r2, sp, #104
; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill
; CHECK-NEXT: .LBB14_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_3 Depth 2
; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, sp, #104
+; CHECK-NEXT: add.w r10, sp, #88
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload
; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload
@@ -762,7 +760,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: vmov r6, r2, d4
; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: ldrh.w r12, [r4]
-; CHECK-NEXT: add r4, sp, #88
+; CHECK-NEXT: add r4, sp, #72
; CHECK-NEXT: ldrh.w r11, [r5]
; CHECK-NEXT: ldrh r3, [r3]
; CHECK-NEXT: ldrh r5, [r6]
@@ -807,7 +805,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: vmov.16 q3[0], r2
; CHECK-NEXT: vmov.16 q3[1], r5
; CHECK-NEXT: vmov r2, r5, d5
-; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vmov.i16 q2, #0x18
; CHECK-NEXT: vadd.i16 q6, q6, q2
; CHECK-NEXT: vadd.i16 q5, q5, q2
; CHECK-NEXT: vadd.i16 q4, q4, q2
@@ -849,7 +847,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: cmp r1, r3
; CHECK-NEXT: bne.w .LBB14_2
; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #136
+; CHECK-NEXT: add sp, #120
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -950,7 +948,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: adr r6, .LCPI15_9
-; CHECK-NEXT: vmov.i32 q2, #0x30
; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r7]
; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
@@ -963,212 +960,213 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
; CHECK-NEXT: .LBB15_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB15_3 Depth 2
+; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: adr r1, .LCPI15_3
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: adr r1, .LCPI15_4
; CHECK-NEXT: vldrw.u32 q5, [r1]
+; CHECK-NEXT: adr r1, .LCPI15_4
+; CHECK-NEXT: vstrw.32 q2, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [r1]
; CHECK-NEXT: adr r1, .LCPI15_2
-; CHECK-NEXT: vldrw.u32 q3, [r1]
+; CHECK-NEXT: vstrw.32 q2, [sp, #280] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: adr r1, .LCPI15_10
-; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q3, [r1]
; CHECK-NEXT: adr r1, .LCPI15_11
; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q7, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
; CHECK-NEXT: mov r11, r10
-; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q0, [sp, #216] @ 16-byte Spill
; CHECK-NEXT: .LBB15_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vadd.i32 q4, q1, r0
-; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill
-; CHECK-NEXT: vmov r1, lr, d8
-; CHECK-NEXT: vadd.i32 q7, q7, r0
-; CHECK-NEXT: vmov r5, r4, d15
-; CHECK-NEXT: vadd.i32 q6, q0, r0
-; CHECK-NEXT: vmov r6, r7, d13
+; CHECK-NEXT: vmov q0, q7
+; CHECK-NEXT: vstrw.32 q7, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q7, q5, r0
+; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q5, q0, r0
+; CHECK-NEXT: vmov q0, q6
+; CHECK-NEXT: vadd.i32 q6, q4, r0
+; CHECK-NEXT: vmov r5, r4, d11
+; CHECK-NEXT: vmov r1, lr, d12
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vmov r6, r7, d15
; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vmov q1, q3
+; CHECK-NEXT: vstrw.32 q4, [sp, #168] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q4, [sp, #248] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #120] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q3, [sp, #136] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q3, [sp, #184] @ 16-byte Reload
; CHECK-NEXT: subs.w r11, r11, #16
-; CHECK-NEXT: ldrb.w r9, [r1]
-; CHECK-NEXT: vmov r1, r3, d14
; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: ldrb.w r9, [r1]
+; CHECK-NEXT: vmov r1, r3, d10
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[0], r1
+; CHECK-NEXT: vmov.8 q5[0], r1
; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[1], r1
-; CHECK-NEXT: vmov r1, r3, d12
-; CHECK-NEXT: vmov.8 q7[2], r5
+; CHECK-NEXT: vmov.8 q5[1], r1
+; CHECK-NEXT: vmov r1, r3, d14
+; CHECK-NEXT: vmov.8 q5[2], r5
; CHECK-NEXT: ldrb r5, [r6]
; CHECK-NEXT: ldrb r6, [r4]
-; CHECK-NEXT: vmov.8 q7[3], r6
+; CHECK-NEXT: vmov.8 q5[3], r6
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[0], r1
-; CHECK-NEXT: vmov r6, r1, d2
-; CHECK-NEXT: vmov.8 q6[1], r3
-; CHECK-NEXT: vmov.8 q6[2], r5
-; CHECK-NEXT: vmov.8 q6[3], r7
+; CHECK-NEXT: vmov.8 q7[0], r1
+; CHECK-NEXT: vmov r6, r1, d4
+; CHECK-NEXT: vmov.8 q7[1], r3
+; CHECK-NEXT: vmov.8 q7[2], r5
+; CHECK-NEXT: vmov.8 q7[3], r7
; CHECK-NEXT: ldrb.w r7, [lr]
-; CHECK-NEXT: vmov.8 q6[4], r9
-; CHECK-NEXT: vmov.8 q6[5], r7
+; CHECK-NEXT: vmov.8 q7[4], r9
+; CHECK-NEXT: vmov.8 q7[5], r7
; CHECK-NEXT: ldrb r4, [r1]
-; CHECK-NEXT: vmov r1, r5, d3
-; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload
+; CHECK-NEXT: vmov r1, r5, d5
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vldrw.u32 q1, [sp, #280] @ 16-byte Reload
; CHECK-NEXT: ldrb.w r12, [r1]
-; CHECK-NEXT: vmov r1, r3, d9
+; CHECK-NEXT: vmov r1, r3, d13
; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #232] @ 16-byte Reload
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[6], r1
-; CHECK-NEXT: vmov r1, r7, d0
-; CHECK-NEXT: vmov.8 q6[7], r3
+; CHECK-NEXT: vmov.8 q7[6], r1
+; CHECK-NEXT: vmov r1, r7, d4
+; CHECK-NEXT: vmov.8 q7[7], r3
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: vmov.8 q7[4], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[5], r7
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vmov.8 q5[4], r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vmov.8 q5[5], r7
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q7[6], r1
+; CHECK-NEXT: vmov.8 q5[6], r1
; CHECK-NEXT: ldrb r1, [r6]
-; CHECK-NEXT: vmov r7, r6, d0
-; CHECK-NEXT: vmov.8 q7[7], r3
-; CHECK-NEXT: vmov r3, lr, d1
-; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[8], r1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov.8 q7[9], r4
-; CHECK-NEXT: vmov r4, r1, d0
-; CHECK-NEXT: vmov.8 q7[10], r12
-; CHECK-NEXT: vmov.8 q7[11], r5
+; CHECK-NEXT: vmov.8 q5[7], r3
+; CHECK-NEXT: vmov r7, r6, d4
+; CHECK-NEXT: vmov r3, lr, d5
+; CHECK-NEXT: vmov.8 q5[8], r1
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vmov.8 q5[9], r4
+; CHECK-NEXT: vmov r4, r1, d4
+; CHECK-NEXT: vmov.8 q5[10], r12
+; CHECK-NEXT: vmov.8 q5[11], r5
+; CHECK-NEXT: vldrw.u32 q1, [sp, #264] @ 16-byte Reload
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: ldrb r6, [r6]
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q6[8], r4
-; CHECK-NEXT: vmov r5, r4, d1
-; CHECK-NEXT: vmov.8 q6[9], r1
-; CHECK-NEXT: vadd.i32 q0, q5, r0
-; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
+; CHECK-NEXT: vmov.8 q7[8], r4
+; CHECK-NEXT: vmov r5, r4, d5
+; CHECK-NEXT: vmov.8 q7[9], r1
+; CHECK-NEXT: vadd.i32 q2, q0, r0
+; CHECK-NEXT: vldrw.u32 q0, [sp, #216] @ 16-byte Reload
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: vmov.8 q6[10], r5
-; CHECK-NEXT: vmov.8 q6[11], r4
-; CHECK-NEXT: vmov.8 q6[12], r7
-; CHECK-NEXT: vmov.8 q6[13], r6
-; CHECK-NEXT: vmov.8 q6[14], r3
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov.8 q7[10], r5
+; CHECK-NEXT: vmov.8 q7[11], r4
+; CHECK-NEXT: vmov.8 q7[12], r7
+; CHECK-NEXT: vmov.8 q7[13], r6
+; CHECK-NEXT: vmov.8 q7[14], r3
+; CHECK-NEXT: vmov r1, r3, d4
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[12], r1
+; CHECK-NEXT: vmov.8 q5[12], r1
; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q1, r0
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vmov.8 q5[13], r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q1, r0
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[14], r1
+; CHECK-NEXT: vmov.8 q5[14], r1
; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[15], r1
+; CHECK-NEXT: vmov.8 q5[15], r1
; CHECK-NEXT: ldrb.w r1, [lr]
-; CHECK-NEXT: vmov.8 q6[15], r1
-; CHECK-NEXT: vmov r1, r3, d0
-; CHECK-NEXT: vadd.i8 q6, q6, q7
+; CHECK-NEXT: vmov.8 q7[15], r1
+; CHECK-NEXT: vmov r1, r3, d4
+; CHECK-NEXT: vadd.i8 q5, q7, q5
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: vmov.8 q7[0], r1
; CHECK-NEXT: vmov.8 q7[1], r3
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q3, r0
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q4, r0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[2], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[3], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[4], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[5], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q5, r0
-; CHECK-NEXT: vadd.i32 q5, q5, q2
-; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q5, q5, q2
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q6, r0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[6], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[7], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[8], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[9], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q4, r0
-; CHECK-NEXT: vadd.i32 q4, q4, q2
-; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[10], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[11], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[12], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d1
+; CHECK-NEXT: vmov r1, r3, d5
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[14], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[15], r1
-; CHECK-NEXT: vadd.i8 q0, q6, q7
-; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload
-; CHECK-NEXT: vstrb.8 q0, [r8], #16
-; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q7, q7, q2
+; CHECK-NEXT: vadd.i8 q2, q5, q7
+; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
+; CHECK-NEXT: vstrb.8 q2, [r8], #16
+; CHECK-NEXT: vmov.i32 q2, #0x30
+; CHECK-NEXT: vadd.i32 q6, q6, q2
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [sp, #296] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vadd.i32 q4, q4, q2
+; CHECK-NEXT: vadd.i32 q6, q6, q2
; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vmov q7, q3
+; CHECK-NEXT: vldrw.u32 q3, [sp, #136] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q1, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q4, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q4, [sp, #168] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q6, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [sp, #120] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #216] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q5, q5, q2
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vadd.i32 q4, q4, q2
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vadd.i32 q6, q6, q2
+; CHECK-NEXT: vstrw.32 q0, [sp, #280] @ 16-byte Spill
; CHECK-NEXT: bne.w .LBB15_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1
@@ -1501,14 +1499,14 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: blt .LBB18_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: adr.w lr, .LCPI18_0
+; CHECK-NEXT: adr r3, .LCPI18_0
; CHECK-NEXT: adr r4, .LCPI18_1
; CHECK-NEXT: adr r5, .LCPI18_2
; CHECK-NEXT: adr r6, .LCPI18_3
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: vldrw.u32 q1, [r5]
; CHECK-NEXT: vldrw.u32 q2, [r4]
-; CHECK-NEXT: vldrw.u32 q3, [lr]
+; CHECK-NEXT: vldrw.u32 q3, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r1
; CHECK-NEXT: vadd.i32 q1, q1, r1
; CHECK-NEXT: vadd.i32 q2, q2, r1
diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index dad856c..00a998c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -38,7 +38,7 @@ define arm_aapcs_vfpcc void @k() {
; CHECK-NEXT: vmov.i32 q5, #0x0
; CHECK-NEXT: vpsel q6, q4, q3
; CHECK-NEXT: vstrh.16 q6, [r0]
-; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vmov.i32 q6, #0x0
; CHECK-NEXT: cbz r1, .LBB0_2
; CHECK-NEXT: le .LBB0_1
; CHECK-NEXT: .LBB0_2: @ %for.cond4.preheader
@@ -135,12 +135,12 @@ vector.body115: ; preds = %vector.body115, %ve
define dso_local i32 @e() #0 {
; CHECK-LABEL: e:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #408
-; CHECK-NEXT: sub sp, #408
+; CHECK-NEXT: .pad #392
+; CHECK-NEXT: sub sp, #392
; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals
; CHECK-NEXT: vldr s15, .LCPI1_1
; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals
@@ -148,18 +148,16 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: mov r4, r7
; CHECK-NEXT: mov r3, r7
; CHECK-NEXT: ldr r6, [r4, #8]!
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: ldr r0, [r3, #4]!
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: movt r2, :upper16:e
+; CHECK-NEXT: ldr r0, [r3, #4]!
; CHECK-NEXT: vmov r5, s15
; CHECK-NEXT: vmov q0[2], q0[0], r4, r4
-; CHECK-NEXT: vmov s13, r3
; CHECK-NEXT: vldr s12, .LCPI1_0
+; CHECK-NEXT: vmov s13, r3
; CHECK-NEXT: vmov q0[3], q0[1], r5, r2
; CHECK-NEXT: vdup.32 q7, r3
; CHECK-NEXT: vmov q6[2], q6[0], r3, r5
-; CHECK-NEXT: vstrw.32 q0, [sp, #92]
+; CHECK-NEXT: vstrw.32 q0, [sp, #76]
; CHECK-NEXT: vmov q0, q7
; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
; CHECK-NEXT: vmov q4, q7
@@ -168,7 +166,7 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: vmov s21, r2
; CHECK-NEXT: movs r1, #64
; CHECK-NEXT: vmov.f32 s20, s12
-; CHECK-NEXT: str r0, [sp, #40]
+; CHECK-NEXT: str r0, [sp, #24]
; CHECK-NEXT: vmov.f32 s22, s13
; CHECK-NEXT: str r6, [r0]
; CHECK-NEXT: vmov.f32 s23, s15
@@ -186,12 +184,12 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: vmov q2[3], q2[1], r4, r5
; CHECK-NEXT: vmov.32 q4[0], r8
; CHECK-NEXT: @ implicit-def: $r2
-; CHECK-NEXT: str.w r8, [sp, #44]
-; CHECK-NEXT: vstrw.32 q3, [sp, #60]
-; CHECK-NEXT: strh.w r12, [sp, #406]
+; CHECK-NEXT: str.w r8, [sp, #28]
+; CHECK-NEXT: vstrw.32 q3, [sp, #44]
+; CHECK-NEXT: strh.w r12, [sp, #390]
; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2
; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: letp lr, .LBB1_1
; CHECK-NEXT: .LBB1_2: @ %entry
@@ -199,7 +197,7 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: str.w r8, [r7]
; CHECK-NEXT: vstrw.32 q4, [r0]
; CHECK-NEXT: vstrw.32 q2, [r0]
-; CHECK-NEXT: str.w r12, [sp, #324]
+; CHECK-NEXT: str.w r12, [sp, #308]
; CHECK-NEXT: .LBB1_3: @ %for.cond
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: b .LBB1_3
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index f90af3c..2587a0bb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -115,17 +115,17 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: add.w r11, r3, r12, lsl #2
-; CHECK-NEXT: add.w r7, r3, r12, lsl #3
-; CHECK-NEXT: lsl.w r9, r12, #3
+; CHECK-NEXT: add.w r6, r3, r12, lsl #3
+; CHECK-NEXT: lsl.w r10, r12, #3
; CHECK-NEXT: .LBB1_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r9, r4, #1
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: add.w r10, r4, #1
; CHECK-NEXT: mov r3, r11
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB1_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1
@@ -139,11 +139,11 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: vadd.f32 s2, s2, s3
-; CHECK-NEXT: add.w r0, r2, r10, lsl #2
+; CHECK-NEXT: add.w r0, r2, r9, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
-; CHECK-NEXT: add r11, r9
+; CHECK-NEXT: add r11, r10
; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: add r7, r9
+; CHECK-NEXT: add r6, r10
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vadd.f32 s2, s4, s6
@@ -228,46 +228,40 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: .pad #24
-; CHECK-NEXT: sub sp, #24
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
-; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: subs r1, #3
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB2_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r9, [r0, #8]
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r0, r3, r3, lsl #1
-; CHECK-NEXT: add.w r9, r1, r3, lsl #2
-; CHECK-NEXT: add.w r12, r1, r3, lsl #3
-; CHECK-NEXT: adds r3, #3
+; CHECK-NEXT: add.w r3, r9, #3
; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, r1, r0, lsl #2
+; CHECK-NEXT: add.w r0, r9, r9, lsl #1
; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: add.w r10, r1, r9, lsl #2
+; CHECK-NEXT: add.w r12, r1, r9, lsl #3
+; CHECK-NEXT: add.w r1, r1, r0, lsl #2
+; CHECK-NEXT: add.w r3, r5, r3, lsr #2
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: lsl.w r11, r0, #2
-; CHECK-NEXT: add.w r1, r5, r3, lsr #2
-; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
-; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: adds r0, r5, #2
-; CHECK-NEXT: adds r2, r5, #1
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: mov r3, r9
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: mov r0, r12
-; CHECK-NEXT: mov r4, r10
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: mov r4, r1
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: dlstp.32 lr, r9
; CHECK-NEXT: .LBB2_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
@@ -282,31 +276,31 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: vadd.f32 s10, s10, s11
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r5, #1
; CHECK-NEXT: vadd.f32 s8, s8, s9
-; CHECK-NEXT: add r9, r11
+; CHECK-NEXT: add r10, r11
; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: add.w r0, r1, r2, lsl #2
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: add r12, r11
; CHECK-NEXT: vadd.f32 s2, s2, s3
-; CHECK-NEXT: add r10, r11
+; CHECK-NEXT: add r1, r11
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vstr s8, [r0]
-; CHECK-NEXT: add.w r0, r1, r5, lsl #2
-; CHECK-NEXT: adds r5, #3
+; CHECK-NEXT: add.w r0, r2, r5, lsl #2
; CHECK-NEXT: vstr s4, [r0]
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add.w r0, r1, r0, lsl #2
+; CHECK-NEXT: adds r0, r5, #2
+; CHECK-NEXT: adds r5, #3
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: blo .LBB2_2
; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -394,15 +388,15 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: .pad #40
-; CHECK-NEXT: sub sp, #40
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
-; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: subs r1, #4
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
-; CHECK-NEXT: blo.w .LBB3_5
+; CHECK-NEXT: blo .LBB3_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r2, [r0, #8]
; CHECK-NEXT: movs r6, #1
@@ -410,34 +404,28 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: add.w r0, r2, r2, lsl #1
; CHECK-NEXT: add.w r12, r1, r2, lsl #2
; CHECK-NEXT: add.w r8, r1, r2, lsl #3
-; CHECK-NEXT: add.w r9, r1, r2, lsl #4
-; CHECK-NEXT: add.w r11, r1, r0, lsl #2
+; CHECK-NEXT: add.w r10, r1, r2, lsl #4
+; CHECK-NEXT: add.w r9, r1, r0, lsl #2
; CHECK-NEXT: adds r0, r2, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: add.w r0, r6, r0, lsr #2
-; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r0, r2, [sp, #4] @ 8-byte Folded Spill
; CHECK-NEXT: lsls r0, r2, #4
-; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldrd r2, r7, [sp, #4] @ 8-byte Folded Reload
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB3_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
-; CHECK-NEXT: adds r0, r6, #3
-; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r0, r6, #2
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r0, r6, #1
-; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: mov r5, r11
-; CHECK-NEXT: mov r4, r9
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: mov r5, r9
+; CHECK-NEXT: mov r4, r10
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB3_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
@@ -455,9 +443,9 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT: vadd.f32 s14, s14, s15
-; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s12, s12, s13
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #1
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
@@ -471,24 +459,24 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vstr s12, [r0]
; CHECK-NEXT: add.w r0, r1, r6, lsl #2
-; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vstr s8, [r0]
-; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #2
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s4, [r0]
-; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #3
+; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
; CHECK-NEXT: add r12, r0
; CHECK-NEXT: add r8, r0
-; CHECK-NEXT: add r11, r0
; CHECK-NEXT: add r9, r0
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add r10, r0
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: blo .LBB3_2
; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #40
+; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -588,60 +576,53 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #5
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB4_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r12, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r12, #3
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r8, r1, r3, lsl #2
+; CHECK-NEXT: add.w r8, r1, r12, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r12, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r3, r3, lsl #2
-; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: add.w r1, r12, r12, lsl #2
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB4_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: add.w r10, r0, #2
+; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add.w r11, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB4_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r9, r3, r5
; CHECK-NEXT: vldrw.u32 q5, [r4], #16
; CHECK-NEXT: vldrw.u32 q6, [r3], #16
-; CHECK-NEXT: add.w r12, r9, r5
+; CHECK-NEXT: add.w r10, r9, r5
; CHECK-NEXT: vfma.f32 q3, q6, q5
; CHECK-NEXT: vldrw.u32 q6, [r9]
-; CHECK-NEXT: add.w r6, r12, r5
+; CHECK-NEXT: add.w r6, r10, r5
; CHECK-NEXT: vfma.f32 q4, q6, q5
-; CHECK-NEXT: vldrw.u32 q6, [r12]
+; CHECK-NEXT: vldrw.u32 q6, [r10]
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vfma.f32 q2, q6, q5
; CHECK-NEXT: vldrw.u32 q6, [r6]
@@ -662,30 +643,31 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s1, s16, s18
-; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: vadd.f32 s12, s12, s14
+; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vadd.f32 s6, s8, s10
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
-; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: adds r0, #5
; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: add.w r1, r2, r10, lsl #2
+; CHECK-NEXT: adds r1, r0, #2
+; CHECK-NEXT: vadd.f32 s0, s0, s2
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: adds r0, #5
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: blo.w .LBB4_2
+; CHECK-NEXT: blo .LBB4_2
; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -797,63 +779,54 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #6
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB5_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r12, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r12, #3
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r8, r1, r3, lsl #2
+; CHECK-NEXT: add.w r8, r1, r12, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r12, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r3, r3, lsl #1
+; CHECK-NEXT: add.w r1, r12, r12, lsl #1
; CHECK-NEXT: lsls r1, r1, #3
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB5_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: add.w r11, r0, #2
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmov q5, q1
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB5_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: add.w r12, r3, r5
+; CHECK-NEXT: add.w r10, r3, r5
; CHECK-NEXT: vldrw.u32 q6, [r1], #16
; CHECK-NEXT: vldrw.u32 q7, [r3], #16
-; CHECK-NEXT: add.w r10, r12, r5
+; CHECK-NEXT: add.w r11, r10, r5
; CHECK-NEXT: vfma.f32 q4, q7, q6
-; CHECK-NEXT: vldrw.u32 q7, [r12]
-; CHECK-NEXT: add.w r6, r10, r5
-; CHECK-NEXT: vfma.f32 q5, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r10]
+; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vfma.f32 q5, q7, q6
+; CHECK-NEXT: vldrw.u32 q7, [r11]
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vfma.f32 q2, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r6]
@@ -885,28 +858,29 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: vstr s3, [r1]
-; CHECK-NEXT: add.w r1, r2, r11, lsl #2
+; CHECK-NEXT: adds r1, r0, #2
; CHECK-NEXT: vadd.f32 s4, s4, s6
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: vadd.f32 s6, s12, s14
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB5_2
; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1030,73 +1004,64 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #72
-; CHECK-NEXT: sub sp, #72
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #48
+; CHECK-NEXT: sub sp, #48
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #7
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB6_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r10, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r10, #3
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r9, r1, r3, lsl #2
+; CHECK-NEXT: add.w r9, r1, r10, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r10, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: rsb r1, r3, r3, lsl #3
-; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: rsb r1, r10, r10, lsl #3
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB6_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #6
-; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vmov.i32 q2, #0x0
-; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #2
; CHECK-NEXT: add.w r8, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
; CHECK-NEXT: mov r3, r9
-; CHECK-NEXT: vmov q4, q2
-; CHECK-NEXT: vmov q5, q2
-; CHECK-NEXT: vmov q3, q2
-; CHECK-NEXT: vmov q6, q2
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: mov r12, r7
-; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r6
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q6, #0x0
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: mov r12, r10
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r7
; CHECK-NEXT: .LBB6_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: add.w r10, r3, r5
+; CHECK-NEXT: add.w r11, r3, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
-; CHECK-NEXT: add.w r11, r10, r5
+; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q5, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r10]
-; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vldrwt.u32 q0, [r11]
+; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q6, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r11]
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q1, q0, q7
@@ -1104,26 +1069,26 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q0, [r6]
-; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: adds r7, r6, r5
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: adds r6, r7, r5
-; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q1, q0, q7
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q3, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vmov q4, q5
-; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q4, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q0, q7
; CHECK-NEXT: le lr, .LBB6_3
@@ -1138,45 +1103,45 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
-; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s9, s18, s19
; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: vadd.f32 s2, s3, s1
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s5, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
+; CHECK-NEXT: vadd.f32 s2, s3, s1
; CHECK-NEXT: vadd.f32 s4, s4, s6
-; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s14, s14, s15
-; CHECK-NEXT: adds r0, #7
; CHECK-NEXT: vadd.f32 s12, s12, s13
-; CHECK-NEXT: vstr s2, [r1]
-; CHECK-NEXT: add.w r1, r2, r4, lsl #2
+; CHECK-NEXT: vstr s0, [r1]
+; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s6, s7, s5
-; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: vstr s2, [r1]
+; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s10, s11, s9
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: vadd.f32 s12, s12, s14
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s10, [r1]
-; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #6
+; CHECK-NEXT: adds r0, #7
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r9, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB6_2
; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #72
+; CHECK-NEXT: add sp, #48
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1312,107 +1277,99 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #88
-; CHECK-NEXT: sub sp, #88
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #64
+; CHECK-NEXT: sub sp, #64
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #8
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB7_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r11, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r11, #3
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r12, r1, r3, lsl #2
+; CHECK-NEXT: add.w r12, r1, r11, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r6, r3, #2
+; CHECK-NEXT: lsl.w r6, r11, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: lsls r1, r3, #5
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: lsl.w r1, r11, #5
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB7_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #7
-; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #6
-; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: vmov.i32 q3, #0x0
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: adds r4, r0, #3
; CHECK-NEXT: add.w r8, r0, #2
; CHECK-NEXT: adds r1, r0, #1
-; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: mov r3, r12
-; CHECK-NEXT: vmov q5, q3
-; CHECK-NEXT: vmov q6, q3
-; CHECK-NEXT: vmov q4, q3
-; CHECK-NEXT: vmov q7, q3
-; CHECK-NEXT: vmov q2, q3
-; CHECK-NEXT: mov r10, r7
-; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r5
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q6, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q7, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: mov r10, r11
+; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r7
; CHECK-NEXT: .LBB7_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r10
-; CHECK-NEXT: add.w r11, r3, r6
+; CHECK-NEXT: adds r5, r3, r6
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
-; CHECK-NEXT: add.w r5, r11, r6
+; CHECK-NEXT: adds r7, r5, r6
; CHECK-NEXT: sub.w r10, r10, #4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q6, q1, q0
-; CHECK-NEXT: vldrwt.u32 q1, [r11]
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q7, q1, q0
-; CHECK-NEXT: vmov q5, q3
-; CHECK-NEXT: vmov q3, q4
-; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: adds r7, r5, r6
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r7]
-; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: adds r5, r7, r6
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: adds r7, r5, r6
-; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT: vmov q2, q4
-; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: adds r5, r7, r6
-; CHECK-NEXT: vmov q3, q5
+; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vmov q2, q3
+; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vmov q4, q5
+; CHECK-NEXT: adds r7, r5, r6
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: add r5, r6
+; CHECK-NEXT: adds r5, r7, r6
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q5, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q3, q1, q0
; CHECK-NEXT: le lr, .LBB7_3
@@ -1425,12 +1382,12 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vadd.f32 s6, s24, s25
; CHECK-NEXT: vadd.f32 s5, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s9, s18, s19
; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s14, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s13, s18, s19
@@ -1445,33 +1402,33 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s3, s20, s21
-; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s12, s7, s5
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vstr s14, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: vadd.f32 s4, s3, s1
+; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vadd.f32 s4, s3, s1
; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #6
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #7
+; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r12, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB7_2
; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #88
+; CHECK-NEXT: add sp, #64
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
index 29c4fb9..413c4a1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
@@ -1496,15 +1496,14 @@ define void @vfmasq(ptr %x, ptr %y, i32 %n) {
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB34_1: @ %for.body.preheader
-; CHECK-NEXT: vmov.f32 q0, #1.000000e+01
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB34_2: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r1], #16
+; CHECK-NEXT: vmov.f32 q2, #1.000000e+01
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vstrw.32 q2, [r1], #16
; CHECK-NEXT: letp lr, .LBB34_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -1542,15 +1541,14 @@ define void @vfmas(ptr %s1, ptr %s2, i32 %N) {
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB35_1: @ %while.body.lr.ph
-; CHECK-NEXT: vmov.f32 q0, #1.000000e+01
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB35_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r0], #16
+; CHECK-NEXT: vmov.f32 q2, #1.000000e+01
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vstrw.32 q2, [r0], #16
; CHECK-NEXT: letp lr, .LBB35_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
index e845070..62482c1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -287,17 +287,17 @@ define void @shlor(ptr nocapture readonly %x, ptr noalias nocapture %y, i32 %n)
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: blt .LBB5_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: adr.w lr, .LCPI5_0
-; CHECK-NEXT: adr r4, .LCPI5_1
+; CHECK-NEXT: adr r4, .LCPI5_0
+; CHECK-NEXT: adr r3, .LCPI5_1
; CHECK-NEXT: adr r5, .LCPI5_2
; CHECK-NEXT: adr r6, .LCPI5_3
-; CHECK-NEXT: vldrw.u32 q2, [r4]
+; CHECK-NEXT: vldrw.u32 q2, [r3]
+; CHECK-NEXT: vldrw.u32 q3, [r4]
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q3, [lr]
+; CHECK-NEXT: vadd.i32 q2, q2, r1
; CHECK-NEXT: vadd.i32 q0, q0, r1
; CHECK-NEXT: vadd.i32 q1, q1, r1
-; CHECK-NEXT: vadd.i32 q2, q2, r1
; CHECK-NEXT: vadd.i32 q3, q3, r1
; CHECK-NEXT: mov.w r12, #1
; CHECK-NEXT: movs r4, #3
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index f9948db..c92c2be 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -656,14 +656,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vcmp.i8 eq, q1, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q1, #0xff
; CHECK-NEXT: vpsel q5, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.u8 r0, q5[0]
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q5[1]
@@ -706,7 +704,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
; CHECK-NEXT: orrs r1, r3
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, r3, d15
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
; CHECK-NEXT: vmov.u8 r2, q2[3]
@@ -785,6 +783,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vpsel q6, q1, q7
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: vmov r2, r3, d12
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -853,7 +852,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
@@ -2065,14 +2063,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vcmp.i8 eq, q1, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q1, #0xff
; CHECK-NEXT: vpsel q5, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.u8 r2, q5[0]
; CHECK-NEXT: vmov.16 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q5[1]
@@ -2115,7 +2111,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
; CHECK-NEXT: orr.w lr, lr, r3
; CHECK-NEXT: add r12, r2
; CHECK-NEXT: vmov r3, r2, d15
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vmov.u8 r2, q2[3]
@@ -2194,6 +2190,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vpsel q6, q1, q7
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: vmov r2, r3, d12
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -2264,7 +2261,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
; CHECK-NEXT: adc.w r3, r3, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r7, pc}
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index 63b1431..9f55183 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -817,16 +817,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vcmp.i8 eq, q2, zr
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vpsel q6, q2, q0
-; CHECK-NEXT: vmov q4, q0
+; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vmov.u8 r0, q6[0]
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov.16 q0[0], r0
; CHECK-NEXT: vmov.u8 r0, q6[1]
; CHECK-NEXT: vmov.16 q0[1], r0
@@ -842,9 +840,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.16 q0[6], r0
; CHECK-NEXT: vmov.u8 r0, q6[7]
; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
; CHECK-NEXT: vmov.u8 r2, q3[0]
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
; CHECK-NEXT: vpsel q7, q2, q4
; CHECK-NEXT: vmov.u16 r0, q7[2]
; CHECK-NEXT: vmov.u16 r1, q7[0]
@@ -895,7 +892,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0xff
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r2, r0, r2
; CHECK-NEXT: umull r1, r3, r1, r3
@@ -916,8 +913,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.u16 r3, q7[5]
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vpsel q0, q0, q4
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q0, q4, q0
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
; CHECK-NEXT: vmov q4[3], q4[1], r2, r3
@@ -932,7 +929,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov r2, s18
; CHECK-NEXT: vmov r0, s30
; CHECK-NEXT: vmov r1, s28
-; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0xff
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: umull r0, r2, r0, r2
; CHECK-NEXT: umull r1, r3, r1, r3
@@ -960,7 +957,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r2, r0, r2
; CHECK-NEXT: umull r1, r3, r1, r3
@@ -1041,7 +1038,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r2, r0, r2
; CHECK-NEXT: umull r1, r3, r1, r3
@@ -1062,7 +1059,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.u16 r3, q6[5]
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: vpsel q0, q0, q4
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
@@ -1117,7 +1114,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r7, pc}
entry:
@@ -1137,16 +1133,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vcmp.i8 eq, q2, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vpsel q5, q2, q0
-; CHECK-NEXT: vmov.s8 r2, q1[0]
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.u8 r0, q5[0]
-; CHECK-NEXT: vmov.s8 r3, q3[0]
+; CHECK-NEXT: vmov.s8 r2, q1[0]
; CHECK-NEXT: vmov.16 q4[0], r0
; CHECK-NEXT: vmov.u8 r0, q5[1]
; CHECK-NEXT: vmov.16 q4[1], r0
@@ -1162,9 +1156,9 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.16 q4[6], r0
; CHECK-NEXT: vmov.u8 r0, q5[7]
; CHECK-NEXT: vmov.16 q4[7], r0
-; CHECK-NEXT: smull r2, r3, r3, r2
+; CHECK-NEXT: vmov.s8 r3, q3[0]
; CHECK-NEXT: vcmp.i16 ne, q4, zr
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vpsel q6, q2, q0
; CHECK-NEXT: vmov.u16 r0, q6[2]
; CHECK-NEXT: vmov.u16 r1, q6[0]
@@ -1198,7 +1192,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.s8 r3, q3[3]
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
@@ -1219,7 +1213,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vpsel q6, q2, q7
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q6, q2, q0
; CHECK-NEXT: vmov r2, r3, d12
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -1273,17 +1268,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.16 q6[7], r2
; CHECK-NEXT: vmov.s8 r0, q1[8]
; CHECK-NEXT: vcmp.i16 ne, q6, zr
+; CHECK-NEXT: vmov.i8 q6, #0x0
+; CHECK-NEXT: vpsel q5, q2, q6
; CHECK-NEXT: vmov.s8 r1, q3[8]
-; CHECK-NEXT: vpsel q5, q2, q7
-; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov.u16 r2, q5[2]
; CHECK-NEXT: vmov.u16 r3, q5[0]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q5[3]
; CHECK-NEXT: vmov.u16 r3, q5[1]
+; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vpsel q6, q2, q7
+; CHECK-NEXT: vpsel q6, q2, q6
; CHECK-NEXT: vmov r2, r3, d12
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -1365,7 +1361,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r7, pc}
entry:
@@ -2296,16 +2291,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vcmp.i8 eq, q2, zr
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vpsel q6, q2, q0
-; CHECK-NEXT: vmov q4, q0
+; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vmov.u8 r2, q6[0]
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov.16 q0[0], r2
; CHECK-NEXT: vmov.u8 r2, q6[1]
; CHECK-NEXT: vmov.16 q0[1], r2
@@ -2321,9 +2314,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov.16 q0[6], r2
; CHECK-NEXT: vmov.u8 r2, q6[7]
; CHECK-NEXT: vmov.16 q0[7], r2
-; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
; CHECK-NEXT: vmov.u8 r4, q3[2]
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
; CHECK-NEXT: vpsel q7, q2, q4
; CHECK-NEXT: vmov.u16 r2, q7[2]
; CHECK-NEXT: vmov.u16 r3, q7[0]
@@ -2374,7 +2366,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: vmov r5, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0xff
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: umull r4, r5, r5, r4
@@ -2395,8 +2387,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov.u16 r4, q7[5]
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vpsel q0, q0, q4
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q0, q4, q0
; CHECK-NEXT: vmov r5, r4, d0
; CHECK-NEXT: vmov q4[2], q4[0], r5, r4
; CHECK-NEXT: vmov q4[3], q4[1], r5, r4
@@ -2411,7 +2403,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov r5, s18
; CHECK-NEXT: vmov r2, s30
; CHECK-NEXT: vmov r3, s28
-; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0xff
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r2, r5, r2, r5
; CHECK-NEXT: umull r3, r4, r3, r4
@@ -2439,7 +2431,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov r5, s2
; CHECK-NEXT: vmov r2, s18
; CHECK-NEXT: vmov r3, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: umull r2, r5, r2, r5
; CHECK-NEXT: umull r3, r4, r3, r4
@@ -2520,7 +2512,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov r5, s2
; CHECK-NEXT: vmov r2, s18
; CHECK-NEXT: vmov r3, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: umull r2, r5, r2, r5
; CHECK-NEXT: umull r3, r4, r3, r4
@@ -2541,7 +2533,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov.u16 r4, q6[5]
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: vpsel q0, q0, q4
; CHECK-NEXT: vmov r5, r4, d0
; CHECK-NEXT: vmov q4[2], q4[0], r5, r4
@@ -2598,7 +2590,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
@@ -2619,14 +2610,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vcmp.i8 eq, q2, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vpsel q5, q2, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.u8 r2, q5[0]
; CHECK-NEXT: vmov.s8 r4, q1[2]
; CHECK-NEXT: vmov.16 q4[0], r2
@@ -2676,7 +2665,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, r3, d15
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
; CHECK-NEXT: vmov.s8 r2, q1[3]
; CHECK-NEXT: vmov.s8 r3, q3[3]
@@ -2701,7 +2690,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vpsel q6, q2, q7
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q6, q2, q0
; CHECK-NEXT: vmov r5, r4, d12
; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
@@ -2755,17 +2745,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov.16 q6[7], r5
; CHECK-NEXT: vmov.s8 r2, q1[8]
; CHECK-NEXT: vcmp.i16 ne, q6, zr
+; CHECK-NEXT: vmov.i8 q6, #0x0
+; CHECK-NEXT: vpsel q5, q2, q6
; CHECK-NEXT: vmov.s8 r3, q3[8]
-; CHECK-NEXT: vpsel q5, q2, q7
-; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.u16 r5, q5[2]
; CHECK-NEXT: vmov.u16 r4, q5[0]
; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
; CHECK-NEXT: vmov.u16 r5, q5[3]
; CHECK-NEXT: vmov.u16 r4, q5[1]
+; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vpsel q6, q2, q7
+; CHECK-NEXT: vpsel q6, q2, q6
; CHECK-NEXT: vmov r5, r4, d12
; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
@@ -2849,7 +2840,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
index 0c349c3..cba394f 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
@@ -59,18 +59,18 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: [[MOVSX64rm32_:%[0-9]+]]:gr64_nosp = MOVSX64rm32 %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[MOV32rm2]].sub_32bit
; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
- ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_1]] :: (store (s64) into %stack.5)
; CHECK-NEXT: [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]]
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[MOV32rm3]]
- ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[COPY6]]
+ ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm3]]
; CHECK-NEXT: [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]]
; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8)
+ ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[COPY1]] :: (store (s64) into %stack.5)
; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6)
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]]
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY7]], [[MOVSX64rr32_2]], implicit-def dead $eflags
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY7]], [[MOVSX64rm32_]], implicit-def dead $eflags
- ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY7]], 0, $noreg
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY6]], [[MOVSX64rr32_2]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY6]], [[MOVSX64rm32_]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY6]], 0, $noreg
; CHECK-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9)
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[MOV32rm2]]
; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = IMUL64rr [[COPY8]], [[MOVSX64rr32_2]], implicit-def dead $eflags
@@ -87,8 +87,11 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
; CHECK-NEXT: CMP32rm [[MOV32rm4]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16)
- ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
- ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
+ ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2)
+ ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64_nosp = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
+ ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
+ ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16)
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm6]]
; CHECK-NEXT: JCC_1 %bb.5, 13, implicit $eflags
; CHECK-NEXT: JMP_1 %bb.3
; CHECK-NEXT: {{ $}}
@@ -98,9 +101,8 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm2]], [[MOVSX64rr32_]], implicit-def dead $eflags
; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm2]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1)
- ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[MOV64rm2]] :: (store (s64) into %stack.13)
- ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12)
- ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm5]]
+ ; CHECK-NEXT: [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12)
+ ; CHECK-NEXT: undef [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm7]]
; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
; CHECK-NEXT: JMP_1 %bb.6
@@ -123,40 +125,30 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: bb.6.for.body17:
; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit
- ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13)
- ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm7]], 1, [[MOVSX64rr32_]], 0, $noreg
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]]
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOV64rm1]]
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY4]]
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY6]]
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]]
- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]]
- ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]]
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64 = COPY [[MOV32rm2]]
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64 = COPY [[COPY1]]
- ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY18]], 1, [[COPY9]], 0, $noreg
- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]]
- ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2)
- ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[COPY19]], 0, $noreg
- ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = COPY [[COPY19]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY18]]
- ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr64_nosp = COPY [[COPY17]]
- ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY16]]
- ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY15]]
- ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY14]]
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[COPY13]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[COPY12]]
- ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = COPY [[COPY11]]
- ; CHECK-NEXT: [[MOV64rm8:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
- ; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = COPY [[COPY10]]
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY6]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]]
- ; CHECK-NEXT: PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]]
+ ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit
+ ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY9]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm2]], 1, [[MOVSX64rr32_]], 0, $noreg
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY10]].sub_32bit
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]]
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]]
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]]
+ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]]
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[COPY7]]
+ ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
+ ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[MOV64rm7]], 1, [[COPY10]], 0, $noreg
+ ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm]], 0, $noreg
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[COPY15]]
+ ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY14]]
+ ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY13]]
+ ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY12]]
+ ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = COPY [[COPY11]]
+ ; CHECK-NEXT: [[MOV32rm8:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16)
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm8]]
+ ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY9]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]]
+ ; CHECK-NEXT: PTILESTOREDV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]]
; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm4]], [[MOVSX64rr32_3]], implicit-def dead $eflags
- ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm8]], implicit-def dead $eflags
- ; CHECK-NEXT: [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY9]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags
- ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOV64rm1]], implicit-def $eflags
+ ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm1]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY10]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags
+ ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOVSX64rm32_1]], implicit-def $eflags
; CHECK-NEXT: JCC_1 %bb.6, 12, implicit $eflags
; CHECK-NEXT: JMP_1 %bb.5
entry:
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index bf6b096..b428ce4 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -136,8 +136,6 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: movl (%r8), %edx
; CHECK-NEXT: leal 8(,%rbx,8), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: leaq 8(%rsi), %rax
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: leaq 32(%rsi), %r11
; CHECK-NEXT: leaq 8(,%rbx,8), %rbx
; CHECK-NEXT: xorl %r14d, %r14d
@@ -189,7 +187,8 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: jae .LBB1_7
; CHECK-NEXT: # %bb.6: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; CHECK-NEXT: leaq 8(%rsi), %r9
+; CHECK-NEXT: addq %r9, %rax
; CHECK-NEXT: leaq (%rax,%r10,8), %rax
; CHECK-NEXT: cmpq %r15, %rax
; CHECK-NEXT: ja .LBB1_14
diff --git a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
index 10ee445..d355374 100644
--- a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
+++ b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
@@ -7,8 +7,8 @@
# CHECK: jne
# CHECK: andl $-16, %edx
# CHECK: xorl %ebx, %ebx
-# CHECK: movl -16(%ebp), %esi
-# CHECK: xorl %eax, %eax
+# CHECK: xorl %esi, %esi
+# CHECK: movl %eax, %ecx
name: test
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/X86/inalloca-invoke.ll b/llvm/test/CodeGen/X86/inalloca-invoke.ll
index c2728f7..68cb24d 100644
--- a/llvm/test/CodeGen/X86/inalloca-invoke.ll
+++ b/llvm/test/CodeGen/X86/inalloca-invoke.ll
@@ -23,7 +23,6 @@ blah:
; CHECK: pushl %eax
; CHECK: subl $20, %esp
; CHECK: movl %esp, %[[beg:[^ ]*]]
-; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
call void @begin(ptr sret(%Iter) %temp.lvalue)
; CHECK: calll _begin
@@ -32,6 +31,7 @@ blah:
to label %invoke.cont unwind label %lpad
; Uses end as sret param.
+; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
; CHECK: pushl %[[end]]
; CHECK: calll _plus
diff --git a/llvm/test/CodeGen/X86/licm-regpressure.ll b/llvm/test/CodeGen/X86/licm-regpressure.ll
index 72a4832..26ed2a3 100644
--- a/llvm/test/CodeGen/X86/licm-regpressure.ll
+++ b/llvm/test/CodeGen/X86/licm-regpressure.ll
@@ -1,14 +1,64 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; This tests currently fails as MachineLICM does not compute register pressure
-; correctly. More details: llvm.org/PR23143
-; XFAIL: *
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-linux -stop-after=early-machinelicm -o - | FileCheck %s
-; MachineLICM should take register pressure into account.
-; CHECK-NOT: Spill
+; FIXME: MachineLICM does not compute register pressure correctly and we end up
+; emitting too many ADD64ri32s. More details: llvm.org/PR23143
%struct.A = type { i32, i32, i32, i32, i32, i32, i32 }
define void @test(i1 %b, ptr %a) nounwind {
+ ; CHECK-LABEL: name: test
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $edi, $rsi
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $edi
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr8 = COPY [[COPY1]].sub_8bit
+ ; CHECK-NEXT: [[ADD64ri32_:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 4, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64ri32_1:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 8, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64ri32_2:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 12, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64ri32_3:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 16, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64ri32_4:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 20, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64ri32_5:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 24, implicit-def dead $eflags
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.loop-body:
+ ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[COPY]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_1]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_2]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_3]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_4]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_5]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: TEST8ri [[COPY2]], 1, implicit-def $eflags
+ ; CHECK-NEXT: JCC_1 %bb.1, 5, implicit $eflags
+ ; CHECK-NEXT: JMP_1 %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.loop-exit:
+ ; CHECK-NEXT: RET 0
entry:
br label %loop-header
diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
index 7f32766..0ccaa9c 100644
--- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
+++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
@@ -172,10 +172,8 @@ define float @clamp_negative_wrong_const(float %x) {
; Like @clamp_test_1 but both are min
define float @clamp_negative_same_op(float %x) {
; CHECK-LABEL: @clamp_negative_same_op(
-; CHECK-NEXT: [[INNER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02
-; CHECK-NEXT: [[INNER_SEL:%.*]] = select nnan ninf i1 [[INNER_CMP_INV]], float 2.550000e+02, float [[X]]
-; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ult float [[X]], 1.000000e+00
-; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 1.000000e+00
+; CHECK-NEXT: [[OUTER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 1.000000e+00
+; CHECK-NEXT: [[R:%.*]] = select nnan ninf i1 [[OUTER_CMP_INV]], float 1.000000e+00, float [[X]]
; CHECK-NEXT: ret float [[R]]
;
%inner_cmp = fcmp fast ult float %x, 255.0
diff --git a/llvm/test/Transforms/InstCombine/select-gep.ll b/llvm/test/Transforms/InstCombine/select-gep.ll
index dd8dffb..7181336 100644
--- a/llvm/test/Transforms/InstCombine/select-gep.ll
+++ b/llvm/test/Transforms/InstCombine/select-gep.ll
@@ -286,3 +286,35 @@ define <2 x ptr> @test7(<2 x ptr> %p1, i64 %idx, <2 x i1> %cc) {
%select = select <2 x i1> %cc, <2 x ptr> %p1, <2 x ptr> %gep
ret <2 x ptr> %select
}
+
+define ptr @ptr_eq_replace_freeze1(ptr %p, ptr %q) {
+; CHECK-LABEL: @ptr_eq_replace_freeze1(
+; CHECK-NEXT: [[Q_FR:%.*]] = freeze ptr [[Q:%.*]]
+; CHECK-NEXT: [[Q_FR1:%.*]] = freeze ptr [[Q1:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[Q_FR]], [[Q_FR1]]
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], ptr [[Q_FR]], ptr [[Q_FR1]]
+; CHECK-NEXT: ret ptr [[SELECT]]
+;
+ %p.fr = freeze ptr %p
+ %q.fr = freeze ptr %q
+ %cmp = icmp eq ptr %p.fr, %q.fr
+ %select = select i1 %cmp, ptr %p.fr, ptr %q.fr
+ ret ptr %select
+}
+
+define ptr @ptr_eq_replace_freeze2(ptr %p, ptr %q) {
+; CHECK-LABEL: @ptr_eq_replace_freeze2(
+; CHECK-NEXT: [[P_FR:%.*]] = freeze ptr [[P:%.*]]
+; CHECK-NEXT: [[P_FR1:%.*]] = freeze ptr [[P1:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P_FR1]], [[P_FR]]
+; CHECK-NEXT: [[SELECT_V:%.*]] = select i1 [[CMP]], ptr [[P_FR1]], ptr [[P_FR]]
+; CHECK-NEXT: [[SELECT:%.*]] = getelementptr i8, ptr [[SELECT_V]], i64 16
+; CHECK-NEXT: ret ptr [[SELECT]]
+;
+ %gep1 = getelementptr i32, ptr %p, i64 4
+ %gep2 = getelementptr i32, ptr %q, i64 4
+ %cmp = icmp eq ptr %p, %q
+ %cmp.fr = freeze i1 %cmp
+ %select = select i1 %cmp.fr, ptr %gep1, ptr %gep2
+ ret ptr %select
+}
diff --git a/llvm/test/Transforms/InstSimplify/domcondition.ll b/llvm/test/Transforms/InstSimplify/domcondition.ll
index 43be5de..2893bb1 100644
--- a/llvm/test/Transforms/InstSimplify/domcondition.ll
+++ b/llvm/test/Transforms/InstSimplify/domcondition.ll
@@ -278,3 +278,210 @@ end:
}
declare void @foo(i32)
+
+
+define i1 @simplify_fcmp_implied_by_dom_cond_range_true(float %x) {
+; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_range_true(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 true
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = fcmp olt float %x, 0.0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = fcmp olt float %x, 1.0
+ ret i1 %cmp2
+
+if.else:
+ ret i1 false
+}
+
+define i1 @simplify_fcmp_in_else_implied_by_dom_cond_range_true(float %x) {
+; CHECK-LABEL: @simplify_fcmp_in_else_implied_by_dom_cond_range_true(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 1.000000e+00
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 true
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 true
+;
+ %cmp = fcmp olt float %x, 1.0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ ret i1 true
+
+if.else:
+ %cmp2 = fcmp uge float %x, 0.5
+ ret i1 %cmp2
+}
+
+define i1 @simplify_fcmp_implied_by_dom_cond_range_false(float %x) {
+; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_range_false(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 false
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = fcmp olt float %x, 0.0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = fcmp ogt float %x, 1.0
+ ret i1 %cmp2
+
+if.else:
+ ret i1 false
+}
+
+define i1 @simplify_fcmp_implied_by_dom_cond_pred_true(float %x, float %y) {
+; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_true(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 true
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = fcmp olt float %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = fcmp ole float %x, %y
+ ret i1 %cmp2
+
+if.else:
+ ret i1 false
+}
+
+define i1 @simplify_fcmp_implied_by_dom_cond_pred_false(float %x, float %y) {
+; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_false(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 false
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = fcmp olt float %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = fcmp ogt float %x, %y
+ ret i1 %cmp2
+
+if.else:
+ ret i1 false
+}
+
+define i1 @simplify_fcmp_implied_by_dom_cond_pred_commuted(float %x, float %y) {
+; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_commuted(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i1 true
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = fcmp olt float %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = fcmp oge float %y, %x
+ ret i1 %cmp2
+
+if.else:
+ ret i1 false
+}
+
+; Negative tests
+
+define i1 @simplify_fcmp_implied_by_dom_cond_wrong_range(float %x) {
+; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_wrong_range(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[X]], -1.000000e+00
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = fcmp olt float %x, 0.0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = fcmp olt float %x, -1.0
+ ret i1 %cmp2
+
+if.else:
+ ret i1 false
+}
+
+define i1 @simplify_fcmp_implied_by_dom_cond_range_mismatched_operand(float %x, float %y) {
+; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_range_mismatched_operand(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[Y:%.*]], 1.000000e+00
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = fcmp olt float %x, 0.0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = fcmp olt float %y, 1.0
+ ret i1 %cmp2
+
+if.else:
+ ret i1 false
+}
+
+define i1 @simplify_fcmp_implied_by_dom_cond_wrong_pred(float %x, float %y) {
+; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_wrong_pred(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp ole float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[X]], [[Y]]
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = fcmp ole float %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = fcmp olt float %x, %y
+ ret i1 %cmp2
+
+if.else:
+ ret i1 false
+}
+
+define i1 @simplify_fcmp_implied_by_dom_cond_pred_mismatched_operand(float %x, float %y, float %z) {
+; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_mismatched_operand(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp ole float [[X]], [[Z:%.*]]
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = fcmp olt float %x, %y
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = fcmp ole float %x, %z
+ ret i1 %cmp2
+
+if.else:
+ ret i1 false
+}
diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c
index 0f09c74..e376d82 100644
--- a/llvm/tools/llvm-c-test/debuginfo.c
+++ b/llvm/tools/llvm-c-test/debuginfo.c
@@ -325,6 +325,13 @@ int llvm_test_dibuilder(void) {
LLVMValueRef Phi2 = LLVMBuildPhi(Builder, I64, "p2");
LLVMAddIncoming(Phi2, &Zero, &FooEntryBlock, 1);
+ // Test that LLVMGetFirstDbgRecord and LLVMGetLastDbgRecord return NULL for
+ // instructions without debug info.
+ LLVMDbgRecordRef Phi1FirstDbgRecord = LLVMGetFirstDbgRecord(Phi1);
+ assert(Phi1FirstDbgRecord == NULL);
+ LLVMDbgRecordRef Phi1LastDbgRecord = LLVMGetLastDbgRecord(Phi1);
+ assert(Phi1LastDbgRecord == NULL);
+
// Insert a non-phi before the `ret` but not before the debug records to
// test that works as expected.
LLVMPositionBuilder(Builder, FooVarBlock, Ret);
diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp
index ba55079..2a89961 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -642,7 +642,8 @@ int main(int argc, char **argv) {
: MAB->createObjectWriter(*OS),
std::unique_ptr<MCCodeEmitter>(CE), *STI));
if (NoExecStack)
- Str->switchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx));
+ Str->switchSection(
+ Ctx.getAsmInfo()->getStackSection(Ctx, /*Exec=*/false));
Str->emitVersionForTarget(TheTriple, VersionTuple(), nullptr,
VersionTuple());
}
diff --git a/llvm/unittests/ADT/TypeSwitchTest.cpp b/llvm/unittests/ADT/TypeSwitchTest.cpp
index c54b798..a7d9342 100644
--- a/llvm/unittests/ADT/TypeSwitchTest.cpp
+++ b/llvm/unittests/ADT/TypeSwitchTest.cpp
@@ -114,3 +114,31 @@ TEST(TypeSwitchTest, CasesOptional) {
EXPECT_EQ(std::nullopt, translate(DerivedC()));
EXPECT_EQ(-1, translate(DerivedD()));
}
+
+TEST(TypeSwitchTest, DefaultUnreachableWithValue) {
+ auto translate = [](auto value) {
+ return TypeSwitch<Base *, int>(&value)
+ .Case([](DerivedA *) { return 0; })
+ .DefaultUnreachable("Unhandled type");
+ };
+ EXPECT_EQ(0, translate(DerivedA()));
+
+#if defined(GTEST_HAS_DEATH_TEST) && !defined(NDEBUG)
+ EXPECT_DEATH((void)translate(DerivedD()), "Unhandled type");
+#endif
+}
+
+TEST(TypeSwitchTest, DefaultUnreachableWithVoid) {
+ auto translate = [](auto value) {
+ int result = -1;
+ TypeSwitch<Base *>(&value)
+ .Case([&result](DerivedA *) { result = 0; })
+ .DefaultUnreachable("Unhandled type");
+ return result;
+ };
+ EXPECT_EQ(0, translate(DerivedA()));
+
+#if defined(GTEST_HAS_DEATH_TEST) && !defined(NDEBUG)
+ EXPECT_DEATH((void)translate(DerivedD()), "Unhandled type");
+#endif
+}
diff --git a/mlir/docs/Tutorials/CreatingADialect.md b/mlir/docs/Tutorials/CreatingADialect.md
index af709fc..e30b80bc 100644
--- a/mlir/docs/Tutorials/CreatingADialect.md
+++ b/mlir/docs/Tutorials/CreatingADialect.md
@@ -22,7 +22,7 @@ typically defined in FooDialect.cpp, which includes FooOps.cpp.inc and
FooOpsInterfaces.h.inc.
The 'Transforms' directory contains rewrite rules for the dialect,
-typically described in TableGen file using the [DDR
+typically described in a TableGen file using the [DRR
format](../DeclarativeRewrites.md).
Note that dialect names should not generally be suffixed with “Ops”,
diff --git a/mlir/include/mlir-c/Rewrite.h b/mlir/include/mlir-c/Rewrite.h
index 77be1f4..5dd285e 100644
--- a/mlir/include/mlir-c/Rewrite.h
+++ b/mlir/include/mlir-c/Rewrite.h
@@ -101,6 +101,12 @@ mlirRewriterBaseGetInsertionBlock(MlirRewriterBase rewriter);
MLIR_CAPI_EXPORTED MlirBlock
mlirRewriterBaseGetBlock(MlirRewriterBase rewriter);
+/// Returns the operation right after the current insertion point
+/// of the rewriter. A null MlirOperation will be returned
+/// if the current insertion point is at the end of the block.
+MLIR_CAPI_EXPORTED MlirOperation
+mlirRewriterBaseGetOperationAfterInsertion(MlirRewriterBase rewriter);
+
//===----------------------------------------------------------------------===//
/// Block and operation creation/insertion/cloning
//===----------------------------------------------------------------------===//
@@ -311,6 +317,14 @@ MLIR_CAPI_EXPORTED MlirLogicalResult mlirApplyPatternsAndFoldGreedily(
MlirGreedyRewriteDriverConfig);
//===----------------------------------------------------------------------===//
+/// PatternRewriter API
+//===----------------------------------------------------------------------===//
+
+/// Cast the PatternRewriter to a RewriterBase
+MLIR_CAPI_EXPORTED MlirRewriterBase
+mlirPatternRewriterAsBase(MlirPatternRewriter rewriter);
+
+//===----------------------------------------------------------------------===//
/// PDLPatternModule API
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 32b2b0c..7b17106 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -2046,6 +2046,9 @@ PyInsertionPoint::PyInsertionPoint(PyOperationBase &beforeOperationBase)
: refOperation(beforeOperationBase.getOperation().getRef()),
block((*refOperation)->getBlock()) {}
+PyInsertionPoint::PyInsertionPoint(PyOperationRef beforeOperationRef)
+ : refOperation(beforeOperationRef), block((*refOperation)->getBlock()) {}
+
void PyInsertionPoint::insert(PyOperationBase &operationBase) {
PyOperation &operation = operationBase.getOperation();
if (operation.isAttached())
diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h
index edbd73e..e706be3b 100644
--- a/mlir/lib/Bindings/Python/IRModule.h
+++ b/mlir/lib/Bindings/Python/IRModule.h
@@ -841,6 +841,8 @@ public:
PyInsertionPoint(const PyBlock &block);
/// Creates an insertion point positioned before a reference operation.
PyInsertionPoint(PyOperationBase &beforeOperationBase);
+ /// Creates an insertion point positioned before a reference operation.
+ PyInsertionPoint(PyOperationRef beforeOperationRef);
/// Shortcut to create an insertion point at the beginning of the block.
static PyInsertionPoint atBlockBegin(PyBlock &block);
diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp
index 836f44fd..9e3d970 100644
--- a/mlir/lib/Bindings/Python/Rewrite.cpp
+++ b/mlir/lib/Bindings/Python/Rewrite.cpp
@@ -26,6 +26,30 @@ using namespace mlir::python;
namespace {
+class PyPatternRewriter {
+public:
+ PyPatternRewriter(MlirPatternRewriter rewriter)
+ : base(mlirPatternRewriterAsBase(rewriter)),
+ ctx(PyMlirContext::forContext(mlirRewriterBaseGetContext(base))) {}
+
+ PyInsertionPoint getInsertionPoint() const {
+ MlirBlock block = mlirRewriterBaseGetInsertionBlock(base);
+ MlirOperation op = mlirRewriterBaseGetOperationAfterInsertion(base);
+
+ if (mlirOperationIsNull(op)) {
+ MlirOperation owner = mlirBlockGetParentOperation(block);
+ auto parent = PyOperation::forOperation(ctx, owner);
+ return PyInsertionPoint(PyBlock(parent, block));
+ }
+
+ return PyInsertionPoint(PyOperation::forOperation(ctx, op));
+ }
+
+private:
+ MlirRewriterBase base;
+ PyMlirContextRef ctx;
+};
+
#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
static nb::object objectFromPDLValue(MlirPDLValue value) {
if (MlirValue v = mlirPDLValueAsValue(value); !mlirValueIsNull(v))
@@ -84,7 +108,8 @@ public:
void *userData) -> MlirLogicalResult {
nb::handle f = nb::handle(static_cast<PyObject *>(userData));
return logicalResultFromObject(
- f(rewriter, results, objectsFromPDLValues(nValues, values)));
+ f(PyPatternRewriter(rewriter), results,
+ objectsFromPDLValues(nValues, values)));
},
fn.ptr());
}
@@ -98,7 +123,8 @@ public:
void *userData) -> MlirLogicalResult {
nb::handle f = nb::handle(static_cast<PyObject *>(userData));
return logicalResultFromObject(
- f(rewriter, results, objectsFromPDLValues(nValues, values)));
+ f(PyPatternRewriter(rewriter), results,
+ objectsFromPDLValues(nValues, values)));
},
fn.ptr());
}
@@ -143,7 +169,9 @@ private:
/// Create the `mlir.rewrite` here.
void mlir::python::populateRewriteSubmodule(nb::module_ &m) {
- nb::class_<MlirPatternRewriter>(m, "PatternRewriter");
+ nb::class_<PyPatternRewriter>(m, "PatternRewriter")
+ .def_prop_ro("ip", &PyPatternRewriter::getInsertionPoint,
+ "The current insertion point of the PatternRewriter.");
//----------------------------------------------------------------------------
// Mapping of the PDLResultList and PDLModule
//----------------------------------------------------------------------------
diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp
index 0d56259..c15a73b 100644
--- a/mlir/lib/CAPI/Transforms/Rewrite.cpp
+++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp
@@ -70,6 +70,17 @@ MlirBlock mlirRewriterBaseGetBlock(MlirRewriterBase rewriter) {
return wrap(unwrap(rewriter)->getBlock());
}
+MlirOperation
+mlirRewriterBaseGetOperationAfterInsertion(MlirRewriterBase rewriter) {
+ mlir::RewriterBase *base = unwrap(rewriter);
+ mlir::Block *block = base->getInsertionBlock();
+ mlir::Block::iterator it = base->getInsertionPoint();
+ if (it == block->end())
+ return {nullptr};
+
+ return wrap(std::addressof(*it));
+}
+
//===----------------------------------------------------------------------===//
/// Block and operation creation/insertion/cloning
//===----------------------------------------------------------------------===//
@@ -317,6 +328,10 @@ inline MlirPatternRewriter wrap(mlir::PatternRewriter *rewriter) {
return {rewriter};
}
+MlirRewriterBase mlirPatternRewriterAsBase(MlirPatternRewriter rewriter) {
+ return wrap(static_cast<mlir::RewriterBase *>(unwrap(rewriter)));
+}
+
//===----------------------------------------------------------------------===//
/// PDLPatternModule API
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp b/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp
index 7b9ed1d..79c2f23 100644
--- a/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp
+++ b/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp
@@ -363,7 +363,7 @@ static TypedValue<VectorType> storeTile(PatternRewriter &rewriter,
}
struct ContractionToAMX : public OpRewritePattern<vector::ContractionOp> {
- using OpRewritePattern::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
PatternRewriter &rewriter) const override {
diff --git a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp
index 4e1da39..363685a 100644
--- a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp
+++ b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp
@@ -44,7 +44,7 @@ namespace {
/// arm_sme.tile_load ... layout<vertical>
struct TransferReadToArmSMELowering
: public OpRewritePattern<vector::TransferReadOp> {
- using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::TransferReadOp transferReadOp,
PatternRewriter &rewriter) const final {
@@ -120,7 +120,7 @@ struct TransferReadToArmSMELowering
/// : memref<?x?xi8>, vector<[16]x[16]xi8>
struct TransferWriteToArmSMELowering
: public OpRewritePattern<vector::TransferWriteOp> {
- using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
PatternRewriter &rewriter) const final {
@@ -157,7 +157,7 @@ struct TransferWriteToArmSMELowering
/// Conversion pattern for vector.load.
struct VectorLoadToArmSMELowering : public OpRewritePattern<vector::LoadOp> {
- using OpRewritePattern<vector::LoadOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::LoadOp load,
PatternRewriter &rewriter) const override {
@@ -173,7 +173,7 @@ struct VectorLoadToArmSMELowering : public OpRewritePattern<vector::LoadOp> {
/// Conversion pattern for vector.store.
struct VectorStoreToArmSMELowering : public OpRewritePattern<vector::StoreOp> {
- using OpRewritePattern<vector::StoreOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::StoreOp store,
PatternRewriter &rewriter) const override {
@@ -208,7 +208,7 @@ struct VectorStoreToArmSMELowering : public OpRewritePattern<vector::StoreOp> {
/// Supports scalar, 0-d vector, and 1-d vector broadcasts.
struct BroadcastOpToArmSMELowering
: public OpRewritePattern<vector::BroadcastOp> {
- using OpRewritePattern<vector::BroadcastOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::BroadcastOp broadcastOp,
PatternRewriter &rewriter) const final {
@@ -279,7 +279,7 @@ struct BroadcastOpToArmSMELowering
/// implementation, perhaps with tile <-> vector (MOVA) ops.
struct TransposeOpToArmSMELowering
: public OpRewritePattern<vector::TransposeOp> {
- using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::TransposeOp transposeOp,
PatternRewriter &rewriter) const final {
@@ -372,7 +372,7 @@ struct TransposeOpToArmSMELowering
struct VectorOuterProductToArmSMELowering
: public OpRewritePattern<vector::OuterProductOp> {
- using OpRewritePattern<vector::OuterProductOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::OuterProductOp outerProductOp,
PatternRewriter &rewriter) const override {
@@ -451,7 +451,7 @@ struct VectorOuterProductToArmSMELowering
/// ```
struct VectorExtractToArmSMELowering
: public OpRewritePattern<vector::ExtractOp> {
- using OpRewritePattern<vector::ExtractOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::ExtractOp extractOp,
PatternRewriter &rewriter) const override {
@@ -507,7 +507,7 @@ struct VectorExtractToArmSMELowering
/// ```
struct VectorInsertToArmSMELowering
: public OpRewritePattern<vector::InsertOp> {
- using OpRewritePattern<vector::InsertOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::InsertOp insertOp,
PatternRewriter &rewriter) const override {
@@ -568,7 +568,7 @@ struct VectorInsertToArmSMELowering
/// }
/// ```
struct VectorPrintToArmSMELowering : public OpRewritePattern<vector::PrintOp> {
- using OpRewritePattern<vector::PrintOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::PrintOp printOp,
PatternRewriter &rewriter) const override {
@@ -623,7 +623,7 @@ struct VectorPrintToArmSMELowering : public OpRewritePattern<vector::PrintOp> {
/// ```
struct FoldTransferWriteOfExtractTileSlice
: public OpRewritePattern<vector::TransferWriteOp> {
- using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
PatternRewriter &rewriter) const final {
@@ -679,7 +679,7 @@ struct FoldTransferWriteOfExtractTileSlice
/// ```
struct ExtractFromCreateMaskToPselLowering
: public OpRewritePattern<vector::ExtractOp> {
- using OpRewritePattern<vector::ExtractOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::ExtractOp extractOp,
PatternRewriter &rewriter) const override {
@@ -734,7 +734,7 @@ struct ExtractFromCreateMaskToPselLowering
// Convert all `vector.splat` to `vector.broadcast`. There is a path from
// `vector.broadcast` to ArmSME via another pattern.
struct ConvertSplatToBroadcast : public OpRewritePattern<vector::SplatOp> {
- using OpRewritePattern<vector::SplatOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::SplatOp splatOp,
PatternRewriter &rewriter) const final {
diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index d6a2622..9843435 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -386,7 +386,7 @@ namespace {
// to MMA matmul.
struct PrepareContractToGPUMMA
: public OpRewritePattern<vector::ContractionOp> {
- using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::ContractionOp op,
PatternRewriter &rewriter) const override {
@@ -450,7 +450,7 @@ struct PrepareContractToGPUMMA
// Shared Memory to registers.
struct CombineTransferReadOpTranspose final
: public OpRewritePattern<vector::TransposeOp> {
- using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::TransposeOp op,
PatternRewriter &rewriter) const override {
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index e0b1a88..5461646 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1342,7 +1342,7 @@ struct VectorScalableExtractOpLowering
/// ```
class VectorFMAOpNDRewritePattern : public OpRewritePattern<FMAOp> {
public:
- using OpRewritePattern<FMAOp>::OpRewritePattern;
+ using Base::Base;
void initialize() {
// This pattern recursively unpacks one dimension at a time. The recursion
@@ -2127,7 +2127,7 @@ FailureOr<Value> ContractionOpToMatmulOpLowering::matchAndRewriteMaskableOp(
class TransposeOpToMatrixTransposeOpLowering
: public OpRewritePattern<vector::TransposeOp> {
public:
- using OpRewritePattern<TransposeOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::TransposeOp op,
PatternRewriter &rewriter) const override {
diff --git a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
index 5061a44..311ff6f 100644
--- a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
+++ b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
@@ -838,7 +838,7 @@ struct VectorStoreOpConverter final
struct VectorReductionToIntDotProd final
: OpRewritePattern<vector::ReductionOp> {
- using OpRewritePattern::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::ReductionOp op,
PatternRewriter &rewriter) const override {
diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
index 9f5585a..e2c7d80 100644
--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
+++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -475,7 +475,7 @@ static LogicalResult lowerToScatteredStoreOp(vector::TransferWriteOp writeOp,
}
struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
- using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
PatternRewriter &rewriter) const override {
@@ -546,7 +546,7 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
struct TransferWriteLowering
: public OpRewritePattern<vector::TransferWriteOp> {
- using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
PatternRewriter &rewriter) const override {
@@ -597,7 +597,7 @@ struct TransferWriteLowering
};
struct GatherLowering : public OpRewritePattern<vector::GatherOp> {
- using OpRewritePattern<vector::GatherOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::GatherOp gatherOp,
PatternRewriter &rewriter) const override {
@@ -632,7 +632,7 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> {
};
struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> {
- using OpRewritePattern<vector::ScatterOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::ScatterOp scatterOp,
PatternRewriter &rewriter) const override {
@@ -662,7 +662,7 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> {
};
struct LoadLowering : public OpRewritePattern<vector::LoadOp> {
- using OpRewritePattern<vector::LoadOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::LoadOp loadOp,
PatternRewriter &rewriter) const override {
@@ -694,7 +694,7 @@ struct LoadLowering : public OpRewritePattern<vector::LoadOp> {
};
struct StoreLowering : public OpRewritePattern<vector::StoreOp> {
- using OpRewritePattern<vector::StoreOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::StoreOp storeOp,
PatternRewriter &rewriter) const override {
@@ -727,7 +727,7 @@ struct StoreLowering : public OpRewritePattern<vector::StoreOp> {
};
struct ContractionLowering : public OpRewritePattern<vector::ContractionOp> {
- using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
PatternRewriter &rewriter) const override {
diff --git a/mlir/test/python/integration/dialects/pdl.py b/mlir/test/python/integration/dialects/pdl.py
index c8e6197..fe27dd4 100644
--- a/mlir/test/python/integration/dialects/pdl.py
+++ b/mlir/test/python/integration/dialects/pdl.py
@@ -16,6 +16,7 @@ def construct_and_print_in_module(f):
print(module)
return f
+
def get_pdl_patterns():
# Create a rewrite from add to mul. This will match
# - operation name is arith.addi
@@ -121,8 +122,10 @@ def load_myint_dialect():
# This PDL pattern is to fold constant additions,
-# i.e. add(constant0, constant1) -> constant2
-# where constant2 = constant0 + constant1.
+# including two patterns:
+# 1. add(constant0, constant1) -> constant2
+# where constant2 = constant0 + constant1;
+# 2. add(x, 0) or add(0, x) -> x.
def get_pdl_pattern_fold():
m = Module.create()
i32 = IntegerType.get_signless(32)
@@ -237,3 +240,87 @@ def test_pdl_register_function_constraint(module_):
apply_patterns_and_fold_greedily(module_, frozen)
return module_
+
+
+# This pattern expands a constant into a sum of additions
+# unless the constant is at most 1,
+# e.g. 3 -> 1 + 2 -> 1 + (1 + 1).
+def get_pdl_pattern_expand():
+ m = Module.create()
+ i32 = IntegerType.get_signless(32)
+ with InsertionPoint(m.body):
+
+ @pdl.pattern(benefit=1, sym_name="myint_constant_expand")
+ def pat():
+ t = pdl.TypeOp(i32)
+ cst = pdl.AttributeOp()
+ pdl.apply_native_constraint([], "is_one", [cst])
+ op0 = pdl.OperationOp(
+ name="myint.constant", attributes={"value": cst}, types=[t]
+ )
+
+ @pdl.rewrite()
+ def rew():
+ expanded = pdl.apply_native_rewrite(
+ [pdl.OperationType.get()], "expand", [cst]
+ )
+ pdl.ReplaceOp(op0, with_op=expanded)
+
+ def is_one(rewriter, results, values):
+ cst = values[0].value
+ return cst <= 1
+
+ def expand(rewriter, results, values):
+ cst = values[0].value
+ c1 = cst // 2
+ c2 = cst - c1
+ with rewriter.ip:
+ op1 = Operation.create(
+ "myint.constant",
+ results=[i32],
+ attributes={"value": IntegerAttr.get(i32, c1)},
+ )
+ op2 = Operation.create(
+ "myint.constant",
+ results=[i32],
+ attributes={"value": IntegerAttr.get(i32, c2)},
+ )
+ res = Operation.create(
+ "myint.add", results=[i32], operands=[op1.result, op2.result]
+ )
+ results.append(res)
+
+ pdl_module = PDLModule(m)
+ pdl_module.register_constraint_function("is_one", is_one)
+ pdl_module.register_rewrite_function("expand", expand)
+ return pdl_module.freeze()
+
+
+# CHECK-LABEL: TEST: test_pdl_register_function_expand
+# CHECK: %0 = "myint.constant"() {value = 1 : i32} : () -> i32
+# CHECK: %1 = "myint.constant"() {value = 1 : i32} : () -> i32
+# CHECK: %2 = "myint.add"(%0, %1) : (i32, i32) -> i32
+# CHECK: %3 = "myint.constant"() {value = 1 : i32} : () -> i32
+# CHECK: %4 = "myint.constant"() {value = 1 : i32} : () -> i32
+# CHECK: %5 = "myint.constant"() {value = 1 : i32} : () -> i32
+# CHECK: %6 = "myint.add"(%4, %5) : (i32, i32) -> i32
+# CHECK: %7 = "myint.add"(%3, %6) : (i32, i32) -> i32
+# CHECK: %8 = "myint.add"(%2, %7) : (i32, i32) -> i32
+# CHECK: return %8 : i32
+@construct_and_print_in_module
+def test_pdl_register_function_expand(module_):
+ load_myint_dialect()
+
+ module_ = Module.parse(
+ """
+ func.func @f() -> i32 {
+ %0 = "myint.constant"() { value = 5 }: () -> (i32)
+ return %0 : i32
+ }
+ """
+ )
+
+ frozen = get_pdl_pattern_expand()
+ apply_patterns_and_fold_greedily(module_, frozen)
+
+ return module_
diff --git a/orc-rt/include/orc-rt/SPSWrapperFunction.h b/orc-rt/include/orc-rt/SPSWrapperFunction.h
index 14a3d8e..3ed3295 100644
--- a/orc-rt/include/orc-rt/SPSWrapperFunction.h
+++ b/orc-rt/include/orc-rt/SPSWrapperFunction.h
@@ -57,8 +57,8 @@ private:
template <typename... Ts>
using DeserializableTuple_t = typename DeserializableTuple<Ts...>::type;
- template <typename T> static T fromSerializable(T &&Arg) noexcept {
- return Arg;
+ template <typename T> static T &&fromSerializable(T &&Arg) noexcept {
+ return std::forward<T>(Arg);
}
static Error fromSerializable(SPSSerializableError Err) noexcept {
@@ -86,7 +86,10 @@ public:
decltype(Args)>::deserialize(IB, Args))
return std::nullopt;
return std::apply(
- [](auto &&...A) { return ArgTuple(fromSerializable(A)...); },
+ [](auto &&...A) {
+ return std::optional<ArgTuple>(std::in_place,
+ std::move(fromSerializable(A))...);
+ },
std::move(Args));
}
};
diff --git a/orc-rt/include/orc-rt/WrapperFunction.h b/orc-rt/include/orc-rt/WrapperFunction.h
index ca165db..47e770f 100644
--- a/orc-rt/include/orc-rt/WrapperFunction.h
+++ b/orc-rt/include/orc-rt/WrapperFunction.h
@@ -111,7 +111,23 @@ struct WFHandlerTraitsImpl {
static_assert(std::is_void_v<RetT>,
"Async wrapper function handler must return void");
typedef ReturnT YieldType;
- typedef std::tuple<ArgTs...> ArgTupleType;
+ typedef std::tuple<std::decay_t<ArgTs>...> ArgTupleType;
+
+ // Forwards arguments based on the parameter types of the handler.
+ template <typename FnT> class ForwardArgsAsRequested {
+ public:
+ ForwardArgsAsRequested(FnT &&Fn) : Fn(std::move(Fn)) {}
+ void operator()(ArgTs &...Args) { Fn(std::forward<ArgTs>(Args)...); }
+
+ private:
+ FnT Fn;
+ };
+
+ template <typename FnT>
+ static ForwardArgsAsRequested<std::decay_t<FnT>>
+ forwardArgsAsRequested(FnT &&Fn) {
+ return ForwardArgsAsRequested<std::decay_t<FnT>>(std::forward<FnT>(Fn));
+ }
};
template <typename C>
@@ -244,10 +260,11 @@ struct WrapperFunction {
if (auto Args =
S.arguments().template deserialize<ArgTuple>(std::move(ArgBytes)))
- std::apply(bind_front(std::forward<Handler>(H),
- detail::StructuredYield<RetTupleType, Serializer>(
- Session, CallCtx, Return, std::move(S))),
- std::move(*Args));
+ std::apply(HandlerTraits::forwardArgsAsRequested(bind_front(
+ std::forward<Handler>(H),
+ detail::StructuredYield<RetTupleType, Serializer>(
+ Session, CallCtx, Return, std::move(S)))),
+ *Args);
else
Return(Session, CallCtx,
WrapperFunctionBuffer::createOutOfBandError(
diff --git a/orc-rt/unittests/CMakeLists.txt b/orc-rt/unittests/CMakeLists.txt
index 54c453d..4d3da68 100644
--- a/orc-rt/unittests/CMakeLists.txt
+++ b/orc-rt/unittests/CMakeLists.txt
@@ -15,7 +15,6 @@ add_orc_rt_unittest(CoreTests
AllocActionTest.cpp
BitmaskEnumTest.cpp
CallableTraitsHelperTest.cpp
- CommonTestUtils.cpp
ErrorTest.cpp
ExecutorAddressTest.cpp
IntervalMapTest.cpp
diff --git a/orc-rt/unittests/CommonTestUtils.cpp b/orc-rt/unittests/CommonTestUtils.cpp
deleted file mode 100644
index d9f9433..0000000
--- a/orc-rt/unittests/CommonTestUtils.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//===- CommonTestUtils.cpp ------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Common test utilities.
-//
-//===----------------------------------------------------------------------===//
-
-#include "CommonTestUtils.h"
-
-size_t OpCounter::DefaultConstructions = 0;
-size_t OpCounter::CopyConstructions = 0;
-size_t OpCounter::CopyAssignments = 0;
-size_t OpCounter::MoveConstructions = 0;
-size_t OpCounter::MoveAssignments = 0;
-size_t OpCounter::Destructions = 0;
diff --git a/orc-rt/unittests/CommonTestUtils.h b/orc-rt/unittests/CommonTestUtils.h
index 5ff2c8e..1c66bdd 100644
--- a/orc-rt/unittests/CommonTestUtils.h
+++ b/orc-rt/unittests/CommonTestUtils.h
@@ -11,7 +11,7 @@
#include <cstddef>
-class OpCounter {
+template <size_t Idx = 0> class OpCounter {
public:
OpCounter() { ++DefaultConstructions; }
OpCounter(const OpCounter &Other) { ++CopyConstructions; }
@@ -57,4 +57,11 @@ private:
static size_t Destructions;
};
+template <size_t Idx> size_t OpCounter<Idx>::DefaultConstructions = 0;
+template <size_t Idx> size_t OpCounter<Idx>::CopyConstructions = 0;
+template <size_t Idx> size_t OpCounter<Idx>::CopyAssignments = 0;
+template <size_t Idx> size_t OpCounter<Idx>::MoveConstructions = 0;
+template <size_t Idx> size_t OpCounter<Idx>::MoveAssignments = 0;
+template <size_t Idx> size_t OpCounter<Idx>::Destructions = 0;
+
#endif // ORC_RT_UNITTEST_COMMONTESTUTILS_H
diff --git a/orc-rt/unittests/SPSWrapperFunctionTest.cpp b/orc-rt/unittests/SPSWrapperFunctionTest.cpp
index c0c86ff..32aaa61 100644
--- a/orc-rt/unittests/SPSWrapperFunctionTest.cpp
+++ b/orc-rt/unittests/SPSWrapperFunctionTest.cpp
@@ -10,6 +10,8 @@
//
//===----------------------------------------------------------------------===//
+#include "CommonTestUtils.h"
+
#include "orc-rt/SPSWrapperFunction.h"
#include "orc-rt/WrapperFunction.h"
#include "orc-rt/move_only_function.h"
@@ -218,3 +220,80 @@ TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningExpectedFailureCase) {
EXPECT_EQ(ErrMsg, "N is not a multiple of 2");
}
+
+template <size_t N> struct SPSOpCounter {};
+
+namespace orc_rt {
+template <size_t N>
+class SPSSerializationTraits<SPSOpCounter<N>, OpCounter<N>> {
+public:
+ static size_t size(const OpCounter<N> &O) { return 0; }
+ static bool serialize(SPSOutputBuffer &OB, const OpCounter<N> &O) {
+ return true;
+ }
+ static bool deserialize(SPSInputBuffer &OB, OpCounter<N> &O) { return true; }
+};
+} // namespace orc_rt
+
+static void
+handle_with_reference_types_sps_wrapper(orc_rt_SessionRef Session,
+ void *CallCtx,
+ orc_rt_WrapperFunctionReturn Return,
+ orc_rt_WrapperFunctionBuffer ArgBytes) {
+ SPSWrapperFunction<void(
+ SPSOpCounter<0>, SPSOpCounter<1>, SPSOpCounter<2>,
+ SPSOpCounter<3>)>::handle(Session, CallCtx, Return, ArgBytes,
+ [](move_only_function<void()> Return,
+ OpCounter<0>, OpCounter<1> &,
+ const OpCounter<2> &,
+ OpCounter<3> &&) { Return(); });
+}
+
+TEST(SPSWrapperFunctionUtilsTest, TestHandlerWithReferences) {
+ // Test that we can handle by-value, by-ref, by-const-ref, and by-rvalue-ref
+ // arguments, and that we generate the expected number of moves.
+ OpCounter<0>::reset();
+ OpCounter<1>::reset();
+ OpCounter<2>::reset();
+ OpCounter<3>::reset();
+
+ bool DidRun = false;
+ SPSWrapperFunction<void(SPSOpCounter<0>, SPSOpCounter<1>, SPSOpCounter<2>,
+ SPSOpCounter<3>)>::
+ call(
+ DirectCaller(nullptr, handle_with_reference_types_sps_wrapper),
+ [&](Error R) {
+ cantFail(std::move(R));
+ DidRun = true;
+ },
+ OpCounter<0>(), OpCounter<1>(), OpCounter<2>(), OpCounter<3>());
+
+ EXPECT_TRUE(DidRun);
+
+ // We expect two default constructions for each parameter: one for the
+ // argument to call, and one for the object to deserialize into.
+ EXPECT_EQ(OpCounter<0>::defaultConstructions(), 2U);
+ EXPECT_EQ(OpCounter<1>::defaultConstructions(), 2U);
+ EXPECT_EQ(OpCounter<2>::defaultConstructions(), 2U);
+ EXPECT_EQ(OpCounter<3>::defaultConstructions(), 2U);
+
+ // Pass-by-value: we expect two moves (one for SPS transparent conversion,
+ // one to copy the value to the parameter), and no copies.
+ EXPECT_EQ(OpCounter<0>::moves(), 2U);
+ EXPECT_EQ(OpCounter<0>::copies(), 0U);
+
+ // Pass-by-lvalue-reference: we expect one move (for SPS transparent
+ // conversion), no copies.
+ EXPECT_EQ(OpCounter<1>::moves(), 1U);
+ EXPECT_EQ(OpCounter<1>::copies(), 0U);
+
+ // Pass-by-const-lvalue-reference: we expect one move (for SPS transparent
+ // conversion), no copies.
+ EXPECT_EQ(OpCounter<2>::moves(), 1U);
+ EXPECT_EQ(OpCounter<2>::copies(), 0U);
+
+ // Pass-by-rvalue-reference: we expect one move (for SPS transparent
+ // conversion), no copies.
+ EXPECT_EQ(OpCounter<3>::moves(), 1U);
+ EXPECT_EQ(OpCounter<3>::copies(), 0U);
+}
diff --git a/orc-rt/unittests/bind-test.cpp b/orc-rt/unittests/bind-test.cpp
index bfaef4e..93a61e63 100644
--- a/orc-rt/unittests/bind-test.cpp
+++ b/orc-rt/unittests/bind-test.cpp
@@ -47,28 +47,28 @@ TEST(BindTest, LambdaCapture) {
}
TEST(BindTest, MinimalMoves) {
- OpCounter::reset();
+ OpCounter<>::reset();
{
- auto B = bind_front([](OpCounter &O, int) {}, OpCounter());
+ auto B = bind_front([](OpCounter<> &O, int) {}, OpCounter<>());
B(0);
}
- EXPECT_EQ(OpCounter::defaultConstructions(), 1U);
- EXPECT_EQ(OpCounter::copies(), 0U);
- EXPECT_EQ(OpCounter::moves(), 1U);
- EXPECT_EQ(OpCounter::destructions(), 2U);
+ EXPECT_EQ(OpCounter<>::defaultConstructions(), 1U);
+ EXPECT_EQ(OpCounter<>::copies(), 0U);
+ EXPECT_EQ(OpCounter<>::moves(), 1U);
+ EXPECT_EQ(OpCounter<>::destructions(), 2U);
}
TEST(BindTest, MinimalCopies) {
- OpCounter::reset();
+ OpCounter<>::reset();
{
- OpCounter O;
- auto B = bind_front([](OpCounter &O, int) {}, O);
+ OpCounter<> O;
+ auto B = bind_front([](OpCounter<> &O, int) {}, O);
B(0);
}
- EXPECT_EQ(OpCounter::defaultConstructions(), 1U);
- EXPECT_EQ(OpCounter::copies(), 1U);
- EXPECT_EQ(OpCounter::moves(), 0U);
- EXPECT_EQ(OpCounter::destructions(), 2U);
+ EXPECT_EQ(OpCounter<>::defaultConstructions(), 1U);
+ EXPECT_EQ(OpCounter<>::copies(), 1U);
+ EXPECT_EQ(OpCounter<>::moves(), 0U);
+ EXPECT_EQ(OpCounter<>::destructions(), 2U);
}
TEST(BindTest, ForwardUnboundArgs) {
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 026664b..e91e7c8 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2157,24 +2157,13 @@ libc_function(
########################### math support library ###############################
libc_support_library(
- name = "common_constants",
- srcs = ["src/math/generic/common_constants.cpp"],
- hdrs = ["src/math/generic/common_constants.h"],
- deps = [
- ":__support_math_acosh_float_constants",
- ":__support_math_exp_constants",
- ":__support_number_pair",
- ],
-)
-
-libc_support_library(
name = "log_range_reduction",
hdrs = ["src/math/generic/log_range_reduction.h"],
deps = [
":__support_common",
":__support_fputil_dyadic_float",
+ ":__support_math_common_constants",
":__support_uint128",
- ":common_constants",
],
)
@@ -2189,8 +2178,8 @@ libc_support_library(
":__support_fputil_polyeval",
":__support_fputil_rounding_mode",
":__support_macros_optimization",
+ ":__support_math_common_constants",
":__support_math_exp10f_utils",
- ":common_constants",
],
)
@@ -2558,6 +2547,16 @@ libc_support_library(
)
libc_support_library(
+ name = "__support_math_common_constants",
+ hdrs = ["src/__support/math/common_constants.h"],
+ deps = [
+ ":__support_math_acosh_float_constants",
+ ":__support_math_exp_constants",
+ ":__support_number_pair",
+ ],
+)
+
+libc_support_library(
name = "__support_math_cos",
hdrs = ["src/__support/math/cos.h"],
deps = [
@@ -2632,8 +2631,8 @@ libc_support_library(
":__support_fputil_polyeval",
":__support_fputil_rounding_mode",
":__support_macros_optimization",
+ ":__support_math_common_constants",
":__support_sincosf_utils",
- ":common_constants",
],
)
@@ -2879,6 +2878,24 @@ libc_support_library(
)
libc_support_library(
+ name = "__support_math_exp2",
+ hdrs = ["src/__support/math/exp2.h"],
+ deps = [
+ ":__support_fputil_double_double",
+ ":__support_fputil_dyadic_float",
+ ":__support_fputil_multiply_add",
+ ":__support_fputil_nearest_integer",
+ ":__support_fputil_polyeval",
+ ":__support_fputil_rounding_mode",
+ ":__support_fputil_triple_double",
+ ":__support_integer_literals",
+ ":__support_macros_optimization",
+ ":__support_math_common_constants",
+ ":__support_math_exp_utils",
+ ],
+)
+
+libc_support_library(
name = "__support_math_exp10",
hdrs = ["src/__support/math/exp10.h"],
deps = [
@@ -3652,17 +3669,7 @@ libc_math_function(
libc_math_function(
name = "exp2",
additional_deps = [
- ":__support_fputil_double_double",
- ":__support_fputil_dyadic_float",
- ":__support_fputil_multiply_add",
- ":__support_fputil_nearest_integer",
- ":__support_fputil_polyeval",
- ":__support_fputil_rounding_mode",
- ":__support_fputil_triple_double",
- ":__support_integer_literals",
- ":__support_macros_optimization",
- ":__support_math_exp_utils",
- ":common_constants",
+ ":__support_math_exp2",
],
)
@@ -3706,7 +3713,7 @@ libc_math_function(
":__support_fputil_triple_double",
":__support_integer_literals",
":__support_macros_optimization",
- ":common_constants",
+ ":__support_math_common_constants",
],
)
@@ -3720,7 +3727,7 @@ libc_math_function(
":__support_fputil_rounding_mode",
":__support_macros_optimization",
":__support_macros_properties_cpu_features",
- ":common_constants",
+ ":__support_math_common_constants",
],
)
@@ -4233,7 +4240,7 @@ libc_math_function(
":__support_integer_literals",
":__support_macros_optimization",
":__support_macros_properties_cpu_features",
- ":common_constants",
+ ":__support_math_common_constants",
":log_range_reduction",
],
)
@@ -4246,7 +4253,7 @@ libc_math_function(
":__support_fputil_polyeval",
":__support_macros_optimization",
":__support_macros_properties_cpu_features",
- ":common_constants",
+ ":__support_math_common_constants",
],
)
@@ -4268,7 +4275,7 @@ libc_math_function(
":__support_integer_literals",
":__support_macros_optimization",
":__support_macros_properties_cpu_features",
- ":common_constants",
+ ":__support_math_common_constants",
":log_range_reduction",
],
)
@@ -4281,7 +4288,7 @@ libc_math_function(
":__support_fputil_polyeval",
":__support_macros_optimization",
":__support_macros_properties_cpu_features",
- ":common_constants",
+ ":__support_math_common_constants",
],
)
@@ -4303,7 +4310,7 @@ libc_math_function(
":__support_integer_literals",
":__support_macros_optimization",
":__support_macros_properties_cpu_features",
- ":common_constants",
+ ":__support_math_common_constants",
],
)
@@ -4315,7 +4322,7 @@ libc_math_function(
":__support_fputil_polyeval",
":__support_macros_optimization",
":__support_macros_properties_cpu_features",
- ":common_constants",
+ ":__support_math_common_constants",
],
)
@@ -4330,7 +4337,7 @@ libc_math_function(
":__support_integer_literals",
":__support_macros_optimization",
":__support_macros_properties_cpu_features",
- ":common_constants",
+ ":__support_math_common_constants",
":log_range_reduction",
],
)
@@ -4342,7 +4349,7 @@ libc_math_function(
":__support_fputil_multiply_add",
":__support_fputil_polyeval",
":__support_macros_optimization",
- ":common_constants",
+ ":__support_math_common_constants",
],
)
@@ -4488,7 +4495,7 @@ libc_math_function(
":__support_fputil_nearest_integer",
":__support_fputil_polyeval",
":__support_fputil_sqrt",
- ":common_constants",
+ ":__support_math_common_constants",
],
)
@@ -4503,7 +4510,7 @@ libc_math_function(
":__support_fputil_triple_double",
":__support_macros_optimization",
":__support_math_exp10f",
- ":common_constants",
+ ":__support_math_common_constants",
":exp2f_impl",
],
)
@@ -4664,7 +4671,7 @@ libc_math_function(
":__support_fputil_rounding_mode",
":__support_macros_optimization",
":__support_math_sinhfcoshf_utils",
- ":common_constants",
+ ":__support_math_common_constants",
],
)
@@ -4771,7 +4778,7 @@ libc_math_function(
":__support_macros_optimization",
":__support_macros_properties_cpu_features",
":__support_math_exp10f_utils",
- ":common_constants",
+ ":__support_math_common_constants",
],
)