115 files changed, 3135 insertions, 2296 deletions
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d2e5bd2..ab536ad 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -250,6 +250,8 @@ Non-comprehensive list of changes in this release - ``__builtin_assume_dereferenceable`` now accepts non-constant size operands. +- Fixed a crash when the second argument to ``__builtin_assume_aligned`` was not constant (#GH161314) + New Compiler Flags ------------------ - New option ``-fno-sanitize-debug-trap-reasons`` added to disable emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``). @@ -452,6 +454,7 @@ Bug Fixes to AST Handling Miscellaneous Bug Fixes ^^^^^^^^^^^^^^^^^^^^^^^ +- Fixed missing diagnostics of ``diagnose_if`` on templates involved in initialization. (#GH160776) Miscellaneous Clang Crashes Fixed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index f319b17..c961222 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -465,11 +465,16 @@ public: return nullptr; if (Value *Result = ConstantEmitter(CGF).tryEmitConstantExpr(E)) { - if (E->isGLValue()) + if (E->isGLValue()) { + // This was already converted to an rvalue when it was constant + // evaluated. + if (E->hasAPValueResult() && !E->getAPValueResult().isLValue()) + return Result; return CGF.EmitLoadOfScalar( Address(Result, CGF.convertTypeForLoadStore(E->getType()), CGF.getContext().getTypeAlignInChars(E->getType())), /*Volatile*/ false, E->getType(), E->getExprLoc()); + } return Result; } return Visit(E->getSubExpr()); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 7ce3513..3cc61b1 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5954,6 +5954,9 @@ bool Sema::BuiltinAssumeAligned(CallExpr *TheCall) { if (Result > Sema::MaximumAlignment) Diag(TheCall->getBeginLoc(), diag::warn_assume_aligned_too_great) << SecondArg->getSourceRange() << Sema::MaximumAlignment; + + TheCall->setArg(1, + ConstantExpr::Create(Context, SecondArg, APValue(Result))); } if (NumArgs > 2) { diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 8413090..11d2d5c 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -264,14 +264,6 @@ class HashParameterMapping : public RecursiveASTVisitor<HashParameterMapping> { UnsignedOrNone OuterPackSubstIndex; - TemplateArgument getPackSubstitutedTemplateArgument(TemplateArgument Arg) { - assert(*SemaRef.ArgPackSubstIndex < Arg.pack_size()); - Arg = Arg.pack_begin()[*SemaRef.ArgPackSubstIndex]; - if (Arg.isPackExpansion()) - Arg = Arg.getPackExpansionPattern(); - return Arg; - } - bool shouldVisitTemplateInstantiations() const { return true; } public: @@ -294,7 +286,7 @@ public: assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); - Arg = getPackSubstitutedTemplateArgument(Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } UsedTemplateArgs.push_back( @@ -312,7 +304,7 @@ public: if (NTTP->isParameterPack() && SemaRef.ArgPackSubstIndex) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); - Arg = getPackSubstitutedTemplateArgument(Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } UsedTemplateArgs.push_back( @@ -325,8 +317,11 @@ public: } bool TraverseDecl(Decl *D) { - if (auto *VD = dyn_cast<ValueDecl>(D)) + if (auto *VD = dyn_cast<ValueDecl>(D)) { + if 
(auto *Var = dyn_cast<VarDecl>(VD)) + TraverseStmt(Var->getInit()); return TraverseType(VD->getType()); + } return inherited::TraverseDecl(D); } @@ -363,6 +358,14 @@ public: return inherited::TraverseTemplateArgument(Arg); } + bool TraverseSizeOfPackExpr(SizeOfPackExpr *SOPE) { + return TraverseDecl(SOPE->getPack()); + } + + bool VisitSubstNonTypeTemplateParmExpr(SubstNonTypeTemplateParmExpr *E) { + return inherited::TraverseStmt(E->getReplacement()); + } + void VisitConstraint(const NormalizedConstraintWithParamMapping &Constraint) { if (!Constraint.hasParameterMapping()) { for (const auto &List : TemplateArgs) @@ -2083,8 +2086,8 @@ bool SubstituteParameterMappings::substitute(ConceptIdConstraint &CC) { /*UpdateArgsWithConversions=*/false)) return true; auto TemplateArgs = *MLTAL; - TemplateArgs.replaceOutermostTemplateArguments( - TemplateArgs.getAssociatedDecl(0).first, CTAI.SugaredConverted); + TemplateArgs.replaceOutermostTemplateArguments(CSE->getNamedConcept(), + CTAI.SugaredConverted); return SubstituteParameterMappings(SemaRef, &TemplateArgs, ArgsAsWritten, InFoldExpr) .substitute(CC.getNormalizedConstraint()); diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 0d0d2c0..922fcac 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -7539,7 +7539,7 @@ PerformConstructorInitialization(Sema &S, // Only check access if all of that succeeded. S.CheckConstructorAccess(Loc, Constructor, Step.Function.FoundDecl, Entity); - if (S.DiagnoseUseOfDecl(Step.Function.FoundDecl, Loc)) + if (S.DiagnoseUseOfOverloadedDecl(Constructor, Loc)) return ExprError(); if (const ArrayType *AT = S.Context.getAsArrayType(Entity.getType())) @@ -8092,7 +8092,7 @@ ExprResult InitializationSequence::Perform(Sema &S, S.CheckConstructorAccess(Kind.getLocation(), Constructor, FoundFn, Entity); - if (S.DiagnoseUseOfDecl(FoundFn, Kind.getLocation())) + if (S.DiagnoseUseOfOverloadedDecl(Constructor, Kind.getLocation())) return ExprError(); CastKind = CK_ConstructorConversion; @@ -8102,7 +8102,7 @@ ExprResult InitializationSequence::Perform(Sema &S, CXXConversionDecl *Conversion = cast<CXXConversionDecl>(Fn); S.CheckMemberOperatorAccess(Kind.getLocation(), CurInit.get(), nullptr, FoundFn); - if (S.DiagnoseUseOfDecl(FoundFn, Kind.getLocation())) + if (S.DiagnoseUseOfOverloadedDecl(Conversion, Kind.getLocation())) return ExprError(); CurInit = S.BuildCXXMemberCallExpr(CurInit.get(), FoundFn, Conversion, diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 6bba505..3baa977 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -6718,6 +6718,10 @@ struct MarkUsedTemplateParameterVisitor : DynamicRecursiveASTVisitor { } return true; } + + bool TraverseSizeOfPackExpr(SizeOfPackExpr *SOPE) override { + return TraverseDecl(SOPE->getPack()); + } }; } diff --git a/clang/test/Parser/cxx0x-lambda-expressions.cpp b/clang/test/Parser/cxx0x-lambda-expressions.cpp index f90f8ce..5b57c7f 100644 --- a/clang/test/Parser/cxx0x-lambda-expressions.cpp +++ b/clang/test/Parser/cxx0x-lambda-expressions.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++03 -Wno-c99-designator %s -Wno-c++11-extensions +// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++03 -Wno-c99-designator %s -Wno-c++11-extensions -Wno-local-type-template-args // RUN: %clang_cc1 -fsyntax-only 
-Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++11 -Wno-c99-designator %s // RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx17ext,cxx20ext,cxx23ext -std=c++14 -Wno-c99-designator %s // RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx20ext,cxx23ext -std=c++17 -Wno-c99-designator %s diff --git a/clang/test/SemaCXX/builtin-assume-aligned.cpp b/clang/test/SemaCXX/builtin-assume-aligned.cpp index 48bd841..30296c7 100644 --- a/clang/test/SemaCXX/builtin-assume-aligned.cpp +++ b/clang/test/SemaCXX/builtin-assume-aligned.cpp @@ -47,3 +47,16 @@ constexpr void *s1 = __builtin_assume_aligned(x, 32); constexpr void *s2 = __builtin_assume_aligned(x, 32, 5); constexpr void *s3 = __builtin_assume_aligned(x, 32, -1); + +constexpr int add(int a, int b) { + return a+b; +} +constexpr void *c1 = __builtin_assume_aligned(p, add(1,1)); +constexpr void *c2 = __builtin_assume_aligned(p, add(2,1)); // expected-error {{not a power of 2}} + +constexpr long kAlignment = 128; +long AllocateAlignedBytes_payload; +void AllocateAlignedBytes() { + void *m = __builtin_assume_aligned( + reinterpret_cast<void *>(AllocateAlignedBytes_payload), kAlignment); +} diff --git a/clang/test/SemaCXX/diagnose_if.cpp b/clang/test/SemaCXX/diagnose_if.cpp index 1b9e660..0af8bb7 100644 --- a/clang/test/SemaCXX/diagnose_if.cpp +++ b/clang/test/SemaCXX/diagnose_if.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 %s -verify -fno-builtin -std=c++14 +// RUN: %clang_cc1 %s -verify -fno-builtin -std=c++20 -verify=expected,cxx20 // RUN: %clang_cc1 %s -verify -fno-builtin -std=c++14 -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 %s -verify -fno-builtin -std=c++20 -verify=expected,cxx20 -fexperimental-new-constant-interpreter #define _diagnose_if(...) 
__attribute__((diagnose_if(__VA_ARGS__))) @@ -665,3 +667,28 @@ void run() { switch (constexpr Foo i = 2) { default: break; } // expected-error{{oh no}} } } + +namespace GH160776 { + +struct ConstructorTemplate { + template <class T> + explicit ConstructorTemplate(T x) + _diagnose_if(sizeof(T) == sizeof(char), "oh no", "error") {} // expected-note {{diagnose_if}} + + template <class T> +#if __cplusplus >= 202002L + requires (sizeof(T) == 1) // cxx20-note {{evaluated to false}} +#endif + operator T() _diagnose_if(sizeof(T) == sizeof(char), "oh no", "error") { // expected-note {{diagnose_if}} \ + // cxx20-note {{constraints not satisfied}} + return T{}; + } +}; + +void run() { + ConstructorTemplate x('1'); // expected-error {{oh no}} + char y = x; // expected-error {{oh no}} + int z = x; // cxx20-error {{no viable conversion}} +} + +} diff --git a/clang/test/SemaCXX/lambda-expressions.cpp b/clang/test/SemaCXX/lambda-expressions.cpp index 8ea8e32..f9d7cfc 100644 --- a/clang/test/SemaCXX/lambda-expressions.cpp +++ b/clang/test/SemaCXX/lambda-expressions.cpp @@ -149,7 +149,8 @@ namespace PR12031 { void f(int i, X x); void g() { const int v = 10; - f(v, [](){}); + f(v, [](){}); // cxx03-warning {{template argument uses local type}} \ + // cxx03-note {{while substituting}} } } @@ -572,26 +573,37 @@ namespace PR27994 { struct A { template <class T> A(T); }; template <class T> -struct B { +struct B { // #PR27994_B int x; - A a = [&] { int y = x; }; - A b = [&] { [&] { [&] { int y = x; }; }; }; - A d = [&](auto param) { int y = x; }; // cxx03-cxx11-error {{'auto' not allowed in lambda parameter}} - A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; }; // cxx03-cxx11-error 2 {{'auto' not allowed in lambda parameter}} + A a = [&] { int y = x; }; // cxx03-warning {{template argument uses unnamed type}} \ + // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}} + A b = [&] { [&] { [&] { int y = x; }; }; }; // cxx03-warning {{template argument uses unnamed type}} \ + // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}} + A d = [&](auto param) { int y = x; }; // cxx03-cxx11-error {{'auto' not allowed in lambda parameter}} \ + // cxx03-warning {{template argument uses unnamed type}} \ + // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}} + A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; }; // cxx03-cxx11-error 2 {{'auto' not allowed in lambda parameter}} \ + // cxx03-warning {{template argument uses unnamed type}} \ + // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}} }; B<int> b; +// cxx03-note@#PR27994_B 4{{in instantiation of default member initializer}} +// cxx03-note@-2 4{{in evaluation of exception}} template <class T> struct C { struct D { + // cxx03-note@-1 {{in instantiation of default member initializer}} int x; - A f = [&] { int y = x; }; + A f = [&] { int y = x; }; // cxx03-warning {{template argument uses unnamed type}} \ + // cxx03-note {{while substituting}} cxx03-note {{unnamed type used}} }; }; int func() { C<int> a; decltype(a)::D b; + // cxx03-note@-1 {{in evaluation of exception}} } } @@ -606,8 +618,12 @@ struct S1 { void foo1() { auto s0 = S1([name=]() {}); // expected-error {{expected expression}} + // cxx03-warning@-1 {{template argument uses local type}} \ + // cxx03-note@-1 {{while substituting deduced template arguments}} auto s1 = S1([name=name]() {}); // expected-error {{use of undeclared identifier 'name'; did you mean 'name1'?}} // cxx03-cxx11-warning@-1 {{initialized lambda 
captures are a C++14 extension}} + // cxx03-warning@-2 {{template argument uses local type}} \ + // cxx03-note@-2 {{while substituting deduced template arguments}} } } diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index 6d29f8b..e5e081f 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -1333,4 +1333,75 @@ static_assert(__cpp17_iterator<not_move_constructible>); \ // expected-note@#is_move_constructible_v {{because 'is_move_constructible_v<parameter_mapping_regressions::case3::not_move_constructible>' evaluated to false}} } +namespace case4 { + +template<bool b> +concept bool_ = b; + +template<typename... Ts> +concept unary = bool_<sizeof...(Ts) == 1>; + +static_assert(!unary<>); +static_assert(unary<void>); + +} + +namespace case5 { + +template<int size> +concept true1 = size == size; + +template<typename... Ts> +concept true2 = true1<sizeof...(Ts)>; + +template<typename... Ts> +concept true3 = true2<Ts...>; + +static_assert(true3<void>); + +} + +namespace case6 { + +namespace std { +template <int __v> +struct integral_constant { + static const int value = __v; +}; + +template <class _Tp, class... _Args> +constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...); + +template <class _From, class _To> +constexpr bool is_convertible_v = __is_convertible(_From, _To); + +template <class> +struct tuple_size; + +template <class _Tp> +constexpr decltype(sizeof(int)) tuple_size_v = tuple_size<_Tp>::value; +} // namespace std + +template <int N, int X> +concept FixedExtentConstructibleFromExtent = X == N; + +template <int Extent> +struct span { + int static constexpr extent = Extent; + template <typename R, int N = std::tuple_size_v<R>> + requires(FixedExtentConstructibleFromExtent<extent, N>) + span(R); +}; + +template <class, int> +struct array {}; + +template <class _Tp, decltype(sizeof(int)) _Size> +struct std::tuple_size<array<_Tp, _Size>> : integral_constant<_Size> {}; + +static_assert(std::is_convertible_v<array<int, 3>, span<3>>); +static_assert(!std::is_constructible_v<span<4>, array<int, 3>>); + +} + } diff --git a/libc/shared/math.h b/libc/shared/math.h index 4b2a0d8..924d0cb 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -47,6 +47,7 @@ #include "math/exp10f16.h" #include "math/exp10m1f.h" #include "math/exp10m1f16.h" +#include "math/exp2.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp2.h b/libc/shared/math/exp2.h new file mode 100644 index 0000000..6f1e143 --- /dev/null +++ b/libc/shared/math/exp2.h @@ -0,0 +1,23 @@ +//===-- Shared exp2 function ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP2_H +#define LLVM_LIBC_SHARED_MATH_EXP2_H + +#include "shared/libc_common.h" +#include "src/__support/math/exp2.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp2; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_EXP2_H diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 98f9bb42..4130fdf 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -374,6 +374,15 @@ add_header_library( ) add_header_library( + common_constants + HDRS + common_constants.h + DEPENDS + libc.src.__support.macros.config + libc.src.__support.number_pair +) + +add_header_library( cos HDRS cos.h @@ -705,6 +714,28 @@ add_header_library( ) add_header_library( + exp2 + HDRS + exp2.h + DEPENDS + .common_constants + .exp_utils + libc.src.__support.CPP.bit + libc.src.__support.CPP.optional + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.FPUtil.triple_double + libc.src.__support.integer_literals + libc.src.__support.macros.optimization + libc.src.errno.errno +) + +add_header_library( exp10 HDRS exp10.h diff --git a/libc/src/math/generic/common_constants.cpp b/libc/src/__support/math/common_constants.h index 2a15df2..53abbfe 100644 --- a/libc/src/math/generic/common_constants.cpp +++ b/libc/src/__support/math/common_constants.h @@ -6,12 +6,29 @@ // //===----------------------------------------------------------------------===// -#include "common_constants.h" +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_COMMON_CONSTANTS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_COMMON_CONSTANTS_H + #include "src/__support/macros/config.h" #include "src/__support/number_pair.h" namespace LIBC_NAMESPACE_DECL { +namespace common_constants_internal { + +// log(2) generated by Sollya with: +// > a = 2^-43 * nearestint(2^43*log(2)); +// LSB = 2^-43 is chosen so that e_x * LOG_2_HI is exact for -1075 < e_x < 1024. +static constexpr double LOG_2_HI = 0x1.62e42fefa38p-1; // LSB = 2^-43 +// > b = round(log10(2) - a, D, RN); +static constexpr double LOG_2_LO = 0x1.ef35793c7673p-45; // LSB = 2^-97 + +// Minimax polynomial for (log(1 + x) - x)/x^2, generated by sollya with: +// > P = fpminimax((log(1 + x) - x)/x^2, 5, [|D...|], [-2^-8, 2^-7]); +constexpr double LOG_COEFFS[6] = {-0x1.fffffffffffffp-2, 0x1.5555555554a9bp-2, + -0x1.0000000094567p-2, 0x1.99999dcc9823cp-3, + -0x1.55550ac2e537ap-3, 0x1.21a02c4e624d7p-3}; + // Range reduction constants for logarithms. // r(0) = 1, r(127) = 0.5 // r(k) = 2^-8 * ceil(2^8 * (1 - 2^-8) / (1 + k*2^-7)) @@ -19,7 +36,7 @@ namespace LIBC_NAMESPACE_DECL { // precision, and -2^-8 <= v < 2^-7. // TODO(lntue): Add reference to how the constants are derived after the // resulting paper is ready. 
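A minimal standalone sketch that cross-checks the closed form above, assuming only the formula and the special case r(0) = 1 stated in the comment (it is not part of this commit): it rebuilds each r(k) and verifies the claimed bound -2^-8 <= v < 2^-7 for v = r*(1 + k*2^-7) - 1.

#include <cmath>
#include <cstdio>

int main() {
  for (int k = 0; k < 128; ++k) {
    // r(k) = 2^-8 * ceil(2^8 * (1 - 2^-8) / (1 + k*2^-7)), with r(0) = 1
    // special-cased as in the comment.
    double r = (k == 0) ? 1.0
                        : 0x1p-8 * std::ceil(0x1p8 * (1.0 - 0x1p-8) /
                                             (1.0 + k * 0x1p-7));
    // r and (1 + k*2^-7) each need far fewer than 26 significant bits, so the
    // product and the subtraction below are exact in double precision.
    double v = r * (1.0 + k * 0x1p-7) - 1.0;
    if (v < -0x1p-8 || v >= 0x1p-7)
      std::printf("bound violated at k = %d (v = %a)\n", k, v);
  }
  return 0;
}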
-alignas(8) const float R[128] = { +alignas(8) static constexpr float R[128] = { 0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.e8p-1, 0x1.e4p-1, 0x1.ep-1, 0x1.dep-1, 0x1.dap-1, 0x1.d6p-1, 0x1.d4p-1, 0x1.dp-1, 0x1.ccp-1, 0x1.cap-1, 0x1.c6p-1, 0x1.c4p-1, 0x1.cp-1, 0x1.bep-1, 0x1.bap-1, @@ -40,7 +57,7 @@ alignas(8) const float R[128] = { 0x1.0ap-1, 0x1.08p-1, 0x1.08p-1, 0x1.06p-1, 0x1.06p-1, 0x1.04p-1, 0x1.04p-1, 0x1.02p-1, 0x1.0p-1}; -const double RD[128] = { +static constexpr double RD[128] = { 0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.e8p-1, 0x1.e4p-1, 0x1.ep-1, 0x1.dep-1, 0x1.dap-1, 0x1.d6p-1, 0x1.d4p-1, 0x1.dp-1, 0x1.ccp-1, 0x1.cap-1, 0x1.c6p-1, 0x1.c4p-1, 0x1.cp-1, 0x1.bep-1, 0x1.bap-1, @@ -65,7 +82,7 @@ const double RD[128] = { // available. // Generated by Sollya with the formula: CD[i] = RD[i]*(1 + i*2^-7) - 1 // for RD[i] defined on the table above. -const double CD[128] = { +static constexpr double CD[128] = { 0.0, -0x1p-14, -0x1p-12, -0x1.2p-11, -0x1p-10, -0x1.9p-10, -0x1.2p-9, -0x1.88p-9, -0x1p-8, -0x1.9p-11, -0x1.fp-10, -0x1.9cp-9, -0x1p-12, -0x1.cp-10, -0x1.bp-9, -0x1.5p-11, -0x1.4p-9, 0x1p-14, @@ -90,7 +107,7 @@ const double CD[128] = { -0x1p-14, -0x1p-8, }; -const double LOG_R[128] = { +static constexpr double LOG_R[128] = { 0x0.0000000000000p0, 0x1.010157588de71p-7, 0x1.0205658935847p-6, 0x1.8492528c8cabfp-6, 0x1.0415d89e74444p-5, 0x1.466aed42de3eap-5, 0x1.894aa149fb343p-5, 0x1.ccb73cdddb2ccp-5, 0x1.08598b59e3a07p-4, @@ -135,7 +152,7 @@ const double LOG_R[128] = { 0x1.5707a26bb8c66p-1, 0x1.5af405c3649ep-1, 0x1.5af405c3649ep-1, 0x1.5ee82aa24192p-1, 0x0.000000000000p0}; -const double LOG2_R[128] = { +static constexpr double LOG2_R[128] = { 0x0.0000000000000p+0, 0x1.72c7ba20f7327p-7, 0x1.743ee861f3556p-6, 0x1.184b8e4c56af8p-5, 0x1.77394c9d958d5p-5, 0x1.d6ebd1f1febfep-5, 0x1.1bb32a600549dp-4, 0x1.4c560fe68af88p-4, 0x1.7d60496cfbb4cp-4, @@ -188,7 +205,7 @@ const double LOG2_R[128] = { // print("{", -c, ",", -b, "},"); // }; // We replace LOG_R[0] with log10(1.0) == 0.0 -alignas(16) const NumberPair<double> LOG_R_DD[128] = { +alignas(16) static constexpr NumberPair<double> LOG_R_DD[128] = { {0.0, 0.0}, {-0x1.0c76b999d2be8p-46, 0x1.010157589p-7}, {-0x1.3dc5b06e2f7d2p-45, 0x1.0205658938p-6}, @@ -324,7 +341,7 @@ alignas(16) const NumberPair<double> LOG_R_DD[128] = { // Output range: // [-0x1.3ffcp-15, 0x1.3e3dp-15] // We store S2[i] = 2^16 (r(i - 2^6) - 1). -alignas(8) const int S2[193] = { +alignas(8) static constexpr int S2[193] = { 0x101, 0xfd, 0xf9, 0xf5, 0xf1, 0xed, 0xe9, 0xe5, 0xe1, 0xdd, 0xd9, 0xd5, 0xd1, 0xcd, 0xc9, 0xc5, 0xc1, 0xbd, 0xb9, 0xb4, 0xb0, 0xac, 0xa8, 0xa4, 0xa0, 0x9c, 0x98, @@ -348,7 +365,7 @@ alignas(8) const int S2[193] = { -0x1cd, -0x1d1, -0x1d5, -0x1d9, -0x1dd, -0x1e0, -0x1e4, -0x1e8, -0x1ec, -0x1f0, -0x1f4, -0x1f8, -0x1fc}; -const double R2[193] = { +static constexpr double R2[193] = { 0x1.0101p0, 0x1.00fdp0, 0x1.00f9p0, 0x1.00f5p0, 0x1.00f1p0, 0x1.00edp0, 0x1.00e9p0, 0x1.00e5p0, 0x1.00e1p0, 0x1.00ddp0, 0x1.00d9p0, 0x1.00d5p0, 0x1.00d1p0, 0x1.00cdp0, 0x1.00c9p0, @@ -395,7 +412,7 @@ const double R2[193] = { // Output range: // [-0x1.01928p-22 , 0x1p-22] // We store S[i] = 2^21 (r(i - 80) - 1). 
-alignas(8) const int S3[161] = { +alignas(8) static constexpr int S3[161] = { 0x50, 0x4f, 0x4e, 0x4d, 0x4c, 0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45, 0x44, 0x43, 0x42, 0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30, @@ -418,7 +435,7 @@ alignas(8) const int S3[161] = { // Output range: // [-0x1.0002143p-29 , 0x1p-29] // We store S[i] = 2^28 (r(i - 65) - 1). -alignas(8) const int S4[130] = { +alignas(8) static constexpr int S4[130] = { 0x41, 0x40, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, @@ -439,7 +456,7 @@ alignas(8) const int S4[130] = { // Table is generated with Sollya as follow: // > display = hexadecimal; // > for i from -104 to 89 do { D(exp(i)); }; -const double EXP_M1[195] = { +static constexpr double EXP_M1[195] = { 0x1.f1e6b68529e33p-151, 0x1.525be4e4e601dp-149, 0x1.cbe0a45f75eb1p-148, 0x1.3884e838aea68p-146, 0x1.a8c1f14e2af5dp-145, 0x1.20a717e64a9bdp-143, 0x1.8851d84118908p-142, 0x1.0a9bdfb02d240p-140, 0x1.6a5bea046b42ep-139, @@ -511,7 +528,7 @@ const double EXP_M1[195] = { // Table is generated with Sollya as follow: // > display = hexadecimal; // > for i from 0 to 127 do { D(exp(i / 128)); }; -const double EXP_M2[128] = { +static constexpr double EXP_M2[128] = { 0x1.0000000000000p0, 0x1.0202015600446p0, 0x1.04080ab55de39p0, 0x1.06122436410ddp0, 0x1.08205601127edp0, 0x1.0a32a84e9c1f6p0, 0x1.0c49236829e8cp0, 0x1.0e63cfa7ab09dp0, 0x1.1082b577d34edp0, @@ -557,4 +574,8 @@ const double EXP_M2[128] = { 0x1.568bb722dd593p1, 0x1.593b7d72305bbp1, }; +} // namespace common_constants_internal + } // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_COMMON_CONSTANTS_H diff --git a/libc/src/__support/math/exp2.h b/libc/src/__support/math/exp2.h new file mode 100644 index 0000000..7eaa465 --- /dev/null +++ b/libc/src/__support/math/exp2.h @@ -0,0 +1,425 @@ +//===-- Implementation header for exp2 --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP2_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP2_H + +#include "common_constants.h" // Lookup tables EXP2_MID1 and EXP_M2. +#include "exp_constants.h" +#include "exp_utils.h" // ziv_test_denorm. 
+#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/optional.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/FPUtil/triple_double.h" +#include "src/__support/common.h" +#include "src/__support/integer_literals.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +namespace exp2_internal { + +using namespace common_constants_internal; + +using fputil::DoubleDouble; +using fputil::TripleDouble; +using Float128 = typename fputil::DyadicFloat<128>; + +using LIBC_NAMESPACE::operator""_u128; + +// Error bounds: +// Errors when using double precision. +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE +constexpr double ERR_D = 0x1.0p-63; +#else +constexpr double ERR_D = 0x1.8p-63; +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +// Errors when using double-double precision. +constexpr double ERR_DD = 0x1.0p-100; +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// Polynomial approximations with double precision. Generated by Sollya with: +// > P = fpminimax((2^x - 1)/x, 3, [|D...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]); +// > P; +// Error bounds: +// | output - (2^dx - 1) / dx | < 1.5 * 2^-52. +LIBC_INLINE static double poly_approx_d(double dx) { + // dx^2 + double dx2 = dx * dx; + double c0 = + fputil::multiply_add(dx, 0x1.ebfbdff82c58ep-3, 0x1.62e42fefa39efp-1); + double c1 = + fputil::multiply_add(dx, 0x1.3b2aba7a95a89p-7, 0x1.c6b08e8fc0c0ep-5); + double p = fputil::multiply_add(dx2, c1, c0); + return p; +} + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +// Polynomial approximation with double-double precision. Generated by Solya +// with: +// > P = fpminimax((2^x - 1)/x, 5, [|DD...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]); +// Error bounds: +// | output - 2^(dx) | < 2^-101 +LIBC_INLINE static constexpr DoubleDouble +poly_approx_dd(const DoubleDouble &dx) { + // Taylor polynomial. + constexpr DoubleDouble COEFFS[] = { + {0, 0x1p0}, + {0x1.abc9e3b39824p-56, 0x1.62e42fefa39efp-1}, + {-0x1.5e43a53e4527bp-57, 0x1.ebfbdff82c58fp-3}, + {-0x1.d37963a9444eep-59, 0x1.c6b08d704a0cp-5}, + {0x1.4eda1a81133dap-62, 0x1.3b2ab6fba4e77p-7}, + {-0x1.c53fd1ba85d14p-64, 0x1.5d87fe7a265a5p-10}, + {0x1.d89250b013eb8p-70, 0x1.430912f86cb8ep-13}, + }; + + DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2], + COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]); + return p; +} + +// Polynomial approximation with 128-bit precision: +// Return exp(dx) ~ 1 + a0 * dx + a1 * dx^2 + ... + a6 * dx^7 +// For |dx| < 2^-13 + 2^-30: +// | output - exp(dx) | < 2^-126. 
+LIBC_INLINE static constexpr Float128 poly_approx_f128(const Float128 &dx) { + constexpr Float128 COEFFS_128[]{ + {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 + {Sign::POS, -128, 0xb17217f7'd1cf79ab'c9e3b398'03f2f6af_u128}, + {Sign::POS, -128, 0x3d7f7bff'058b1d50'de2d60dd'9c9a1d9f_u128}, + {Sign::POS, -132, 0xe35846b8'2505fc59'9d3b15d9'e7fb6897_u128}, + {Sign::POS, -134, 0x9d955b7d'd273b94e'184462f6'bcd2b9e7_u128}, + {Sign::POS, -137, 0xaec3ff3c'53398883'39ea1bb9'64c51a89_u128}, + {Sign::POS, -138, 0x2861225f'345c396a'842c5341'8fa8ae61_u128}, + {Sign::POS, -144, 0xffe5fe2d'109a319d'7abeb5ab'd5ad2079_u128}, + }; + + Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2], + COEFFS_128[3], COEFFS_128[4], COEFFS_128[5], + COEFFS_128[6], COEFFS_128[7]); + return p; +} + +// Compute 2^(x) using 128-bit precision. +// TODO(lntue): investigate triple-double precision implementation for this +// step. +LIBC_INLINE static constexpr Float128 exp2_f128(double x, int hi, int idx1, + int idx2) { + Float128 dx = Float128(x); + + // TODO: Skip recalculating exp_mid1 and exp_mid2. + Float128 exp_mid1 = + fputil::quick_add(Float128(EXP2_MID1[idx1].hi), + fputil::quick_add(Float128(EXP2_MID1[idx1].mid), + Float128(EXP2_MID1[idx1].lo))); + + Float128 exp_mid2 = + fputil::quick_add(Float128(EXP2_MID2[idx2].hi), + fputil::quick_add(Float128(EXP2_MID2[idx2].mid), + Float128(EXP2_MID2[idx2].lo))); + + Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2); + + Float128 p = poly_approx_f128(dx); + + Float128 r = fputil::quick_mul(exp_mid, p); + + r.exponent += hi; + + return r; +} + +// Compute 2^x with double-double precision. +LIBC_INLINE static DoubleDouble +exp2_double_double(double x, const DoubleDouble &exp_mid) { + DoubleDouble dx({0, x}); + + // Degree-6 polynomial approximation in double-double precision. + // | p - 2^x | < 2^-103. + DoubleDouble p = poly_approx_dd(dx); + + // Error bounds: 2^-102. + DoubleDouble r = fputil::quick_mult(exp_mid, p); + + return r; +} +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// When output is denormal. +LIBC_INLINE static double exp2_denorm(double x) { + // Range reduction. + int k = + static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19); + double kd = static_cast<double>(k); + + uint32_t idx1 = (k >> 6) & 0x3f; + uint32_t idx2 = k & 0x3f; + + int hi = k >> 12; + + DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; + DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; + DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); + + // |dx| < 2^-13 + 2^-30. + double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact + + double mid_lo = dx * exp_mid.hi; + + // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. 
+ double p = poly_approx_d(dx); + + double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + return ziv_test_denorm</*SKIP_ZIV_TEST=*/true>(hi, exp_mid.hi, lo, ERR_D) + .value(); +#else + if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); + LIBC_LIKELY(r.has_value())) + return r.value(); + + // Use double-double + DoubleDouble r_dd = exp2_double_double(dx, exp_mid); + + if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); + LIBC_LIKELY(r.has_value())) + return r.value(); + + // Use 128-bit precision + Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2); + + return static_cast<double>(r_f128); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS +} + +// Check for exceptional cases when: +// * log2(1 - 2^-54) < x < log2(1 + 2^-53) +// * x >= 1024 +// * x <= -1022 +// * x is inf or nan +LIBC_INLINE static constexpr double set_exceptional(double x) { + using FPBits = typename fputil::FPBits<double>; + FPBits xbits(x); + + uint64_t x_u = xbits.uintval(); + uint64_t x_abs = xbits.abs().uintval(); + + // |x| < log2(1 + 2^-53) + if (x_abs <= 0x3ca71547652b82fd) { + // 2^(x) ~ 1 + x/2 + return fputil::multiply_add(x, 0.5, 1.0); + } + + // x <= -1022 || x >= 1024 or inf/nan. + if (x_u > 0xc08ff00000000000) { + // x <= -1075 or -inf/nan + if (x_u >= 0xc090cc0000000000) { + // exp(-Inf) = 0 + if (xbits.is_inf()) + return 0.0; + + // exp(nan) = nan + if (xbits.is_nan()) + return x; + + if (fputil::quick_get_round() == FE_UPWARD) + return FPBits::min_subnormal().get_val(); + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_UNDERFLOW); + return 0.0; + } + + return exp2_denorm(x); + } + + // x >= 1024 or +inf/nan + // x is finite + if (x_u < 0x7ff0'0000'0000'0000ULL) { + int rounding = fputil::quick_get_round(); + if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) + return FPBits::max_normal().get_val(); + + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW); + } + // x is +inf or nan + return x + FPBits::inf().get_val(); +} + +} // namespace exp2_internal + +LIBC_INLINE static constexpr double exp2(double x) { + using namespace exp2_internal; + using FPBits = typename fputil::FPBits<double>; + FPBits xbits(x); + + uint64_t x_u = xbits.uintval(); + + // x < -1022 or x >= 1024 or log2(1 - 2^-54) < x < log2(1 + 2^-53). + if (LIBC_UNLIKELY(x_u > 0xc08ff00000000000 || + (x_u <= 0xbc971547652b82fe && x_u >= 0x4090000000000000) || + x_u <= 0x3ca71547652b82fd)) { + return set_exceptional(x); + } + + // Now -1075 < x <= log2(1 - 2^-54) or log2(1 + 2^-53) < x < 1024 + + // Range reduction: + // Let x = (hi + mid1 + mid2) + lo + // in which: + // hi is an integer + // mid1 * 2^6 is an integer + // mid2 * 2^12 is an integer + // then: + // 2^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 2^(lo). + // With this formula: + // - multiplying by 2^hi is exact and cheap, simply by adding the exponent + // field. + // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables. + // - 2^(lo) ~ 1 + a0*lo + a1 * lo^2 + ... + // + // We compute (hi + mid1 + mid2) together by perform the rounding on x * 2^12. + // Since |x| < |-1075)| < 2^11, + // |x * 2^12| < 2^11 * 2^12 < 2^23, + // So we can fit the rounded result round(x * 2^12) in int32_t. + // Thus, the goal is to be able to use an additional addition and fixed width + // shift to get an int32_t representing round(x * 2^12). 
+ // + // Assuming int32_t using 2-complement representation, since the mantissa part + // of a double precision is unsigned with the leading bit hidden, if we add an + // extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 2^25 to the product, the + // part that are < 2^e2 in resulted mantissa of (x*2^12*L2E + C) can be + // considered as a proper 2-complement representations of x*2^12. + // + // One small problem with this approach is that the sum (x*2^12 + C) in + // double precision is rounded to the least significant bit of the dorminant + // factor C. In order to minimize the rounding errors from this addition, we + // want to minimize e1. Another constraint that we want is that after + // shifting the mantissa so that the least significant bit of int32_t + // corresponds to the unit bit of (x*2^12*L2E), the sign is correct without + // any adjustment. So combining these 2 requirements, we can choose + // C = 2^33 + 2^32, so that the sign bit corresponds to 2^31 bit, and hence + // after right shifting the mantissa, the resulting int32_t has correct sign. + // With this choice of C, the number of mantissa bits we need to shift to the + // right is: 52 - 33 = 19. + // + // Moreover, since the integer right shifts are equivalent to rounding down, + // we can add an extra 0.5 so that it will become round-to-nearest, tie-to- + // +infinity. So in particular, we can compute: + // hmm = x * 2^12 + C, + // where C = 2^33 + 2^32 + 2^-1, then if + // k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19), + // the reduced argument: + // lo = x - 2^-12 * k is bounded by: + // |lo| <= 2^-13 + 2^-12*2^-19 + // = 2^-13 + 2^-31. + // + // Finally, notice that k only uses the mantissa of x * 2^12, so the + // exponent 2^12 is not needed. So we can simply define + // C = 2^(33 - 12) + 2^(32 - 12) + 2^(-13 - 12), and + // k = int32_t(lower 51 bits of double(x + C) >> 19). + + // Rounding errors <= 2^-31. + int k = + static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19); + double kd = static_cast<double>(k); + + uint32_t idx1 = (k >> 6) & 0x3f; + uint32_t idx2 = k & 0x3f; + + int hi = k >> 12; + + DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; + DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; + DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); + + // |dx| < 2^-13 + 2^-30. + double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact + + // We use the degree-4 polynomial to approximate 2^(lo): + // 2^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4 = 1 + lo * P(lo) + // So that the errors are bounded by: + // |P(lo) - (2^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58 + // Let P_ be an evaluation of P where all intermediate computations are in + // double precision. Using either Horner's or Estrin's schemes, the evaluated + // errors can be bounded by: + // |P_(lo) - P(lo)| < 2^-51 + // => |lo * P_(lo) - (2^lo - 1) | < 2^-64 + // => 2^(mid1 + mid2) * |lo * P_(lo) - expm1(lo)| < 2^-63. + // Since we approximate + // 2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo, + // We use the expression: + // (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~ + // ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo) + // with errors bounded by 2^-63. + + double mid_lo = dx * exp_mid.hi; + + // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. 
+ double p = poly_approx_d(dx); + + double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + // To multiply by 2^hi, a fast way is to simply add hi to the exponent + // field. + int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; + double r = + cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(exp_mid.hi + lo)); + return r; +#else + double upper = exp_mid.hi + (lo + ERR_D); + double lower = exp_mid.hi + (lo - ERR_D); + + if (LIBC_LIKELY(upper == lower)) { + // To multiply by 2^hi, a fast way is to simply add hi to the exponent + // field. + int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; + double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper)); + return r; + } + + // Use double-double + DoubleDouble r_dd = exp2_double_double(dx, exp_mid); + + double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); + double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); + + if (LIBC_LIKELY(upper_dd == lower_dd)) { + // To multiply by 2^hi, a fast way is to simply add hi to the exponent + // field. + int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; + double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper_dd)); + return r; + } + + // Use 128-bit precision + Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2); + + return static_cast<double>(r_f128); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP2_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 99c1b08..28ea475 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1448,21 +1448,7 @@ add_entrypoint_object( HDRS ../exp2.h DEPENDS - .common_constants - libc.src.__support.CPP.bit - libc.src.__support.CPP.optional - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.FPUtil.triple_double - libc.src.__support.integer_literals - libc.src.__support.macros.optimization - libc.src.__support.math.exp_utils - libc.src.errno.errno + libc.src.__support.math.exp2 ) add_header_library( @@ -1613,7 +1599,6 @@ add_entrypoint_object( HDRS ../expm1.h DEPENDS - .common_constants libc.src.__support.CPP.bit libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl @@ -1624,6 +1609,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.triple_double libc.src.__support.integer_literals libc.src.__support.macros.optimization + libc.src.__support.math.common_constants libc.src.errno.errno ) @@ -1634,7 +1620,6 @@ add_entrypoint_object( HDRS ../expm1f.h DEPENDS - .common_constants libc.src.__support.FPUtil.basic_operations libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits @@ -1643,6 +1628,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization + libc.src.__support.math.common_constants libc.src.errno.errno ) @@ -1673,7 +1659,6 @@ add_entrypoint_object( HDRS ../powf.h DEPENDS - .common_constants .exp2f_impl libc.src.__support.math.exp10f libc.src.__support.CPP.bit @@ -1685,6 +1670,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.sqrt libc.src.__support.FPUtil.triple_double libc.src.__support.macros.optimization + 
libc.src.__support.math.common_constants libc.src.errno.errno ) @@ -1695,7 +1681,6 @@ add_entrypoint_object( HDRS ../pow.h DEPENDS - .common_constants libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.__support.CPP.bit @@ -1707,6 +1692,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.sqrt libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2043,26 +2029,14 @@ add_entrypoint_object( libc.src.__support.macros.properties.types ) -add_object_library( - common_constants - HDRS - common_constants.h - SRCS - common_constants.cpp - DEPENDS - libc.src.__support.math.exp_constants - libc.src.__support.math.acosh_float_constants - libc.src.__support.number_pair -) - add_header_library( log_range_reduction HDRS log_range_reduction.h DEPENDS - .common_constants - libc.src.__support.uint128 libc.src.__support.FPUtil.dyadic_float + libc.src.__support.math.common_constants + libc.src.__support.uint128 ) add_entrypoint_object( @@ -2072,7 +2046,6 @@ add_entrypoint_object( HDRS ../log10.h DEPENDS - .common_constants .log_range_reduction libc.src.__support.FPUtil.double_double libc.src.__support.FPUtil.dyadic_float @@ -2082,6 +2055,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.integer_literals libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2091,12 +2065,12 @@ add_entrypoint_object( HDRS ../log10f.h DEPENDS - .common_constants libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fma libc.src.__support.FPUtil.polyeval + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2126,7 +2100,6 @@ add_entrypoint_object( HDRS ../log1p.h DEPENDS - .common_constants libc.src.__support.FPUtil.double_double libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl @@ -2135,6 +2108,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.integer_literals libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2144,13 +2118,13 @@ add_entrypoint_object( HDRS ../log1pf.h DEPENDS - .common_constants libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fma libc.src.__support.FPUtil.polyeval libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2160,7 +2134,6 @@ add_entrypoint_object( HDRS ../log2.h DEPENDS - .common_constants .log_range_reduction libc.src.__support.FPUtil.double_double libc.src.__support.FPUtil.dyadic_float @@ -2170,6 +2143,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.integer_literals libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2179,13 +2153,13 @@ add_entrypoint_object( HDRS ../log2f.h DEPENDS - .common_constants libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fma libc.src.__support.FPUtil.polyeval libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2215,7 +2189,6 @@ add_entrypoint_object( HDRS ../log.h DEPENDS - .common_constants .log_range_reduction libc.src.__support.FPUtil.double_double 
libc.src.__support.FPUtil.dyadic_float @@ -2225,6 +2198,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.integer_literals libc.src.__support.macros.optimization + libc.src.__support.math.common_constants ) add_entrypoint_object( @@ -2234,7 +2208,6 @@ add_entrypoint_object( HDRS ../logf.h DEPENDS - .common_constants libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits @@ -2242,6 +2215,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.macros.optimization libc.src.__support.macros.properties.cpu_features + libc.src.__support.math.common_constants ) add_entrypoint_object( diff --git a/libc/src/math/generic/common_constants.h b/libc/src/math/generic/common_constants.h deleted file mode 100644 index 9ee31f0..0000000 --- a/libc/src/math/generic/common_constants.h +++ /dev/null @@ -1,73 +0,0 @@ -//===-- Common constants for math functions ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H -#define LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H - -#include "src/__support/FPUtil/triple_double.h" -#include "src/__support/macros/config.h" -#include "src/__support/math/acosh_float_constants.h" -#include "src/__support/math/exp_constants.h" -#include "src/__support/number_pair.h" - -namespace LIBC_NAMESPACE_DECL { - -// Lookup table for range reduction constants r for logarithms. -extern const float R[128]; - -// Lookup table for range reduction constants r for logarithms. -extern const double RD[128]; - -// Lookup table for compensated constants for exact range reduction when FMA -// instructions are not available. -extern const double CD[128]; - -// Lookup table for -log(r) -extern const double LOG_R[128]; -extern const NumberPair<double> LOG_R_DD[128]; - -// Lookup table for -log2(r) -extern const double LOG2_R[128]; - -// Minimax polynomial for (log(1 + x) - x)/x^2, generated by sollya with: -// > P = fpminimax((log(1 + x) - x)/x^2, 5, [|D...|], [-2^-8, 2^-7]); -constexpr double LOG_COEFFS[6] = {-0x1.fffffffffffffp-2, 0x1.5555555554a9bp-2, - -0x1.0000000094567p-2, 0x1.99999dcc9823cp-3, - -0x1.55550ac2e537ap-3, 0x1.21a02c4e624d7p-3}; - -// Logarithm Range Reduction - Step 2, 3, and 4. -extern const int S2[193]; -extern const int S3[161]; -extern const int S4[130]; - -extern const double R2[193]; - -// log(2) generated by Sollya with: -// > a = 2^-43 * nearestint(2^43*log(2)); -// LSB = 2^-43 is chosen so that e_x * LOG_2_HI is exact for -1075 < e_x < 1024. -constexpr double LOG_2_HI = 0x1.62e42fefa38p-1; // LSB = 2^-43 -// > b = round(log10(2) - a, D, RN); -constexpr double LOG_2_LO = 0x1.ef35793c7673p-45; // LSB = 2^-97 - -// Lookup table for exp(m) with m = -104, ..., 89. -// -104 = floor(log(single precision's min denormal)) -// 89 = ceil(log(single precision's max normal)) -// Table is generated with Sollya as follow: -// > display = hexadecimal; -// > for i from -104 to 89 do { D(exp(i)); }; -extern const double EXP_M1[195]; - -// Lookup table for exp(m * 2^(-7)) with m = 0, ..., 127. 
-// Table is generated with Sollya as follow: -// > display = hexadecimal; -// > for i from 0 to 127 do { D(exp(i / 128)); }; -extern const double EXP_M2[128]; - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp index 154154f..20e1ff5 100644 --- a/libc/src/math/generic/exp2.cpp +++ b/libc/src/math/generic/exp2.cpp @@ -7,404 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/exp2.h" -#include "common_constants.h" // Lookup tables EXP2_MID1 and EXP_M2. -#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/optional.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/dyadic_float.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/FPUtil/triple_double.h" -#include "src/__support/common.h" -#include "src/__support/integer_literals.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "src/__support/math/exp_utils.h" // ziv_test_denorm. +#include "src/__support/math/exp2.h" namespace LIBC_NAMESPACE_DECL { -using fputil::DoubleDouble; -using fputil::TripleDouble; -using Float128 = typename fputil::DyadicFloat<128>; - -using LIBC_NAMESPACE::operator""_u128; - -// Error bounds: -// Errors when using double precision. -#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE -constexpr double ERR_D = 0x1.0p-63; -#else -constexpr double ERR_D = 0x1.8p-63; -#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -// Errors when using double-double precision. -constexpr double ERR_DD = 0x1.0p-100; -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -namespace { - -// Polynomial approximations with double precision. Generated by Sollya with: -// > P = fpminimax((2^x - 1)/x, 3, [|D...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]); -// > P; -// Error bounds: -// | output - (2^dx - 1) / dx | < 1.5 * 2^-52. -LIBC_INLINE double poly_approx_d(double dx) { - // dx^2 - double dx2 = dx * dx; - double c0 = - fputil::multiply_add(dx, 0x1.ebfbdff82c58ep-3, 0x1.62e42fefa39efp-1); - double c1 = - fputil::multiply_add(dx, 0x1.3b2aba7a95a89p-7, 0x1.c6b08e8fc0c0ep-5); - double p = fputil::multiply_add(dx2, c1, c0); - return p; -} - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -// Polynomial approximation with double-double precision. Generated by Solya -// with: -// > P = fpminimax((2^x - 1)/x, 5, [|DD...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]); -// Error bounds: -// | output - 2^(dx) | < 2^-101 -DoubleDouble poly_approx_dd(const DoubleDouble &dx) { - // Taylor polynomial. - constexpr DoubleDouble COEFFS[] = { - {0, 0x1p0}, - {0x1.abc9e3b39824p-56, 0x1.62e42fefa39efp-1}, - {-0x1.5e43a53e4527bp-57, 0x1.ebfbdff82c58fp-3}, - {-0x1.d37963a9444eep-59, 0x1.c6b08d704a0cp-5}, - {0x1.4eda1a81133dap-62, 0x1.3b2ab6fba4e77p-7}, - {-0x1.c53fd1ba85d14p-64, 0x1.5d87fe7a265a5p-10}, - {0x1.d89250b013eb8p-70, 0x1.430912f86cb8ep-13}, - }; - - DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2], - COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]); - return p; -} - -// Polynomial approximation with 128-bit precision: -// Return exp(dx) ~ 1 + a0 * dx + a1 * dx^2 + ... 
+ a6 * dx^7 -// For |dx| < 2^-13 + 2^-30: -// | output - exp(dx) | < 2^-126. -Float128 poly_approx_f128(const Float128 &dx) { - constexpr Float128 COEFFS_128[]{ - {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 - {Sign::POS, -128, 0xb17217f7'd1cf79ab'c9e3b398'03f2f6af_u128}, - {Sign::POS, -128, 0x3d7f7bff'058b1d50'de2d60dd'9c9a1d9f_u128}, - {Sign::POS, -132, 0xe35846b8'2505fc59'9d3b15d9'e7fb6897_u128}, - {Sign::POS, -134, 0x9d955b7d'd273b94e'184462f6'bcd2b9e7_u128}, - {Sign::POS, -137, 0xaec3ff3c'53398883'39ea1bb9'64c51a89_u128}, - {Sign::POS, -138, 0x2861225f'345c396a'842c5341'8fa8ae61_u128}, - {Sign::POS, -144, 0xffe5fe2d'109a319d'7abeb5ab'd5ad2079_u128}, - }; - - Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2], - COEFFS_128[3], COEFFS_128[4], COEFFS_128[5], - COEFFS_128[6], COEFFS_128[7]); - return p; -} - -// Compute 2^(x) using 128-bit precision. -// TODO(lntue): investigate triple-double precision implementation for this -// step. -Float128 exp2_f128(double x, int hi, int idx1, int idx2) { - Float128 dx = Float128(x); - - // TODO: Skip recalculating exp_mid1 and exp_mid2. - Float128 exp_mid1 = - fputil::quick_add(Float128(EXP2_MID1[idx1].hi), - fputil::quick_add(Float128(EXP2_MID1[idx1].mid), - Float128(EXP2_MID1[idx1].lo))); - - Float128 exp_mid2 = - fputil::quick_add(Float128(EXP2_MID2[idx2].hi), - fputil::quick_add(Float128(EXP2_MID2[idx2].mid), - Float128(EXP2_MID2[idx2].lo))); - - Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2); - - Float128 p = poly_approx_f128(dx); - - Float128 r = fputil::quick_mul(exp_mid, p); - - r.exponent += hi; - - return r; -} - -// Compute 2^x with double-double precision. -DoubleDouble exp2_double_double(double x, const DoubleDouble &exp_mid) { - DoubleDouble dx({0, x}); - - // Degree-6 polynomial approximation in double-double precision. - // | p - 2^x | < 2^-103. - DoubleDouble p = poly_approx_dd(dx); - - // Error bounds: 2^-102. - DoubleDouble r = fputil::quick_mult(exp_mid, p); - - return r; -} -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -// When output is denormal. -double exp2_denorm(double x) { - // Range reduction. - int k = - static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19); - double kd = static_cast<double>(k); - - uint32_t idx1 = (k >> 6) & 0x3f; - uint32_t idx2 = k & 0x3f; - - int hi = k >> 12; - - DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; - DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; - DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); - - // |dx| < 2^-13 + 2^-30. - double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact - - double mid_lo = dx * exp_mid.hi; - - // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. 
- double p = poly_approx_d(dx); - - double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - return ziv_test_denorm</*SKIP_ZIV_TEST=*/true>(hi, exp_mid.hi, lo, ERR_D) - .value(); -#else - if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); - LIBC_LIKELY(r.has_value())) - return r.value(); - - // Use double-double - DoubleDouble r_dd = exp2_double_double(dx, exp_mid); - - if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); - LIBC_LIKELY(r.has_value())) - return r.value(); - - // Use 128-bit precision - Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2); - - return static_cast<double>(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} - -// Check for exceptional cases when: -// * log2(1 - 2^-54) < x < log2(1 + 2^-53) -// * x >= 1024 -// * x <= -1022 -// * x is inf or nan -double set_exceptional(double x) { - using FPBits = typename fputil::FPBits<double>; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - uint64_t x_abs = xbits.abs().uintval(); - - // |x| < log2(1 + 2^-53) - if (x_abs <= 0x3ca71547652b82fd) { - // 2^(x) ~ 1 + x/2 - return fputil::multiply_add(x, 0.5, 1.0); - } - - // x <= -1022 || x >= 1024 or inf/nan. - if (x_u > 0xc08ff00000000000) { - // x <= -1075 or -inf/nan - if (x_u >= 0xc090cc0000000000) { - // exp(-Inf) = 0 - if (xbits.is_inf()) - return 0.0; - - // exp(nan) = nan - if (xbits.is_nan()) - return x; - - if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_subnormal().get_val(); - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_UNDERFLOW); - return 0.0; - } - - return exp2_denorm(x); - } - - // x >= 1024 or +inf/nan - // x is finite - if (x_u < 0x7ff0'0000'0000'0000ULL) { - int rounding = fputil::quick_get_round(); - if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal().get_val(); - - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW); - } - // x is +inf or nan - return x + FPBits::inf().get_val(); -} - -} // namespace - -LLVM_LIBC_FUNCTION(double, exp2, (double x)) { - using FPBits = typename fputil::FPBits<double>; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - - // x < -1022 or x >= 1024 or log2(1 - 2^-54) < x < log2(1 + 2^-53). - if (LIBC_UNLIKELY(x_u > 0xc08ff00000000000 || - (x_u <= 0xbc971547652b82fe && x_u >= 0x4090000000000000) || - x_u <= 0x3ca71547652b82fd)) { - return set_exceptional(x); - } - - // Now -1075 < x <= log2(1 - 2^-54) or log2(1 + 2^-53) < x < 1024 - - // Range reduction: - // Let x = (hi + mid1 + mid2) + lo - // in which: - // hi is an integer - // mid1 * 2^6 is an integer - // mid2 * 2^12 is an integer - // then: - // 2^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 2^(lo). - // With this formula: - // - multiplying by 2^hi is exact and cheap, simply by adding the exponent - // field. - // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables. - // - 2^(lo) ~ 1 + a0*lo + a1 * lo^2 + ... - // - // We compute (hi + mid1 + mid2) together by perform the rounding on x * 2^12. - // Since |x| < |-1075)| < 2^11, - // |x * 2^12| < 2^11 * 2^12 < 2^23, - // So we can fit the rounded result round(x * 2^12) in int32_t. - // Thus, the goal is to be able to use an additional addition and fixed width - // shift to get an int32_t representing round(x * 2^12). 
-  //
-  // Assuming int32_t uses two's-complement representation, since the mantissa
-  // part of a double-precision value is unsigned with the leading bit hidden,
-  // if we add an extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 25 to the
-  // product, the part that is < 2^e2 in the resulting mantissa of (x*2^12 + C)
-  // can be considered a proper two's-complement representation of x*2^12.
-  //
-  // One small problem with this approach is that the sum (x*2^12 + C) in
-  // double precision is rounded to the least significant bit of the dominant
-  // factor C. In order to minimize the rounding errors from this addition, we
-  // want to minimize e1. Another constraint that we want is that after
-  // shifting the mantissa so that the least significant bit of int32_t
-  // corresponds to the unit bit of (x*2^12), the sign is correct without
-  // any adjustment. So combining these 2 requirements, we can choose
-  // C = 2^33 + 2^32, so that the sign bit corresponds to the 2^31 bit, and
-  // hence after right shifting the mantissa, the resulting int32_t has the
-  // correct sign.
-  // With this choice of C, the number of mantissa bits we need to shift to the
-  // right is: 52 - 33 = 19.
-  //
-  // Moreover, since the integer right shifts are equivalent to rounding down,
-  // we can add an extra 0.5 so that it will become round-to-nearest, tie-to-
-  // +infinity. So in particular, we can compute:
-  //   hmm = x * 2^12 + C,
-  // where C = 2^33 + 2^32 + 2^-1, then if
-  //   k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19),
-  // the reduced argument:
-  //   lo = x - 2^-12 * k is bounded by:
-  //   |lo| <= 2^-13 + 2^-12*2^-19
-  //        =  2^-13 + 2^-31.
-  //
-  // Finally, notice that k only uses the mantissa of x * 2^12, so the
-  // exponent 2^12 is not needed. So we can simply define
-  //   C = 2^(33 - 12) + 2^(32 - 12) + 2^(-1 - 12), and
-  //   k = int32_t(lower 51 bits of double(x + C) >> 19).
-
-  // Rounding errors <= 2^-31.
-  int k =
-      static_cast<int>(cpp::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19);
-  double kd = static_cast<double>(k);
-
-  uint32_t idx1 = (k >> 6) & 0x3f;
-  uint32_t idx2 = k & 0x3f;
-
-  int hi = k >> 12;
-
-  DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi};
-  DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi};
-  DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2);
-
-  // |dx| < 2^-13 + 2^-30.
-  double dx = fputil::multiply_add(kd, -0x1.0p-12, x); // exact
-
-  // We use the degree-4 polynomial to approximate 2^(lo):
-  //   2^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4 = 1 + lo * P(lo)
-  // so that the errors are bounded by:
-  //   |P(lo) - (2^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58
-  // Let P_ be an evaluation of P where all intermediate computations are in
-  // double precision. Using either Horner's or Estrin's scheme, the evaluation
-  // errors can be bounded by:
-  //   |P_(lo) - P(lo)| < 2^-51
-  //   => |lo * P_(lo) - (2^lo - 1)| < 2^-64
-  //   => 2^(mid1 + mid2) * |lo * P_(lo) - (2^lo - 1)| < 2^-63.
-  // Since we approximate
-  //   2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo,
-  // we use the expression:
-  //   (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~
-  //   ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo)
-  // with errors bounded by 2^-63.
-
-  double mid_lo = dx * exp_mid.hi;
-
-  // Approximate (2^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4.
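The magic-constant rounding deleted above can be reproduced standalone. A minimal sketch, assuming C++20 (std::bit_cast stands in for cpp::bit_cast) and |x| < 1075 so that x + C stays in [2^21, 2^22); the helper name is hypothetical:

    #include <bit>
    #include <cstdint>

    // round(x * 2^12), with ties rounded toward +infinity.
    // C = 2^21 + 2^20 + 2^-13 pins the sum's exponent, so the mantissa LSB is
    // worth 2^-31 and the unit bit of x * 2^12 (worth 2^-12) sits 19 bits
    // above it; the low 32 bits of the shifted encoding are then a
    // two's-complement representation of round(x * 2^12).
    int round_times_4096(double x) {
      return static_cast<int>(
          std::bit_cast<uint64_t>(x + 0x1.8000'0000'4p21) >> 19);
    }

The deleted fast path then resumes with the polynomial evaluation on the reduced argument: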
- double p = poly_approx_d(dx); - - double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. - int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; - double r = - cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(exp_mid.hi + lo)); - return r; -#else - double upper = exp_mid.hi + (lo + ERR_D); - double lower = exp_mid.hi + (lo - ERR_D); - - if (LIBC_LIKELY(upper == lower)) { - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. - int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; - double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper)); - return r; - } - - // Use double-double - DoubleDouble r_dd = exp2_double_double(dx, exp_mid); - - double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); - double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); - - if (LIBC_LIKELY(upper_dd == lower_dd)) { - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. - int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN; - double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper_dd)); - return r; - } - - // Use 128-bit precision - Float128 r_f128 = exp2_f128(dx, hi, idx1, idx2); - - return static_cast<double>(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} +LLVM_LIBC_FUNCTION(double, exp2, (double x)) { return math::exp2(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp index c360554..a3d0c1a 100644 --- a/libc/src/math/generic/expm1.cpp +++ b/libc/src/math/generic/expm1.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/expm1.h" -#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. #include "src/__support/CPP/bit.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" @@ -22,6 +21,8 @@ #include "src/__support/integer_literals.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2. +#include "src/__support/math/exp_constants.h" #if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) #define LIBC_MATH_EXPM1_SKIP_ACCURATE_PASS @@ -59,6 +60,8 @@ constexpr double MLOG_2_EXP2_M12_LO = 0x1.b0e2633fe0685p-79; namespace { +using namespace common_constants_internal; + // Polynomial approximations with double precision: // Return expm1(dx) / x ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24. // For |dx| < 2^-13 + 2^-30: diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp index b2967e2..72c8aa3 100644 --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/expm1f.h" -#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. #include "src/__support/FPUtil/BasicOperations.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FMA.h" @@ -20,10 +19,12 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2. 
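Returning briefly to the exp2 fast path deleted above: its "add hi to the exponent field" step is, in isolation, the following. A sketch assuming C++20 and that both the input and the scaled result are normal doubles (the earlier exceptional-case filtering ensures this before the fast path runs); the helper name is hypothetical:

    #include <bit>
    #include <cstdint>

    // Multiply a normal double by 2^hi by adding hi directly to the biased
    // exponent field (bits 52..62 of the IEEE-754 encoding); 52 is the
    // double-precision fraction length.
    double scale_by_exp2(double v, int hi) {
      int64_t bits =
          std::bit_cast<int64_t>(v) + (static_cast<int64_t>(hi) << 52);
      return std::bit_cast<double>(bits);
    }

The expm1f changes continue below.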
namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { + using namespace common_constants_internal; using FPBits = typename fputil::FPBits<float>; FPBits xbits(x); diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp index 0cd4424..66ce059 100644 --- a/libc/src/math/generic/log.cpp +++ b/libc/src/math/generic/log.cpp @@ -18,8 +18,8 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "common_constants.h" #include "log_range_reduction.h" +#include "src/__support/math/common_constants.h" namespace LIBC_NAMESPACE_DECL { @@ -30,6 +30,8 @@ using LIBC_NAMESPACE::operator""_u128; namespace { +using namespace common_constants_internal; + #ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS // A simple upper bound for the error of e_x * log(2) - log(r). constexpr double HI_ERR = 0x1.0p-85; diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp index 1c4e559..95f24fa 100644 --- a/libc/src/math/generic/log10.cpp +++ b/libc/src/math/generic/log10.cpp @@ -18,8 +18,8 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "common_constants.h" #include "log_range_reduction.h" +#include "src/__support/math/common_constants.h" namespace LIBC_NAMESPACE_DECL { @@ -30,6 +30,8 @@ using LIBC_NAMESPACE::operator""_u128; namespace { +using namespace common_constants_internal; + constexpr fputil::DoubleDouble LOG10_E = {0x1.95355baaafad3p-57, 0x1.bcb7b1526e50ep-2}; @@ -739,6 +741,7 @@ double log10_accurate(int e_x, int index, double m_x) { } // namespace LLVM_LIBC_FUNCTION(double, log10, (double x)) { + using namespace common_constants_internal; using FPBits_t = typename fputil::FPBits<double>; FPBits_t xbits(x); diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp index 81e7cdb..6b9cc5d 100644 --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/log10f.h" -#include "common_constants.h" // Lookup table for (1/f) #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FMA.h" #include "src/__support/FPUtil/FPBits.h" @@ -18,6 +17,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/common_constants.h" // Lookup table for (1/f) // This is an algorithm for log10(x) in single precision which is // correctly rounded for all rounding modes, based on the implementation of @@ -104,6 +104,7 @@ static constexpr double LOG10_R[128] = { 0x1.30cb3a7bb3625p-2, 0x1.34413509f79ffp-2}; LLVM_LIBC_FUNCTION(float, log10f, (float x)) { + using namespace common_constants_internal; constexpr double LOG10_2 = 0x1.34413509f79ffp-2; using FPBits = typename fputil::FPBits<float>; diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp index 09f465a..1595981 100644 --- a/libc/src/math/generic/log1p.cpp +++ b/libc/src/math/generic/log1p.cpp @@ -18,7 +18,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "common_constants.h" +#include "src/__support/math/common_constants.h" namespace LIBC_NAMESPACE_DECL { @@ -29,6 +29,8 @@ using LIBC_NAMESPACE::operator""_u128; namespace { +using namespace common_constants_internal; + // R1[i] = 2^-8 * nearestint( 
2^8 / (1 + i * 2^-7) ) constexpr double R1[129] = { 0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.eap-1, diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp index 16b1b34..f0289c2 100644 --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/log1pf.h" -#include "common_constants.h" // Lookup table for (1/f) and log(f) #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FMA.h" #include "src/__support/FPUtil/FPBits.h" @@ -18,6 +17,8 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/acosh_float_constants.h" +#include "src/__support/math/common_constants.h" // Lookup table for (1/f) and log(f) // This is an algorithm for log10(x) in single precision which is // correctly rounded for all rounding modes. @@ -38,6 +39,7 @@ namespace internal { // We don't need to treat denormal and 0 LIBC_INLINE float log(double x) { using namespace acoshf_internal; + using namespace common_constants_internal; constexpr double LOG_2 = 0x1.62e42fefa39efp-1; using FPBits = typename fputil::FPBits<double>; diff --git a/libc/src/math/generic/log2.cpp b/libc/src/math/generic/log2.cpp index 27ca2fc..f0c0ae3 100644 --- a/libc/src/math/generic/log2.cpp +++ b/libc/src/math/generic/log2.cpp @@ -18,8 +18,8 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "common_constants.h" #include "log_range_reduction.h" +#include "src/__support/math/common_constants.h" namespace LIBC_NAMESPACE_DECL { @@ -30,6 +30,8 @@ using LIBC_NAMESPACE::operator""_u128; namespace { +using namespace common_constants_internal; + constexpr fputil::DoubleDouble LOG2_E = {0x1.777d0ffda0d24p-56, 0x1.71547652b82fep0}; @@ -859,6 +861,7 @@ double log2_accurate(int e_x, int index, double m_x) { } // namespace LLVM_LIBC_FUNCTION(double, log2, (double x)) { + using namespace common_constants_internal; using FPBits_t = typename fputil::FPBits<double>; FPBits_t xbits(x); diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp index cff718e..7353f03 100644 --- a/libc/src/math/generic/log2f.cpp +++ b/libc/src/math/generic/log2f.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/log2f.h" -#include "common_constants.h" // Lookup table for (1/f) #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" @@ -15,7 +14,8 @@ #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/common_constants.h" // Lookup table for (1/f) // This is a correctly-rounded algorithm for log2(x) in single precision with // round-to-nearest, tie-to-even mode from the RLIBM project at: @@ -55,6 +55,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, log2f, (float x)) { + using namespace common_constants_internal; using FPBits = typename fputil::FPBits<float>; FPBits xbits(x); diff --git a/libc/src/math/generic/log_range_reduction.h b/libc/src/math/generic/log_range_reduction.h index 
8c94230..7484506 100644 --- a/libc/src/math/generic/log_range_reduction.h +++ b/libc/src/math/generic/log_range_reduction.h @@ -9,9 +9,9 @@ #ifndef LLVM_LIBC_SRC_MATH_GENERIC_LOG_RANGE_REDUCTION_H #define LLVM_LIBC_SRC_MATH_GENERIC_LOG_RANGE_REDUCTION_H -#include "common_constants.h" #include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/macros/config.h" +#include "src/__support/math/common_constants.h" #include "src/__support/uint128.h" namespace LIBC_NAMESPACE_DECL { @@ -36,6 +36,7 @@ struct LogRR { LIBC_INLINE fputil::DyadicFloat<128> log_range_reduction(double m_x, const LogRR &log_table, fputil::DyadicFloat<128> &sum) { + using namespace common_constants_internal; using Float128 = typename fputil::DyadicFloat<128>; using MType = typename Float128::MantissaType; diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp index e8d2ba2..4d2947d 100644 --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/logf.h" -#include "common_constants.h" // Lookup table for (1/f) and log(f) #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" @@ -17,6 +16,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/common_constants.h" // Lookup table for (1/f) and log(f) // This is an algorithm for log(x) in single precision which is correctly // rounded for all rounding modes, based on the implementation of log(x) from @@ -53,6 +53,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, logf, (float x)) { + using namespace common_constants_internal; constexpr double LOG_2 = 0x1.62e42fefa39efp-1; using FPBits = typename fputil::FPBits<float>; diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp index 43e99a7..c9f685b 100644 --- a/libc/src/math/generic/pow.cpp +++ b/libc/src/math/generic/pow.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/pow.h" -#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "src/__support/CPP/bit.h" @@ -21,6 +20,8 @@ #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2. +#include "src/__support/math/exp_constants.h" // Lookup tables EXP_M1 and EXP_M2. namespace LIBC_NAMESPACE_DECL { @@ -28,6 +29,8 @@ using fputil::DoubleDouble; namespace { +using namespace common_constants_internal; + // Constants for log2(x) range reduction, generated by Sollya with: // > for i from 0 to 127 do { // r = 2^-8 * ceil( 2^8 * (1 - 2^(-8)) / (1 + i*2^-7) ); diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp index a45ef51..12246e9 100644 --- a/libc/src/math/generic/powf.cpp +++ b/libc/src/math/generic/powf.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/powf.h" -#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. 
#include "src/__support/CPP/bit.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" @@ -15,10 +14,13 @@ #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/FPUtil/sqrt.h" // Speedup for powf(x, 1/2) = sqrtf(x) +#include "src/__support/FPUtil/triple_double.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2. #include "src/__support/math/exp10f.h" // Speedup for powf(10, y) = exp10f(y) +#include "src/__support/math/exp_constants.h" #include "exp2f_impl.h" // Speedup for powf(2, y) = exp2f(y) @@ -29,6 +31,8 @@ using fputil::TripleDouble; namespace { +using namespace common_constants_internal; + #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS alignas(16) constexpr DoubleDouble LOG2_R_DD[128] = { {0.0, 0.0}, diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt index ea4634c..040f635 100644 --- a/libc/test/shared/CMakeLists.txt +++ b/libc/test/shared/CMakeLists.txt @@ -40,6 +40,7 @@ add_fp_unittest( libc.src.__support.math.exp10m1f16 libc.src.__support.math.erff libc.src.__support.math.exp + libc.src.__support.math.exp2 libc.src.__support.math.exp10 libc.src.__support.math.exp10f libc.src.__support.math.exp10f16 diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp index 1722193..ef2e7b8 100644 --- a/libc/test/shared/shared_math_test.cpp +++ b/libc/test/shared/shared_math_test.cpp @@ -80,6 +80,7 @@ TEST(LlvmLibcSharedMathTest, AllDouble) { EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::cos(0.0)); EXPECT_FP_EQ(0x0p+0, LIBC_NAMESPACE::shared::dsqrtl(0.0)); EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp(0.0)); + EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp2(0.0)); EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp10(0.0)); } diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index eae693b1..f86c736 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -567,8 +567,11 @@ branch. Just make sure to add the release milestone to the pull request. Getting admin access to CI infrastructure ========================================= -Any individual who is responsible for setting up and/or maintaining CI infrastructure for a LLVM project can -request to be granted the CI/CD role to the LLVM organization admins. The request can be made by creating -`a Github issue <https://github.com/llvm/llvm-project/issues/new>`_ and using the ``infrastructure`` label. -Applicants must include a justification for why the role is being requested. Applications are reviewed on a -case-by-case basis by the LLVM admins and the role can be revoked at any point as the LLVM admins see fit. +Any individual who is responsible for setting up and/or maintaining CI +infrastructure for a LLVM project can request to be granted the CI/CD role by +the LLVM infrastructure area team. The request can be made by creating `a +Github issue <https://github.com/llvm/llvm-project/issues/new>`_ and using the +``infrastructure`` label. Applicants must include a justification for why the +role is being requested. Applications are reviewed on a case-by-case basis by +the LLVM infrastructure area team and the role can be revoked at any point as +the area team sees fit. 
diff --git a/llvm/include/llvm/ADT/TypeSwitch.h b/llvm/include/llvm/ADT/TypeSwitch.h index 5bbbdf2..5657303 100644 --- a/llvm/include/llvm/ADT/TypeSwitch.h +++ b/llvm/include/llvm/ADT/TypeSwitch.h @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include <optional> namespace llvm { @@ -117,11 +118,16 @@ public: return defaultResult; } - [[nodiscard]] operator ResultT() { - assert(result && "Fell off the end of a type-switch"); - return std::move(*result); + /// Declare default as unreachable, making sure that all cases were handled. + [[nodiscard]] ResultT DefaultUnreachable( + const char *message = "Fell off the end of a type-switch") { + if (result) + return std::move(*result); + llvm_unreachable(message); } + [[nodiscard]] operator ResultT() { return DefaultUnreachable(); } + private: /// The pointer to the result of this switch statement, once known, /// null before that. @@ -158,6 +164,13 @@ public: defaultFn(this->value); } + /// Declare default as unreachable, making sure that all cases were handled. + void DefaultUnreachable( + const char *message = "Fell off the end of a type-switch") { + if (!foundMatch) + llvm_unreachable(message); + } + private: /// A flag detailing if we have already found a match. bool foundMatch = false; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 7a4abe9..5d3b233 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1551,12 +1551,6 @@ public: OperandValueInfo OpdInfo = {OK_AnyValue, OP_None}, const Instruction *I = nullptr) const; - /// \return The cost of VP Load and Store instructions. - LLVM_ABI InstructionCost getVPMemoryOpCost( - unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, - const Instruction *I = nullptr) const; - /// \return The cost of masked Load and Store instructions. LLVM_ABI InstructionCost getMaskedMemoryOpCost( unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 566e1cf..4cd607c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -841,14 +841,6 @@ public: return 1; } - virtual InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, - unsigned AddressSpace, - TTI::TargetCostKind CostKind, - const Instruction *I) const { - return 1; - } - virtual InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 6c12cd3..7a2e9ad 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -464,10 +464,10 @@ public: const char *getData64bitsDirective() const { return Data64bitsDirective; } bool supportsSignedData() const { return SupportsSignedData; } - /// Targets can implement this method to specify a section to switch to if the - /// translation unit doesn't have any trampolines that require an executable - /// stack. 
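The new TypeSwitch::DefaultUnreachable above turns the old implicit-conversion assert into an explicit, named claim of exhaustiveness. A hypothetical usage sketch; the two Constant cases stand in for whatever case list a real caller handles:

    #include "llvm/ADT/TypeSwitch.h"
    #include "llvm/IR/Constants.h"

    // 'V' is known by construction to be one of the two cases below;
    // DefaultUnreachable both documents and enforces that assumption.
    static int classify(llvm::Constant *V) {
      return llvm::TypeSwitch<llvm::Constant *, int>(V)
          .Case<llvm::ConstantInt>([](llvm::ConstantInt *) { return 0; })
          .Case<llvm::ConstantFP>([](llvm::ConstantFP *) { return 1; })
          .DefaultUnreachable("unexpected constant kind");
    }

Below, the removed getNonexecutableStackSection accessor follows its old documentation: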
- virtual MCSection *getNonexecutableStackSection(MCContext &Ctx) const { + /// Targets can implement this method to specify a section to switch to + /// depending on whether the translation unit has any trampolines that require + /// an executable stack. + virtual MCSection *getStackSection(MCContext &Ctx, bool Exec) const { return nullptr; } diff --git a/llvm/include/llvm/MC/MCAsmInfoELF.h b/llvm/include/llvm/MC/MCAsmInfoELF.h index e067888..095ee4d 100644 --- a/llvm/include/llvm/MC/MCAsmInfoELF.h +++ b/llvm/include/llvm/MC/MCAsmInfoELF.h @@ -15,7 +15,7 @@ namespace llvm { class MCAsmInfoELF : public MCAsmInfo { virtual void anchor(); - MCSection *getNonexecutableStackSection(MCContext &Ctx) const override; + MCSection *getStackSection(MCContext &Ctx, bool Exec) const override; void printSwitchToSection(const MCSection &, uint32_t, const Triple &, raw_ostream &) const final; bool useCodeAlign(const MCSection &Sec) const final; diff --git a/llvm/include/llvm/MC/TargetRegistry.h b/llvm/include/llvm/MC/TargetRegistry.h index 019ee60..570d4c0 100644 --- a/llvm/include/llvm/MC/TargetRegistry.h +++ b/llvm/include/llvm/MC/TargetRegistry.h @@ -389,6 +389,7 @@ public: /// @name Feature Constructors /// @{ + // TODO(boomanaiden154): Remove this function after LLVM 22 branches. [[deprecated("Use overload accepting Triple instead")]] MCAsmInfo *createMCAsmInfo(const MCRegisterInfo &MRI, StringRef TheTriple, const MCTargetOptions &Options) const { @@ -440,6 +441,7 @@ public: return MCInstrAnalysisCtorFn(Info); } + // TODO(boomanaiden154): Remove this function after LLVM 22 branches. [[deprecated("Use overload accepting Triple instead")]] MCRegisterInfo *createMCRegInfo(StringRef TT) const { if (!MCRegInfoCtorFn) @@ -454,6 +456,7 @@ public: return MCRegInfoCtorFn(TT); } + // TODO(boomanaiden154): Remove this function after LLVM 22 branches. [[deprecated("Use overload accepting Triple instead")]] MCSubtargetInfo *createMCSubtargetInfo(StringRef TheTriple, StringRef CPU, StringRef Features) const { @@ -496,16 +499,6 @@ public: JIT); } - [[deprecated("Use overload accepting Triple instead")]] - TargetMachine *createTargetMachine( - StringRef TT, StringRef CPU, StringRef Features, - const TargetOptions &Options, std::optional<Reloc::Model> RM, - std::optional<CodeModel::Model> CM = std::nullopt, - CodeGenOptLevel OL = CodeGenOptLevel::Default, bool JIT = false) const { - return createTargetMachine(Triple(TT), CPU, Features, Options, RM, CM, OL, - JIT); - } - /// createMCAsmBackend - Create a target specific assembly parser. MCAsmBackend *createMCAsmBackend(const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, @@ -599,6 +592,7 @@ public: return nullptr; } + // TODO(boomanaiden154): Remove this function after LLVM 22 branches. [[deprecated("Use overload accepting Triple instead")]] MCRelocationInfo *createMCRelocationInfo(StringRef TT, MCContext &Ctx) const { return createMCRelocationInfo(Triple(TT), Ctx); @@ -616,6 +610,7 @@ public: return Fn(TT, Ctx); } + // TODO(boomanaiden154): Remove this function after LLVM 22 branches. 
[[deprecated("Use overload accepting Triple instead")]] MCSymbolizer * createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo, diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 07f4a8e..0d978d4 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4164,6 +4164,10 @@ static Value *simplifyFCmpInst(CmpPredicate Pred, Value *LHS, Value *RHS, return ConstantInt::get(RetTy, Pred == CmpInst::FCMP_UNO); } + if (std::optional<bool> Res = + isImpliedByDomCondition(Pred, LHS, RHS, Q.CxtI, Q.DL)) + return ConstantInt::getBool(RetTy, *Res); + const APFloat *C = nullptr; match(RHS, m_APFloatAllowPoison(C)); std::optional<KnownFPClass> FullKnownClassLHS; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 1eda7a7..a42c061 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/ConstantFPRange.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -9474,6 +9475,69 @@ isImpliedCondICmps(CmpPredicate LPred, const Value *L0, const Value *L1, return std::nullopt; } +/// Return true if LHS implies RHS (expanded to its components as "R0 RPred R1") +/// is true. Return false if LHS implies RHS is false. Otherwise, return +/// std::nullopt if we can't infer anything. +static std::optional<bool> +isImpliedCondFCmps(FCmpInst::Predicate LPred, const Value *L0, const Value *L1, + FCmpInst::Predicate RPred, const Value *R0, const Value *R1, + const DataLayout &DL, bool LHSIsTrue) { + // The rest of the logic assumes the LHS condition is true. If that's not the + // case, invert the predicate to make it so. + if (!LHSIsTrue) + LPred = FCmpInst::getInversePredicate(LPred); + + // We can have non-canonical operands, so try to normalize any common operand + // to L0/R0. + if (L0 == R1) { + std::swap(R0, R1); + RPred = FCmpInst::getSwappedPredicate(RPred); + } + if (R0 == L1) { + std::swap(L0, L1); + LPred = FCmpInst::getSwappedPredicate(LPred); + } + if (L1 == R1) { + // If we have L0 == R0 and L1 == R1, then make L1/R1 the constants. + if (L0 != R0 || match(L0, m_ImmConstant())) { + std::swap(L0, L1); + LPred = ICmpInst::getSwappedCmpPredicate(LPred); + std::swap(R0, R1); + RPred = ICmpInst::getSwappedCmpPredicate(RPred); + } + } + + // Can we infer anything when the two compares have matching operands? + if (L0 == R0 && L1 == R1) { + if ((LPred & RPred) == LPred) + return true; + if ((LPred & ~RPred) == LPred) + return false; + } + + // See if we can infer anything if operand-0 matches and we have at least one + // constant. + const APFloat *L1C, *R1C; + if (L0 == R0 && match(L1, m_APFloat(L1C)) && match(R1, m_APFloat(R1C))) { + if (std::optional<ConstantFPRange> DomCR = + ConstantFPRange::makeExactFCmpRegion(LPred, *L1C)) { + if (std::optional<ConstantFPRange> ImpliedCR = + ConstantFPRange::makeExactFCmpRegion(RPred, *R1C)) { + if (ImpliedCR->contains(*DomCR)) + return true; + } + if (std::optional<ConstantFPRange> ImpliedCR = + ConstantFPRange::makeExactFCmpRegion( + FCmpInst::getInversePredicate(RPred), *R1C)) { + if (ImpliedCR->contains(*DomCR)) + return false; + } + } + } + + return std::nullopt; +} + /// Return true if LHS implies RHS is true. Return false if LHS implies RHS is /// false. Otherwise, return std::nullopt if we can't infer anything. 
We
 /// expect the RHS to be an icmp and the LHS to be an 'and', 'or', or a 'select'
@@ -9529,15 +9593,24 @@ llvm::isImpliedCondition(const Value *LHS, CmpPredicate RHSPred,
     LHSIsTrue = !LHSIsTrue;
 
   // Both LHS and RHS are icmps.
-  if (const auto *LHSCmp = dyn_cast<ICmpInst>(LHS))
-    return isImpliedCondICmps(LHSCmp->getCmpPredicate(), LHSCmp->getOperand(0),
-                              LHSCmp->getOperand(1), RHSPred, RHSOp0, RHSOp1,
-                              DL, LHSIsTrue);
-  const Value *V;
-  if (match(LHS, m_NUWTrunc(m_Value(V))))
-    return isImpliedCondICmps(CmpInst::ICMP_NE, V,
-                              ConstantInt::get(V->getType(), 0), RHSPred,
-                              RHSOp0, RHSOp1, DL, LHSIsTrue);
+  if (RHSOp0->getType()->getScalarType()->isIntOrPtrTy()) {
+    if (const auto *LHSCmp = dyn_cast<ICmpInst>(LHS))
+      return isImpliedCondICmps(LHSCmp->getCmpPredicate(),
+                                LHSCmp->getOperand(0), LHSCmp->getOperand(1),
+                                RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue);
+    const Value *V;
+    if (match(LHS, m_NUWTrunc(m_Value(V))))
+      return isImpliedCondICmps(CmpInst::ICMP_NE, V,
+                                ConstantInt::get(V->getType(), 0), RHSPred,
+                                RHSOp0, RHSOp1, DL, LHSIsTrue);
+  } else {
+    assert(RHSOp0->getType()->isFPOrFPVectorTy() &&
+           "Expected floating point type only!");
+    if (const auto *LHSCmp = dyn_cast<FCmpInst>(LHS))
+      return isImpliedCondFCmps(LHSCmp->getPredicate(), LHSCmp->getOperand(0),
+                                LHSCmp->getOperand(1), RHSPred, RHSOp0, RHSOp1,
+                                DL, LHSIsTrue);
+  }
 
   /// The LHS should be an 'or', 'and', or a 'select' instruction. We expect
   /// the RHS to be an icmp.
@@ -9574,6 +9647,13 @@ std::optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
       return InvertRHS ? !*Implied : *Implied;
     return std::nullopt;
   }
+  if (const FCmpInst *RHSCmp = dyn_cast<FCmpInst>(RHS)) {
+    if (auto Implied = isImpliedCondition(
+            LHS, RHSCmp->getPredicate(), RHSCmp->getOperand(0),
+            RHSCmp->getOperand(1), DL, LHSIsTrue, Depth))
+      return InvertRHS ? !*Implied : *Implied;
+    return std::nullopt;
+  }
 
   const Value *V;
   if (match(RHS, m_NUWTrunc(m_Value(V)))) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 11efe49..10df9c1 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2866,9 +2866,11 @@ bool AsmPrinter::doFinalization(Module &M) {
   // If we don't have any trampolines, then we don't require stack memory
   // to be executable. Some targets have a directive to declare this.
   Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
-  if (!InitTrampolineIntrinsic || InitTrampolineIntrinsic->use_empty())
-    if (MCSection *S = MAI->getNonexecutableStackSection(OutContext))
-      OutStreamer->switchSection(S);
+  bool HasTrampolineUses =
+      InitTrampolineIntrinsic && !InitTrampolineIntrinsic->use_empty();
+  MCSection *S = MAI->getStackSection(OutContext, /*Exec=*/HasTrampolineUses);
+  if (S)
+    OutStreamer->switchSection(S);
 
   if (TM.Options.EmitAddrsig) {
     // Emit address-significance attributes for all globals.
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 2f3b7a2..3c41bbe 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1657,12 +1657,6 @@ bool TargetInstrInfo::isReMaterializableImpl(
     // same virtual register, though.
     if (MO.isDef() && Reg != DefReg)
       return false;
-
-    // Don't allow any virtual-register uses. Rematting an instruction with
-    // virtual register uses would lengthen the live ranges of the uses, which
-    // is not necessarily a good idea, certainly not "trivial".
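The ConstantFPRange containment test used by isImpliedCondFCmps above can be exercised directly. A sketch using only calls that appear in the hunk; for instance, a dominating x > 5.0 implies that x > 3.0 is true:

    #include "llvm/ADT/APFloat.h"
    #include "llvm/IR/ConstantFPRange.h"
    #include "llvm/IR/Instructions.h"
    #include <optional>

    static bool impliesTrueExample() {
      using namespace llvm;
      // Exact fcmp regions: (5.0, +inf] for the dominating condition and
      // (3.0, +inf] for the implied one; containment proves the implication,
      // provided both exact regions exist.
      std::optional<ConstantFPRange> Dom =
          ConstantFPRange::makeExactFCmpRegion(FCmpInst::FCMP_OGT, APFloat(5.0));
      std::optional<ConstantFPRange> Imp =
          ConstantFPRange::makeExactFCmpRegion(FCmpInst::FCMP_OGT, APFloat(3.0));
      return Dom && Imp && Imp->contains(*Dom);
    }

The removed TargetInstrInfo lines resume below.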
- if (MO.isUse()) - return false; } // Everything checked out. diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 8b5965b..df0c85b 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -2994,6 +2994,8 @@ LLVMValueRef LLVMIsATerminatorInst(LLVMValueRef Inst) { LLVMDbgRecordRef LLVMGetFirstDbgRecord(LLVMValueRef Inst) { Instruction *Instr = unwrap<Instruction>(Inst); + if (!Instr->DebugMarker) + return nullptr; auto I = Instr->DebugMarker->StoredDbgRecords.begin(); if (I == Instr->DebugMarker->StoredDbgRecords.end()) return nullptr; @@ -3002,6 +3004,8 @@ LLVMDbgRecordRef LLVMGetFirstDbgRecord(LLVMValueRef Inst) { LLVMDbgRecordRef LLVMGetLastDbgRecord(LLVMValueRef Inst) { Instruction *Instr = unwrap<Instruction>(Inst); + if (!Instr->DebugMarker) + return nullptr; auto I = Instr->DebugMarker->StoredDbgRecords.rbegin(); if (I == Instr->DebugMarker->StoredDbgRecords.rend()) return nullptr; diff --git a/llvm/lib/MC/MCAsmInfoELF.cpp b/llvm/lib/MC/MCAsmInfoELF.cpp index cdae9d7..98090d3 100644 --- a/llvm/lib/MC/MCAsmInfoELF.cpp +++ b/llvm/lib/MC/MCAsmInfoELF.cpp @@ -27,12 +27,13 @@ using namespace llvm; void MCAsmInfoELF::anchor() {} -MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const { +MCSection *MCAsmInfoELF::getStackSection(MCContext &Ctx, bool Exec) const { // Solaris doesn't know/doesn't care about .note.GNU-stack sections, so // don't emit them. if (Ctx.getTargetTriple().isOSSolaris()) return nullptr; - return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0); + return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, + Exec ? ELF::SHF_EXECINSTR : 0U); } bool MCAsmInfoELF::useCodeAlign(const MCSection &Sec) const { diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index 2881d7c..1bc1b92 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -54,7 +54,7 @@ void MCELFStreamer::initSections(bool NoExecStack, const MCSubtargetInfo &STI) { &STI); if (NoExecStack) - switchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx)); + switchSection(Ctx.getAsmInfo()->getStackSection(Ctx, /*Exec=*/false)); } void MCELFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) { diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td index 54e27a6..f4326de 100644 --- a/llvm/lib/Target/ARM/ARMInstrCDE.td +++ b/llvm/lib/Target/ARM/ARMInstrCDE.td @@ -268,6 +268,7 @@ class CDE_Vec_Instr<bit acc, dag oops, dag iops, string asm, string cstr, !con(iops, (ins vpred:$vp)), asm, !strconcat(cstr, vpred.vpred_constraint)>, CDE_RequiresQReg { + bits<0> vp; } diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 9dffd94..e244134 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -409,6 +409,7 @@ class MVE_p<dag oops, dag iops, InstrItinClass itin, string iname, !strconcat(iname, "${vp}", !if(!eq(suffix, ""), "", !strconcat(".", suffix))), ops, !strconcat(cstr, vpred.vpred_constraint), vecsize, pattern> { + bits<0> vp; let Inst{31-29} = 0b111; let Inst{27-26} = 0b11; } diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index d358913..e67db8e 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -639,6 +639,43 @@ static DecodeStatus DecodeCCOutOperand(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeVpredNOperand(MCInst &Inst, + const 
MCDisassembler *Decoder) { + const auto *D = static_cast<const ARMDisassembler *>(Decoder); + unsigned VCC = D->VPTBlock.getVPTPred(); + MCRegister CondReg = VCC == ARMVCC::None ? ARM::NoRegister : ARM::P0; + + Inst.addOperand(MCOperand::createImm(VCC)); // $cond + Inst.addOperand(MCOperand::createReg(CondReg)); // $cond_reg + Inst.addOperand(MCOperand::createReg(ARM::NoRegister)); // $tp_reg + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeVpredROperand(MCInst &Inst, + const MCDisassembler *Decoder) { + const auto *D = static_cast<const ARMDisassembler *>(Decoder); + unsigned VCC = D->VPTBlock.getVPTPred(); + MCRegister CondReg = VCC == ARMVCC::None ? ARM::NoRegister : ARM::P0; + + Inst.addOperand(MCOperand::createImm(VCC)); // $cond + Inst.addOperand(MCOperand::createReg(CondReg)); // $cond_reg + Inst.addOperand(MCOperand::createReg(ARM::NoRegister)); // $tp_reg + + // The last sub-operand ($inactive) is tied to an output operand. + // The output operand has already been decoded, so just copy it. + const MCInstrDesc &MCID = D->MCII->get(Inst.getOpcode()); + unsigned InactiveOpIdx = Inst.getNumOperands(); + int TiedOpIdx = MCID.getOperandConstraint(InactiveOpIdx, MCOI::TIED_TO); + assert(TiedOpIdx >= 0 && + "Inactive register in vpred_r is not tied to an output!"); + + // Make a copy of the operand to ensure it is not invalidated when MI grows. + Inst.addOperand(MCOperand(Inst.getOperand(TiedOpIdx))); // $inactive + + return MCDisassembler::Success; +} + static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -2777,6 +2814,7 @@ static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst, unsigned Insn, Inst.addOperand(MCOperand::createImm(imm)); + Check(S, DecodeVpredROperand(Inst, Decoder)); return S; } @@ -2802,6 +2840,7 @@ static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn, if (!fieldFromInstruction(Insn, 12, 1)) // I bit clear => need input FPSCR Inst.addOperand(MCOperand::createReg(ARM::FPSCR_NZCV)); + Check(S, DecodeVpredROperand(Inst, Decoder)); return S; } @@ -5466,30 +5505,6 @@ static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - // The vpred_r operand type includes an MQPR register field derived - // from the encoding. But we don't actually want to add an operand - // to the MCInst at this stage, because AddThumbPredicate will do it - // later, and will infer the register number from the TIED_TO - // constraint. So this is a deliberately empty decoder method that - // will inhibit the auto-generated disassembly code from adding an - // operand at all. - return MCDisassembler::Success; -} - -[[maybe_unused]] static DecodeStatus -DecodeVpredNOperand(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder) { - // Similar to above, we want to ensure that no operands are added for the - // vpred operands. (This is marked "maybe_unused" for the moment; because - // DecoderEmitter currently (wrongly) omits operands with no instruction bits, - // the decoder doesn't actually call it yet. That will be addressed in a - // future change.) 
- return MCDisassembler::Success; -} - static DecodeStatus DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -5668,6 +5683,7 @@ DecodeMVE_MEM_pre(MCInst &Inst, unsigned Val, uint64_t Address, if (!Check(S, AddrDecoder(Inst, addr, Address, Decoder))) return MCDisassembler::Fail; + Check(S, DecodeVpredNOperand(Inst, Decoder)); return S; } @@ -5871,7 +5887,7 @@ static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; if (!Check(S, DecodeVCVTImmOperand(Inst, imm6, Address, Decoder))) return MCDisassembler::Fail; - + Check(S, DecodeVpredROperand(Inst, Decoder)); return S; } @@ -5906,6 +5922,7 @@ static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address, if (!Check(S, predicate_decoder(Inst, fc, Address, Decoder))) return MCDisassembler::Fail; + Check(S, DecodeVpredNOperand(Inst, Decoder)); return S; } @@ -5916,6 +5933,7 @@ static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, unsigned Rn = fieldFromInstruction(Insn, 16, 4); if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder))) return MCDisassembler::Fail; + Check(S, DecodeVpredNOperand(Inst, Decoder)); return S; } @@ -5925,6 +5943,7 @@ static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; Inst.addOperand(MCOperand::createReg(ARM::VPR)); Inst.addOperand(MCOperand::createReg(ARM::VPR)); + Check(S, DecodeVpredNOperand(Inst, Decoder)); return S; } @@ -6199,15 +6218,13 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const { (isVectorPredicable(MI) && ITBlock.instrInITBlock())) S = SoftFail; - // If we're in an IT/VPT block, base the predicate on that. Otherwise, + // If we're in an IT block, base the predicate on that. Otherwise, // assume a predicate of AL. unsigned CC = ARMCC::AL; - unsigned VCC = ARMVCC::None; if (ITBlock.instrInITBlock()) { CC = ITBlock.getITCC(); ITBlock.advanceITState(); } else if (VPTBlock.instrInVPTBlock()) { - VCC = VPTBlock.getVPTPred(); VPTBlock.advanceVPTState(); } @@ -6230,34 +6247,6 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const { Check(S, SoftFail); } - MCInst::iterator VCCI = MI.begin(); - unsigned VCCPos; - for (VCCPos = 0; VCCPos < MCID.NumOperands; ++VCCPos, ++VCCI) { - if (ARM::isVpred(MCID.operands()[VCCPos].OperandType) || VCCI == MI.end()) - break; - } - - if (isVectorPredicable(MI)) { - VCCI = MI.insert(VCCI, MCOperand::createImm(VCC)); - ++VCCI; - if (VCC == ARMVCC::None) - VCCI = MI.insert(VCCI, MCOperand::createReg(0)); - else - VCCI = MI.insert(VCCI, MCOperand::createReg(ARM::P0)); - ++VCCI; - VCCI = MI.insert(VCCI, MCOperand::createReg(0)); - ++VCCI; - if (MCID.operands()[VCCPos].OperandType == ARM::OPERAND_VPRED_R) { - int TiedOp = MCID.getOperandConstraint(VCCPos + 3, MCOI::TIED_TO); - assert(TiedOp >= 0 && - "Inactive register in vpred_r is not tied to an output!"); - // Copy the operand to ensure it's not invalidated when MI grows. 
- MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp))); - } - } else if (VCC != ARMVCC::None) { - Check(S, SoftFail); - } - return S; } diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index dfd896f..8d8066a 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -49,7 +49,7 @@ public: DwarfUsesRelocationsAcrossSections = enable; } - MCSection *getNonexecutableStackSection(MCContext &Ctx) const override { + MCSection *getStackSection(MCContext &Ctx, bool Exec) const override { return nullptr; } }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 3df448d..8f60e50 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/OverflowInstAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" @@ -42,6 +43,7 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include <cassert> +#include <optional> #include <utility> #define DEBUG_TYPE "instcombine" @@ -1451,10 +1453,16 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, return nullptr; }; - if (Instruction *R = ReplaceOldOpWithNewOp(CmpLHS, CmpRHS)) - return R; - if (Instruction *R = ReplaceOldOpWithNewOp(CmpRHS, CmpLHS)) - return R; + bool CanReplaceCmpLHSWithRHS = canReplacePointersIfEqual(CmpLHS, CmpRHS, DL); + if (CanReplaceCmpLHSWithRHS) { + if (Instruction *R = ReplaceOldOpWithNewOp(CmpLHS, CmpRHS)) + return R; + } + bool CanReplaceCmpRHSWithLHS = canReplacePointersIfEqual(CmpRHS, CmpLHS, DL); + if (CanReplaceCmpRHSWithLHS) { + if (Instruction *R = ReplaceOldOpWithNewOp(CmpRHS, CmpLHS)) + return R; + } auto *FalseInst = dyn_cast<Instruction>(FalseVal); if (!FalseInst) @@ -1469,12 +1477,14 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, // Example: // (X == 42) ? 43 : (X + 1) --> (X == 42) ? 
(X + 1) : (X + 1) --> X + 1 SmallVector<Instruction *> DropFlags; - if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, SQ, - /* AllowRefinement */ false, - &DropFlags) == TrueVal || - simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, SQ, - /* AllowRefinement */ false, - &DropFlags) == TrueVal) { + if ((CanReplaceCmpLHSWithRHS && + simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, SQ, + /* AllowRefinement */ false, + &DropFlags) == TrueVal) || + (CanReplaceCmpRHSWithLHS && + simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, SQ, + /* AllowRefinement */ false, + &DropFlags) == TrueVal)) { for (Instruction *I : DropFlags) { I->dropPoisonGeneratingAnnotations(); Worklist.add(I); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll index ed68723..41f7ab8 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll @@ -1219,14 +1219,14 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; ; GISEL-LABEL: test_shl_i1024: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: sub sp, sp, #416 -; GISEL-NEXT: stp x28, x27, [sp, #320] ; 16-byte Folded Spill -; GISEL-NEXT: stp x26, x25, [sp, #336] ; 16-byte Folded Spill -; GISEL-NEXT: stp x24, x23, [sp, #352] ; 16-byte Folded Spill -; GISEL-NEXT: stp x22, x21, [sp, #368] ; 16-byte Folded Spill -; GISEL-NEXT: stp x20, x19, [sp, #384] ; 16-byte Folded Spill -; GISEL-NEXT: stp x29, x30, [sp, #400] ; 16-byte Folded Spill -; GISEL-NEXT: .cfi_def_cfa_offset 416 +; GISEL-NEXT: sub sp, sp, #432 +; GISEL-NEXT: stp x28, x27, [sp, #336] ; 16-byte Folded Spill +; GISEL-NEXT: stp x26, x25, [sp, #352] ; 16-byte Folded Spill +; GISEL-NEXT: stp x24, x23, [sp, #368] ; 16-byte Folded Spill +; GISEL-NEXT: stp x22, x21, [sp, #384] ; 16-byte Folded Spill +; GISEL-NEXT: stp x20, x19, [sp, #400] ; 16-byte Folded Spill +; GISEL-NEXT: stp x29, x30, [sp, #416] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 432 ; GISEL-NEXT: .cfi_offset w30, -8 ; GISEL-NEXT: .cfi_offset w29, -16 ; GISEL-NEXT: .cfi_offset w19, -24 @@ -1242,38 +1242,44 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: ldp x10, x11, [x1] ; GISEL-NEXT: mov w8, w2 ; GISEL-NEXT: lsr x9, x8, #6 -; GISEL-NEXT: and x16, x8, #0x3f +; GISEL-NEXT: and x12, x8, #0x3f +; GISEL-NEXT: str x0, [sp, #144] ; 8-byte Folded Spill +; GISEL-NEXT: and x14, x8, #0x3f ; GISEL-NEXT: mov w13, #64 ; =0x40 -; GISEL-NEXT: sub x21, x13, x16 -; GISEL-NEXT: str x0, [sp, #112] ; 8-byte Folded Spill -; GISEL-NEXT: mov x24, x16 -; GISEL-NEXT: lsl x25, x10, x16 +; GISEL-NEXT: and x16, x8, #0x3f +; GISEL-NEXT: lsl x0, x10, x12 ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: lsr x26, x10, x21 -; GISEL-NEXT: lsl x2, x11, x16 -; GISEL-NEXT: lsr x23, x11, x21 -; GISEL-NEXT: mov x22, x21 -; GISEL-NEXT: csel x12, x25, xzr, eq +; GISEL-NEXT: sub x2, x13, x14 +; GISEL-NEXT: lsr x3, x10, x2 +; GISEL-NEXT: lsl x6, x11, x14 +; GISEL-NEXT: and x14, x8, #0x3f +; GISEL-NEXT: csel x12, x0, xzr, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: str x1, [sp, #312] ; 8-byte Folded Spill +; GISEL-NEXT: lsr x20, x11, x2 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: str x23, [sp, #208] ; 8-byte Folded Spill +; GISEL-NEXT: mov x24, x0 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: stp x24, x22, [sp, #40] ; 16-byte Folded Spill +; GISEL-NEXT: mov x7, x3 ; GISEL-NEXT: csel x12, xzr, x12, eq ; 
GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: mov x28, x1 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: and x21, x8, #0x3f ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: str x6, [sp, #24] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: str x28, [sp, #304] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: str x7, [sp, #272] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: str x20, [sp, #112] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #10 ; GISEL-NEXT: csel x12, xzr, x12, eq @@ -1290,13 +1296,13 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x10, x10, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x10, [sp, #192] ; 8-byte Folded Spill -; GISEL-NEXT: csel x10, xzr, x26, eq +; GISEL-NEXT: str x10, [sp, #232] ; 8-byte Folded Spill +; GISEL-NEXT: csel x10, xzr, x3, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x10, x2, x10 +; GISEL-NEXT: orr x10, x6, x10 ; GISEL-NEXT: csel x10, x10, xzr, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: csel x10, x25, x10, eq +; GISEL-NEXT: csel x10, x0, x10, eq ; GISEL-NEXT: cmp x9, #2 ; GISEL-NEXT: csel x10, xzr, x10, eq ; GISEL-NEXT: cmp x9, #3 @@ -1327,25 +1333,24 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x9, #15 ; GISEL-NEXT: csel x13, xzr, x13, eq ; GISEL-NEXT: cmp x8, #0 -; GISEL-NEXT: lsl x20, x12, x16 +; GISEL-NEXT: lsl x26, x12, x14 ; GISEL-NEXT: csel x11, x11, x13, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill -; GISEL-NEXT: csel x11, xzr, x23, eq +; GISEL-NEXT: str x11, [sp, #224] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x20, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x11, x20, x11 -; GISEL-NEXT: lsr x15, x12, x21 -; GISEL-NEXT: lsl x14, x10, x16 +; GISEL-NEXT: orr x11, x26, x11 +; GISEL-NEXT: lsr x15, x12, x2 +; GISEL-NEXT: lsl x30, x10, x16 ; GISEL-NEXT: csel x11, x11, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: lsr x17, x10, x21 -; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: lsr x17, x10, x2 +; GISEL-NEXT: csel x13, xzr, x3, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: str x20, [sp, #8] ; 8-byte Folded Spill -; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: orr x13, x6, x13 ; GISEL-NEXT: csel x11, x13, x11, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: csel x11, x0, x11, eq ; GISEL-NEXT: cmp x9, #3 ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #4 @@ -1375,23 +1380,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x11, [sp, #176] ; 8-byte Folded Spill +; GISEL-NEXT: str x11, [sp, #216] ; 8-byte Folded Spill ; GISEL-NEXT: csel x11, xzr, x15, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x11, x14, x11 +; GISEL-NEXT: orr x11, x30, x11 ; GISEL-NEXT: csel x11, x11, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: csel x12, xzr, x20, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: orr x12, x26, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x26, eq +; GISEL-NEXT: csel x12, xzr, x3, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: orr x12, x2, x12 +; 
GISEL-NEXT: orr x12, x6, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x0, x11, eq
 ; GISEL-NEXT: cmp x9, #4
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #5
@@ -1421,33 +1426,33 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: lsl x0, x12, x16
 ; GISEL-NEXT: csel x10, x10, x13, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #168] ; 8-byte Folded Spill
+; GISEL-NEXT: str x10, [sp, #208] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x10, xzr, x17, eq
 ; GISEL-NEXT: cmp x9, #0
 ; GISEL-NEXT: orr x10, x0, x10
-; GISEL-NEXT: lsr x27, x12, x21
+; GISEL-NEXT: lsr x4, x12, x2
 ; GISEL-NEXT: lsl x19, x11, x16
 ; GISEL-NEXT: csel x10, x10, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsr x3, x11, x21
+; GISEL-NEXT: mov x16, x15
 ; GISEL-NEXT: csel x13, xzr, x15, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: stp x27, x0, [sp, #240] ; 16-byte Folded Spill
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: mov x7, x3
+; GISEL-NEXT: str x4, [sp, #248] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: str x0, [sp, #48] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: csel x10, x25, x10, eq
+; GISEL-NEXT: csel x10, x24, x10, eq
 ; GISEL-NEXT: cmp x9, #5
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x9, #6
@@ -1473,8 +1478,8 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x10, x12, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #160] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x10, xzr, x27, eq
+; GISEL-NEXT: str x10, [sp, #200] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x10, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #0
 ; GISEL-NEXT: orr x10, x19, x10
 ; GISEL-NEXT: csel x10, x10, xzr, eq
@@ -1486,20 +1491,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x12, xzr, x15, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: and x15, x8, #0x3f
+; GISEL-NEXT: orr x12, x30, x12
 ; GISEL-NEXT: csel x10, x12, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x12, x20, x12
+; GISEL-NEXT: orr x12, x26, x12
 ; GISEL-NEXT: csel x10, x12, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: lsr x3, x11, x2
+; GISEL-NEXT: orr x12, x6, x12
 ; GISEL-NEXT: csel x10, x12, x10, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: csel x10, x25, x10, eq
+; GISEL-NEXT: csel x10, x24, x10, eq
 ; GISEL-NEXT: cmp x9, #6
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x9, #7
@@ -1522,21 +1529,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x13, xzr, x13, eq
 ; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: lsl x4, x12, x16
+; GISEL-NEXT: lsl x22, x12, x15
 ; GISEL-NEXT: csel x11, x11, x13, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #192] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x11, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x4, x11
-; GISEL-NEXT: lsl x30, x10, x16
-; GISEL-NEXT: lsr x28, x10, x21
+; GISEL-NEXT: orr x11, x22, x11
+; GISEL-NEXT: lsl x5, x10, x15
+; GISEL-NEXT: lsr x27, x10, x2
 ; GISEL-NEXT: csel x11, x11, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x30, [sp, #200] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x25, x27
 ; GISEL-NEXT: orr x13, x19, x13
+; GISEL-NEXT: mov x14, x5
+; GISEL-NEXT: str x27, [sp, #328] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x11, x13, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x13, xzr, x17, eq
@@ -1544,30 +1553,29 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: orr x13, x0, x13
 ; GISEL-NEXT: csel x11, x13, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: orr x13, x30, x13
 ; GISEL-NEXT: csel x11, x13, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
 ; GISEL-NEXT: csel x11, x13, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
 ; GISEL-NEXT: csel x11, x13, x11, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: lsr x13, x12, x21
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: lsr x13, x12, x2
+; GISEL-NEXT: csel x11, x24, x11, eq
 ; GISEL-NEXT: cmp x9, #7
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: mov x6, x13
+; GISEL-NEXT: mov x15, x13
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: str x6, [sp, #256] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #10
 ; GISEL-NEXT: csel x11, xzr, x11, eq
@@ -1584,18 +1592,18 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #144] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x11, xzr, x13, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x30, x11
+; GISEL-NEXT: orr x11, x5, x11
 ; GISEL-NEXT: csel x11, x11, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x12, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x12, x4, x12
+; GISEL-NEXT: orr x12, x22, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x27, eq
+; GISEL-NEXT: csel x12, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #2
 ; GISEL-NEXT: orr x12, x19, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
@@ -1605,22 +1613,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: orr x12, x0, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: orr x12, x30, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x12, x20, x12
+; GISEL-NEXT: orr x12, x26, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: orr x12, x6, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x24, x11, eq
 ; GISEL-NEXT: cmp x9, #8
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #9
@@ -1635,39 +1643,34 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #14
 ; GISEL-NEXT: csel x12, xzr, x11, eq
-; GISEL-NEXT: ldp x11, x5, [x1, #64]
+; GISEL-NEXT: ldp x11, x1, [x1, #64]
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x12, x10, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsl x21, x11, x16
-; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x12, xzr, x28, eq
+; GISEL-NEXT: lsl x23, x11, x21
+; GISEL-NEXT: str x12, [sp, #176] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x27, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: lsr x10, x11, x22
-; GISEL-NEXT: mov x16, x19
+; GISEL-NEXT: orr x12, x23, x12
+; GISEL-NEXT: lsr x21, x11, x2
+; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, x12, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: mov x1, x16
 ; GISEL-NEXT: csel x13, xzr, x13, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x16, [sp, #304] ; 8-byte Folded Spill
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: orr x13, x5, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x13, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: lsl x3, x5, x24
-; GISEL-NEXT: orr x13, x4, x13
+; GISEL-NEXT: orr x13, x22, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: stp x21, x3, [sp, #216] ; 16-byte Folded Spill
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #3
 ; GISEL-NEXT: orr x13, x19, x13
-; GISEL-NEXT: mov x19, x28
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x13, xzr, x17, eq
@@ -1675,27 +1678,30 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: orr x13, x0, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: orr x13, x30, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: csel x12, x25, x12, eq
+; GISEL-NEXT: and x13, x8, #0x3f
+; GISEL-NEXT: csel x12, x24, x12, eq
 ; GISEL-NEXT: cmp x9, #9
+; GISEL-NEXT: lsl x10, x1, x13
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x9, #10
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x9, #11
+; GISEL-NEXT: stp x10, x15, [sp, #312] ; 16-byte Folded Spill
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x9, #12
 ; GISEL-NEXT: csel x12, xzr, x12, eq
@@ -1708,69 +1714,69 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x11, x11, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #128] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x11, xzr, x10, eq
+; GISEL-NEXT: str x11, [sp, #168] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x11, xzr, x21, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x10, x11
+; GISEL-NEXT: mov x10, x23
 ; GISEL-NEXT: csel x11, x11, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x28, eq
+; GISEL-NEXT: csel x12, xzr, x27, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: mov x28, x4
-; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: str x28, [sp, #32] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x27, x24
+; GISEL-NEXT: orr x12, x23, x12
+; GISEL-NEXT: mov x23, x15
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x6, eq
+; GISEL-NEXT: csel x12, xzr, x15, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x12, x30, x12
+; GISEL-NEXT: mov x15, x22
+; GISEL-NEXT: orr x12, x5, x12
+; GISEL-NEXT: mov x5, x3
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x7, eq
+; GISEL-NEXT: stp x14, x5, [sp, #256] ; 16-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x12, x4, x12
-; GISEL-NEXT: mov x4, x20
+; GISEL-NEXT: mov x5, x4
+; GISEL-NEXT: orr x12, x22, x12
+; GISEL-NEXT: lsr x22, x1, x2
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x27, eq
+; GISEL-NEXT: csel x12, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x27, x2
-; GISEL-NEXT: orr x12, x16, x12
-; GISEL-NEXT: mov x16, x17
+; GISEL-NEXT: str x22, [sp, #240] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x19, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x12, xzr, x17, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: mov x17, x15
 ; GISEL-NEXT: orr x12, x0, x12
-; GISEL-NEXT: lsr x0, x5, x22
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x12, x14, x12
-; GISEL-NEXT: str x0, [sp, #280] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x30, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: mov x23, x25
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x26, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x12, x2, x12
-; GISEL-NEXT: mov x2, x3
+; GISEL-NEXT: mov x7, x14
+; GISEL-NEXT: orr x12, x6, x12
+; GISEL-NEXT: mov x6, x28
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x24, x11, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: mov x25, x26
+; GISEL-NEXT: ldr x24, [x6, #88]
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #11
+; GISEL-NEXT: ldr x6, [sp, #272] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #12
 ; GISEL-NEXT: csel x11, xzr, x11, eq
@@ -1780,80 +1786,84 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x12, xzr, x11, eq
+; GISEL-NEXT: ldr x11, [x28, #80]
 ; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x12, x5, x12, eq
-; GISEL-NEXT: ldp x11, x5, [x15, #80]
+; GISEL-NEXT: csel x12, x1, x12, eq
+; GISEL-NEXT: mov x28, x2
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x12, [sp, #120] ; 8-byte Folded Spill
-; GISEL-NEXT: mov x15, x7
-; GISEL-NEXT: csel x12, xzr, x0, eq
+; GISEL-NEXT: lsl x2, x11, x13
+; GISEL-NEXT: str x12, [sp, #160] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x22, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: str x15, [sp, #24] ; 8-byte Folded Spill
-; GISEL-NEXT: lsl x20, x11, x24
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: str x20, [sp, #232] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: str x28, [sp, #16] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: str x2, [sp, #280] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, x12, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x10, eq
+; GISEL-NEXT: csel x13, xzr, x21, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x13, x3, x13
-; GISEL-NEXT: lsl x3, x5, x24
+; GISEL-NEXT: orr x13, x1, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x19, eq
+; GISEL-NEXT: csel x13, xzr, x25, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: stp x19, x3, [sp, #264] ; 16-byte Folded Spill
-; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: mov x25, x16
+; GISEL-NEXT: orr x13, x10, x13
+; GISEL-NEXT: mov x10, x30
+; GISEL-NEXT: str x25, [sp, #80] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x6, eq
+; GISEL-NEXT: csel x13, xzr, x23, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: mov x23, x3
+; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: mov x14, x17
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
-; GISEL-NEXT: ldp x7, x30, [sp, #240] ; 16-byte Folded Reload
+; GISEL-NEXT: stp x19, x14, [sp, #64] ; 16-byte Folded Spill
+; GISEL-NEXT: csel x13, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x13, x28, x13
+; GISEL-NEXT: mov x3, x21
+; GISEL-NEXT: orr x13, x15, x13
+; GISEL-NEXT: str x3, [sp, #32] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x1, x13
-; GISEL-NEXT: mov x1, x14
+; GISEL-NEXT: mov x4, x0
+; GISEL-NEXT: orr x13, x19, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x16, eq
+; GISEL-NEXT: csel x13, xzr, x17, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: mov x17, x27
+; GISEL-NEXT: orr x13, x0, x13
+; GISEL-NEXT: ldr x0, [sp, #24] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x17, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: ldr x14, [sp, #208] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: ldp x30, x16, [sp, #320] ; 16-byte Folded Reload
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x14, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x13, x4, x13
-; GISEL-NEXT: mov x4, x10
+; GISEL-NEXT: orr x13, x26, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x6, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: mov x26, x27
-; GISEL-NEXT: orr x13, x27, x13
-; GISEL-NEXT: lsr x27, x11, x22
+; GISEL-NEXT: orr x13, x0, x13
 ; GISEL-NEXT: csel x12, x13, x12, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: mov x13, x23
-; GISEL-NEXT: csel x12, x23, x12, eq
+; GISEL-NEXT: lsr x13, x11, x28
+; GISEL-NEXT: csel x12, x27, x12, eq
 ; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: str x27, [sp, #64] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: mov x23, x20
+; GISEL-NEXT: str x13, [sp, #96] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x12, xzr, x12, eq
 ; GISEL-NEXT: cmp x9, #13
 ; GISEL-NEXT: csel x12, xzr, x12, eq
@@ -1864,71 +1874,77 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x11, x11, x12, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #104] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x11, xzr, x27, eq
+; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill
+; GISEL-NEXT: and x11, x8, #0x3f
+; GISEL-NEXT: lsl x27, x24, x11
+; GISEL-NEXT: csel x11, xzr, x13, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x27, x11
+; GISEL-NEXT: str x27, [sp, #56] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x11, x11, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x0, eq
+; GISEL-NEXT: csel x12, xzr, x22, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: mov x0, x7
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: mov x20, x16
+; GISEL-NEXT: mov x22, x2
+; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: mov x2, x14
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x10, eq
+; GISEL-NEXT: csel x12, xzr, x21, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: ldr x10, [sp, #312] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x12, x2, x12
-; GISEL-NEXT: ldr x2, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x21, [sp, #288] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x12, x1, x12
+; GISEL-NEXT: mov x1, x27
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x19, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #3
 ; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: ldr x21, [sp, #200] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x6, eq
+; GISEL-NEXT: csel x12, xzr, x30, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x21, x12
+; GISEL-NEXT: orr x12, x7, x12
+; GISEL-NEXT: mov x7, x15
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: str x7, [sp, #40] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x23, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x12, x28, x12
+; GISEL-NEXT: orr x12, x15, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x7, eq
+; GISEL-NEXT: csel x12, xzr, x5, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: mov x7, x17
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: mov x5, x19
+; GISEL-NEXT: orr x12, x19, x12
+; GISEL-NEXT: mov x19, x7
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x16, eq
+; GISEL-NEXT: csel x12, xzr, x14, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x12, x30, x12
+; GISEL-NEXT: lsr x14, x24, x28
+; GISEL-NEXT: orr x12, x4, x12
+; GISEL-NEXT: mov x4, x10
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x17, eq
+; GISEL-NEXT: csel x12, xzr, x25, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: mov x17, x24
-; GISEL-NEXT: orr x12, x1, x12
+; GISEL-NEXT: orr x12, x10, x12
+; GISEL-NEXT: ldr x10, [sp, #304] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x14, eq
-; GISEL-NEXT: ldr x14, [sp, #8] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x12, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: orr x12, x26, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x25, eq
+; GISEL-NEXT: csel x12, xzr, x6, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x12, x26, x12
+; GISEL-NEXT: orr x12, x0, x12
 ; GISEL-NEXT: csel x11, x12, x11, eq
 ; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: csel x11, x13, x11, eq
+; GISEL-NEXT: csel x11, x17, x11, eq
 ; GISEL-NEXT: cmp x9, #12
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #13
@@ -1937,393 +1953,395 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x12, xzr, x11, eq
+; GISEL-NEXT: ldp x11, x6, [x10, #96]
 ; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: ldp x11, x10, [x10, #96]
-; GISEL-NEXT: csel x12, x5, x12, eq
-; GISEL-NEXT: str x12, [sp, #96] ; 8-byte Folded Spill
-; GISEL-NEXT: mov x12, x22
-; GISEL-NEXT: lsr x22, x5, x22
-; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: mov x5, x27
-; GISEL-NEXT: lsl x24, x11, x24
-; GISEL-NEXT: str x10, [sp, #296] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x10, xzr, x22, eq
+; GISEL-NEXT: and x10, x8, #0x3f
+; GISEL-NEXT: csel x12, x24, x12, eq
+; GISEL-NEXT: tst x8, #0x3f
+; GISEL-NEXT: ldr x24, [sp, #248] ; 8-byte Folded Reload
+; GISEL-NEXT: lsl x15, x11, x10
+; GISEL-NEXT: csel x10, xzr, x14, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: str x22, [sp, #16] ; 8-byte Folded Spill
-; GISEL-NEXT: orr x10, x24, x10
+; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x12, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x10, x15, x10
+; GISEL-NEXT: str x15, [sp, #296] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x15, x13
 ; GISEL-NEXT: csel x10, x10, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x13, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: ldr x27, [sp, #280] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x13, x3, x13
-; GISEL-NEXT: mov x3, x26
+; GISEL-NEXT: orr x13, x27, x13
+; GISEL-NEXT: ldr x27, [sp, #240] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x13, xzr, x27, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x13, x23, x13
-; GISEL-NEXT: mov x23, x4
+; GISEL-NEXT: orr x13, x22, x13
+; GISEL-NEXT: ldr x22, [sp, #272] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x4, eq
-; GISEL-NEXT: ldp x4, x16, [sp, #216] ; 16-byte Folded Reload
+; GISEL-NEXT: csel x13, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x16, x13
+; GISEL-NEXT: orr x13, x12, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x19, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x19, x1
-; GISEL-NEXT: orr x13, x4, x13
+; GISEL-NEXT: mov x16, x17
+; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: ldp x23, x21, [sp, #256] ; 16-byte Folded Reload
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x6, eq
+; GISEL-NEXT: csel x13, xzr, x30, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: mov x6, x14
-; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: mov x30, x0
+; GISEL-NEXT: orr x13, x23, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x21, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x28, x13
+; GISEL-NEXT: orr x13, x7, x13
+; GISEL-NEXT: mov x7, x14
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x0, eq
+; GISEL-NEXT: csel x13, xzr, x24, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: mov x0, x23
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x5, x13
+; GISEL-NEXT: ldr x5, [sp, #48] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x20, eq
+; GISEL-NEXT: csel x13, xzr, x2, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x13, x30, x13
-; GISEL-NEXT: ldr x30, [sp, #208] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x2, [sp, #296] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x5, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
+; GISEL-NEXT: csel x13, xzr, x25, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x13, x1, x13
+; GISEL-NEXT: mov x25, x6
+; GISEL-NEXT: orr x13, x4, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x30, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: ldp x14, x2, [sp, #264] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x13, x26, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x25, eq
+; GISEL-NEXT: csel x13, xzr, x22, eq
 ; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x13, x26, x13
-; GISEL-NEXT: ldr x26, [sp, #288] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x0, x13
 ; GISEL-NEXT: csel x10, x13, x10, eq
 ; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: lsr x13, x11, x12
-; GISEL-NEXT: csel x10, x26, x10, eq
+; GISEL-NEXT: lsr x13, x11, x28
+; GISEL-NEXT: csel x10, x17, x10, eq
 ; GISEL-NEXT: cmp x9, #13
+; GISEL-NEXT: ldr x17, [sp, #80] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x9, #14
-; GISEL-NEXT: str x13, [sp, #72] ; 8-byte Folded Spill
+; GISEL-NEXT: str x13, [sp, #104] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x8, #0
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #88] ; 8-byte Folded Spill
-; GISEL-NEXT: ldr x10, [sp, #296] ; 8-byte Folded Reload
-; GISEL-NEXT: lsl x11, x10, x17
+; GISEL-NEXT: str x10, [sp, #128] ; 8-byte Folded Spill
+; GISEL-NEXT: and x10, x8, #0x3f
+; GISEL-NEXT: lsl x11, x6, x10
 ; GISEL-NEXT: csel x10, xzr, x13, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: ldr x17, [sp, #232] ; 8-byte Folded Reload
-; GISEL-NEXT: ldr x13, [sp, #256] ; 8-byte Folded Reload
+; GISEL-NEXT: ldp x0, x13, [sp, #280] ; 16-byte Folded Reload
+; GISEL-NEXT: mov x6, x16
 ; GISEL-NEXT: orr x10, x11, x10
-; GISEL-NEXT: str x11, [sp, #56] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #88] ; 8-byte Folded Spill
 ; GISEL-NEXT: csel x10, x10, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x22, eq
+; GISEL-NEXT: csel x11, xzr, x14, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x11, x24, x11
+; GISEL-NEXT: orr x11, x2, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x5, eq
+; GISEL-NEXT: csel x11, xzr, x15, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x11, x2, x11
-; GISEL-NEXT: ldp x12, x5, [sp, #240] ; 16-byte Folded Reload
+; GISEL-NEXT: mov x15, x3
+; GISEL-NEXT: orr x11, x1, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x11, xzr, x27, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: mov x27, x30
-; GISEL-NEXT: orr x11, x17, x11
+; GISEL-NEXT: orr x11, x0, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x23, eq
+; GISEL-NEXT: csel x11, xzr, x3, eq
+; GISEL-NEXT: ldp x14, x3, [sp, #320] ; 16-byte Folded Reload
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x23, x20
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x12, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x14, eq
+; GISEL-NEXT: csel x11, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x11, x4, x11
+; GISEL-NEXT: orr x11, x13, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x13, eq
+; GISEL-NEXT: csel x11, xzr, x14, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x11, x21, x11
-; GISEL-NEXT: ldr x21, [sp, #296] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x23, x11
+; GISEL-NEXT: mov x23, x5
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x15, eq
+; GISEL-NEXT: csel x11, xzr, x21, eq
 ; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x11, x28, x11
+; GISEL-NEXT: mov x21, x4
+; GISEL-NEXT: orr x11, x19, x11
+; GISEL-NEXT: ldp x12, x19, [sp, #64] ; 16-byte Folded Reload
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x12, eq
+; GISEL-NEXT: csel x11, xzr, x24, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x11, x16, x11
+; GISEL-NEXT: orr x11, x12, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x20, eq
+; GISEL-NEXT: csel x11, xzr, x19, eq
 ; GISEL-NEXT: cmp x9, #9
 ; GISEL-NEXT: orr x11, x5, x11
+; GISEL-NEXT: mov x5, x30
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x7, eq
+; GISEL-NEXT: csel x11, xzr, x17, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x11, x1, x11
-; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x4, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x30, eq
+; GISEL-NEXT: csel x11, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x11, x6, x11
+; GISEL-NEXT: orr x11, x26, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x25, eq
+; GISEL-NEXT: csel x11, xzr, x22, eq
 ; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x30, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: cmp x9, #13
-; GISEL-NEXT: csel x10, x26, x10, eq
+; GISEL-NEXT: csel x10, x16, x10, eq
 ; GISEL-NEXT: cmp x9, #14
+; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, xzr, x10, eq
 ; GISEL-NEXT: cmp x9, #15
 ; GISEL-NEXT: csel x11, xzr, x10, eq
 ; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x11, x21, x11, eq
-; GISEL-NEXT: ldp x10, x20, [x1, #112]
-; GISEL-NEXT: str x11, [sp, #80] ; 8-byte Folded Spill
-; GISEL-NEXT: ldp x11, x4, [sp, #40] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x10, x4, [x16, #112]
+; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: str x11, [sp, #120] ; 8-byte Folded Spill
+; GISEL-NEXT: lsr x11, x25, x28
+; GISEL-NEXT: and x16, x8, #0x3f
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsr x21, x21, x4
-; GISEL-NEXT: lsl x28, x10, x11
-; GISEL-NEXT: csel x1, xzr, x21, eq
-; GISEL-NEXT: str x21, [sp, #296] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x25, [sp, #88] ; 8-byte Folded Reload
+; GISEL-NEXT: lsl x24, x10, x16
+; GISEL-NEXT: csel x1, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x1, x28, x1
-; GISEL-NEXT: ldr x21, [sp, #72] ; 8-byte Folded Reload
-; GISEL-NEXT: str x28, [sp, #312] ; 8-byte Folded Spill
+; GISEL-NEXT: ldp x16, x28, [sp, #96] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x1, x24, x1
 ; GISEL-NEXT: csel x1, x1, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: ldr x28, [sp, #56] ; 8-byte Folded Reload
-; GISEL-NEXT: csel x30, xzr, x21, eq
+; GISEL-NEXT: csel x30, xzr, x28, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x30, x28, x30
+; GISEL-NEXT: orr x30, x25, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x22, eq
+; GISEL-NEXT: csel x30, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: ldr x22, [sp, #64] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x30, x24, x30
+; GISEL-NEXT: orr x30, x2, x30
+; GISEL-NEXT: ldr x2, [sp, #56] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x22, eq
+; GISEL-NEXT: csel x30, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #3
 ; GISEL-NEXT: orr x30, x2, x30
-; GISEL-NEXT: ldr x2, [sp, #280] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x2, eq
+; GISEL-NEXT: csel x30, xzr, x27, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x30, x17, x30
-; GISEL-NEXT: ldr x17, [sp, #224] ; 8-byte Folded Reload
+; GISEL-NEXT: mov x27, x13
+; GISEL-NEXT: orr x30, x0, x30
+; GISEL-NEXT: ldr x0, [sp, #248] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x0, eq
+; GISEL-NEXT: csel x30, xzr, x15, eq
+; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload
 ; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x30, x17, x30
+; GISEL-NEXT: orr x30, x15, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x14, eq
-; GISEL-NEXT: ldr x14, [sp, #216] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x3, eq
 ; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x30, x14, x30
+; GISEL-NEXT: ldr x3, [sp, #40] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x30, x13, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x13, eq
-; GISEL-NEXT: ldr x13, [sp, #200] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x14, eq
+; GISEL-NEXT: ldp x13, x14, [sp, #256] ; 16-byte Folded Reload
 ; GISEL-NEXT: cmp x9, #7
 ; GISEL-NEXT: orr x30, x13, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x15, eq
-; GISEL-NEXT: ldr x15, [sp, #32] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x14, eq
 ; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x30, x15, x30
+; GISEL-NEXT: orr x30, x3, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x12, eq
+; GISEL-NEXT: csel x30, xzr, x0, eq
 ; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x30, x16, x30
+; GISEL-NEXT: orr x30, x12, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x23, eq
+; GISEL-NEXT: csel x30, xzr, x19, eq
 ; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x30, x5, x30
+; GISEL-NEXT: orr x30, x23, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x7, eq
+; GISEL-NEXT: csel x30, xzr, x17, eq
 ; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x30, x19, x30
+; GISEL-NEXT: orr x30, x21, x30
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x27, eq
+; GISEL-NEXT: csel x30, xzr, x20, eq
 ; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: orr x30, x6, x30
+; GISEL-NEXT: mov x20, x26
+; GISEL-NEXT: orr x30, x26, x30
+; GISEL-NEXT: mov x26, x5
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x25, eq
+; GISEL-NEXT: csel x30, xzr, x22, eq
 ; GISEL-NEXT: cmp x9, #13
-; GISEL-NEXT: orr x30, x3, x30
+; GISEL-NEXT: orr x30, x5, x30
+; GISEL-NEXT: ldr x5, [sp, #16] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x1, x30, x1, eq
 ; GISEL-NEXT: cmp x9, #14
-; GISEL-NEXT: lsr x30, x10, x4
-; GISEL-NEXT: csel x1, x26, x1, eq
+; GISEL-NEXT: csel x1, x6, x1, eq
 ; GISEL-NEXT: cmp x9, #15
+; GISEL-NEXT: lsr x30, x10, x5
 ; GISEL-NEXT: csel x1, xzr, x1, eq
 ; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x26, x10, x1, eq
-; GISEL-NEXT: lsl x10, x20, x11
+; GISEL-NEXT: csel x5, x10, x1, eq
+; GISEL-NEXT: and x10, x8, #0x3f
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x16, xzr, x30, eq
+; GISEL-NEXT: lsl x10, x4, x10
+; GISEL-NEXT: csel x1, xzr, x30, eq
 ; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: ldr x11, [sp, #296] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x10, x10, x16
-; GISEL-NEXT: ldr x16, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: ldp x29, x30, [sp, #416] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x10, x10, x1
+; GISEL-NEXT: ldr x1, [sp, #296] ; 8-byte Folded Reload
 ; GISEL-NEXT: csel x10, x10, xzr, eq
 ; GISEL-NEXT: tst x8, #0x3f
 ; GISEL-NEXT: csel x11, xzr, x11, eq
 ; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #272] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x24, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x21, eq
+; GISEL-NEXT: csel x11, xzr, x28, eq
 ; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x11, x28, x11
-; GISEL-NEXT: ldp x29, x30, [sp, #400] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x11, x25, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x11, eq
+; GISEL-NEXT: csel x11, xzr, x7, eq
 ; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x11, x24, x11
+; GISEL-NEXT: orr x11, x1, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
 ; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x22, eq
+; GISEL-NEXT: csel x11, xzr, x16, eq
 ; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #232] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x16, [sp, #280] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x2, x11
 ; GISEL-NEXT: csel x10, x11, x10, eq
+; GISEL-NEXT: ldr x11, [sp, #240] ; 8-byte Folded Reload
 ; GISEL-NEXT: tst x8, #0x3f
-; CHECK-NEXT: ldp q3, q2, [x17, #-32]
-; CHECK-NEXT: add x16, x16, #32
-; CHECK-NEXT: ldp q6, q5, [x17]
+; CHECK-NEXT: ldp q1, q4, [x15, #-16]
+; CHECK-NEXT: subs x17, x17, #16
+; CHECK-NEXT: ldp q3, q2, [x16, #-32]
+; CHECK-NEXT: add x15, x15, #32
+; CHECK-NEXT: ldp q6, q5, [x16]
 ; CHECK-NEXT: smlal2 v2.4s, v0.8h, v1.8h
 ; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h
 ; CHECK-NEXT: smlal2 v5.4s, v0.8h, v4.8h
 ; CHECK-NEXT: smlal v6.4s, v0.4h, v4.4h
-; CHECK-NEXT: stp q3, q2, [x17, #-32]
-; CHECK-NEXT: stp q6, q5, [x17], #64
+; CHECK-NEXT: stp q3, q2, [x16, #-32]
+; CHECK-NEXT: stp q6, q5, [x16], #64
 ; CHECK-NEXT: b.ne .LBB0_6
 ; CHECK-NEXT: // %bb.7: // %middle.block
 ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT: cmp x12, x10
-; CHECK-NEXT: mov x18, x12
+; CHECK-NEXT: and x17, x10, #0xfffffff0
 ; CHECK-NEXT: b.eq .LBB0_2
 ; CHECK-NEXT: .LBB0_8: // %for.body4.us.preheader
 ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: add x16, x18, x8
-; CHECK-NEXT: add x17, x2, x18, lsl #1
-; CHECK-NEXT: sub x18, x10, x18
-; CHECK-NEXT: add x16, x1, x16, lsl #2
+; CHECK-NEXT: add x15, x17, x8
+; CHECK-NEXT: add x16, x2, x17, lsl #1
+; CHECK-NEXT: sub x17, x10, x17
+; CHECK-NEXT: add x15, x1, x15, lsl #2
 ; CHECK-NEXT: .LBB0_9: // %for.body4.us
 ; CHECK-NEXT: // Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrsh w3, [x17], #2
-; CHECK-NEXT: ldr w4, [x16]
-; CHECK-NEXT: subs x18, x18, #1
-; CHECK-NEXT: madd w3, w3, w15, w4
-; CHECK-NEXT: str w3, [x16], #4
+; CHECK-NEXT: ldrsh w18, [x16], #2
+; CHECK-NEXT: ldr w3, [x15]
+; CHECK-NEXT: subs x17, x17, #1
+; CHECK-NEXT: madd w18, w18, w14, w3
+; CHECK-NEXT: str w18, [x15], #4
 ; CHECK-NEXT: b.ne .LBB0_9
 ; CHECK-NEXT: b .LBB0_2
 ; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
index 3caac1d..74b0e69 100644
--- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -278,9 +278,9 @@ define i64 @test_and_4(i64 %x, i64 %y) {
 ; CHECK-GI-NEXT: .cfi_offset w19, -8
 ; CHECK-GI-NEXT: .cfi_offset w20, -16
 ; CHECK-GI-NEXT: .cfi_offset w30, -32
-; CHECK-GI-NEXT: and x20, x0, #0x3
 ; CHECK-GI-NEXT: mov x19, x0
-; CHECK-GI-NEXT: mov x0, x20
+; CHECK-GI-NEXT: and x20, x0, #0x3
+; CHECK-GI-NEXT: and x0, x0, #0x3
 ; CHECK-GI-NEXT: bl callee
 ; CHECK-GI-NEXT: tst x19, #0x3
 ; CHECK-GI-NEXT: csel x0, x20, x0, eq
diff --git a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
index e0f2155..58c01db 100644
--- a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
+++ b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
@@ -7,20 +7,16 @@
 define void @foo(i64 %v1, i64 %v2, ptr %ptr) {
 ; CHECK-LABEL: foo:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: add x3, x0, x1
-; CHECK-NEXT: str x3, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT: str x3, [x2, #8]
 ; CHECK-NEXT: ldr x3, [x2, #16]
 ; CHECK-NEXT: add x3, x0, x3
 ; CHECK-NEXT: sub x3, x3, x1
 ; CHECK-NEXT: str x3, [x2, #16]
-; CHECK-NEXT: ldr x3, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: add x3, x0, x1
 ; CHECK-NEXT: str x3, [x2, #24]
 ; CHECK-NEXT: str x0, [x2, #32]
 ; CHECK-NEXT: str x1, [x2, #40]
-; CHECK-NEXT: add sp, sp, #16
 ; CHECK-NEXT: ret
   %v3 = add i64 %v1, %v2
   %p1 = getelementptr i64, ptr %ptr, i64 1
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 5fc996a..0f62997 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -23,7 +23,7 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT: add x13, x1, #16
 ; CHECK-NEXT: add x8, x1, x10, lsl #2
 ; CHECK-NEXT: add x9, x0, x10
-; CHECK-NEXT: mov x14, x10
+; CHECK-NEXT: and x14, x11, #0x1fffffff8
 ; CHECK-NEXT: .LBB0_4: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ldp q1, q2, [x13, #-16]
@@ -194,9 +194,9 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT: mov w8, #1132396544 // =0x437f0000
 ; CHECK-NEXT: and x10, x11, #0x1fffffffc
 ; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
 ; CHECK-NEXT: add x8, x1, x10, lsl #3
 ; CHECK-NEXT: add x9, x0, x10, lsl #1
-; CHECK-NEXT: mov x12, x10
 ; CHECK-NEXT: .LBB1_9: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ld2 { v1.4s, v2.4s }, [x1], #32
@@ -341,7 +341,7 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT: dup v0.4s, w8
 ; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI2_0]
 ; CHECK-NEXT: add x9, x10, x10, lsl #1
-; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
 ; CHECK-NEXT: add x8, x1, x9, lsl #2
 ; CHECK-NEXT: add x9, x0, x9
 ; CHECK-NEXT: .LBB2_4: // %vector.body
@@ -597,7 +597,7 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI3_0]
 ; CHECK-NEXT: add x8, x1, x10, lsl #4
 ; CHECK-NEXT: add x9, x0, x10, lsl #2
-; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
 ; CHECK-NEXT: .LBB3_9: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64
diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll
index 0e68270..3e933fa 100644
--- a/llvm/test/CodeGen/AArch64/trampoline.ll
+++ b/llvm/test/CodeGen/AArch64/trampoline.ll
@@ -263,3 +263,9 @@ define i64 @func2() {
   %fp = call ptr @llvm.adjust.trampoline(ptr @trampg)
   ret i64 0
 }
+
+; Check for the explicitly emitted .note.GNU-stack section (ELF only) in the
+; presence of trampolines.
+; UTC_ARGS: --disable
+; CHECK-LINUX: .section ".note.GNU-stack","x",@progbits
+; UTC_ARGS: --enable
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
index c82b341..5bc9cdb 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -256,7 +256,7 @@ endif: ; preds = %else, %if
 define amdgpu_kernel void @copy1(ptr addrspace(1) %out, ptr addrspace(1) %in0) {
 entry:
   %tmp = load float, ptr addrspace(1) %in0
-  %tmp1 = fcmp oeq float %tmp, 0.000000e+00
+  %tmp1 = fcmp one float %tmp, 0.000000e+00
   br i1 %tmp1, label %if0, label %endif
 
 if0: ; preds = %entry
diff --git a/llvm/test/CodeGen/ARM/combine-movc-sub.ll b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
index ca5d089..8ca4c43 100644
--- a/llvm/test/CodeGen/ARM/combine-movc-sub.ll
+++ b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
@@ -27,11 +27,11 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: sub.w r7, r2, #32
-; CHECK-NEXT: mov r8, r0
+; CHECK-NEXT: sub.w r8, r2, #32
+; CHECK-NEXT: mov r6, r0
 ; CHECK-NEXT: movs r0, #1
 ; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: add.w r6, r0, r7, lsr #5
+; CHECK-NEXT: add.w r7, r0, r8, lsr #5
 ; CHECK-NEXT: mov r5, r1
 ; CHECK-NEXT: mov.w r9, #0
 ; CHECK-NEXT: b .LBB0_2
@@ -44,16 +44,16 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
 ; CHECK-NEXT: mov r2, r4
 ; CHECK-NEXT: cmp r4, #31
 ; CHECK-NEXT: ldr r0, [r1, #16]
-; CHECK-NEXT: add.w r0, r0, r6, lsl #2
+; CHECK-NEXT: add.w r0, r0, r7, lsl #2
 ; CHECK-NEXT: ldr r0, [r0, #40]
 ; CHECK-NEXT: it hi
-; CHECK-NEXT: andhi r2, r7, #31
+; CHECK-NEXT: andhi r2, r8, #31
 ; CHECK-NEXT: lsrs r0, r2
 ; CHECK-NEXT: lsls r0, r0, #31
 ; CHECK-NEXT: beq .LBB0_1
 ; CHECK-NEXT: @ %bb.3: @ %if.then
 ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT: mov r0, r8
+; CHECK-NEXT: mov r0, r6
 ; CHECK-NEXT: bl foo
 ; CHECK-NEXT: str.w r9, [r5, #4]
 ; CHECK-NEXT: b .LBB0_1
diff --git a/llvm/test/CodeGen/ARM/extract-bits.ll b/llvm/test/CodeGen/ARM/extract-bits.ll
index 77deaa5..d717806 100644
--- a/llvm/test/CodeGen/ARM/extract-bits.ll
+++ b/llvm/test/CodeGen/ARM/extract-bits.ll
@@ -316,28 +316,28 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ;
 ; V7A-LABEL: bextr64_a0:
 ; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
 ; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r4
 ; V7A-NEXT: lsr r1, r1, r2
 ; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: and r0, r4, r0
-; V7A-NEXT: and r1, r12, r1
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: and r0, r5, r0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r4, r5, r11, pc}
 ;
 ; V7A-T-LABEL: bextr64_a0:
 ; V7A-T: @ %bb.0:
@@ -434,28 +434,28 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n
 ;
 ; V7A-LABEL: bextr64_a0_arithmetic:
 ; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
 ; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
 ; V7A-NEXT: asr r2, r1, r2
-; V7A-NEXT: asrpl r0, r1, r3
 ; V7A-NEXT: asrpl r2, r1, #31
-; V7A-NEXT: and r0, r4, r0
-; V7A-NEXT: and r1, r12, r2
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: asrpl r0, r1, r4
+; V7A-NEXT: and r1, r3, r2
+; V7A-NEXT: and r0, r5, r0
+; V7A-NEXT: pop {r4, r5, r11, pc}
 ;
 ; V7A-T-LABEL: bextr64_a0_arithmetic:
 ; V7A-T: @ %bb.0:
@@ -911,28 +911,28 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ;
 ; V7A-LABEL: bextr64_a4_commutative:
 ; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
 ; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r4
 ; V7A-NEXT: lsr r1, r1, r2
 ; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: and r0, r0, r4
-; V7A-NEXT: and r1, r1, r12
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: and r0, r0, r5
+; V7A-NEXT: and r1, r1, r3
+; V7A-NEXT: pop {r4, r5, r11, pc}
 ;
 ; V7A-T-LABEL: bextr64_a4_commutative:
 ; V7A-T: @ %bb.0:
@@ -3456,22 +3456,22 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; V7M-NEXT: uxtb r2, r2
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r1, #0
-; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: rsb.w r3, r2, #32
 ; V7M-NEXT: lsls r1, r2
-; V7M-NEXT: sub.w r3, r2, #32
-; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: sub.w r12, r2, #32
+; V7M-NEXT: lsr.w r4, r0, r3
 ; V7M-NEXT: orrs r1, r4
-; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: cmp.w r12, #0
 ; V7M-NEXT: it pl
-; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lslpl.w r1, r0, r12
 ; V7M-NEXT: lsl.w r0, r0, r2
-; V7M-NEXT: lsl.w r4, r1, r12
+; V7M-NEXT: lsl.w r3, r1, r3
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r0, #0
 ; V7M-NEXT: lsr.w r0, r0, r2
-; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: orr.w r0, r0, r3
 ; V7M-NEXT: it pl
-; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsrpl.w r0, r1, r12
 ; V7M-NEXT: lsr.w r1, r1, r2
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r1, #0
@@ -3715,26 +3715,26 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
 ; V7M-NEXT: uxtb r2, r2
 ; V7M-NEXT: lsl.w r0, lr, r0
 ; V7M-NEXT: orr.w r0, r0, r12
-; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: sub.w r12, r2, #32
 ; V7M-NEXT: it pl
 ; V7M-NEXT: lsrpl.w r0, lr, r3
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsb.w r3, r2, #32
 ; V7M-NEXT: lsls r1, r2
-; V7M-NEXT: sub.w r3, r2, #32
-; V7M-NEXT: lsr.w r4, r0, r12
-; V7M-NEXT: orrs r1, r4
-; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: cmp.w r12, #0
+; V7M-NEXT: lsr.w r4, r0, r3
+; V7M-NEXT: orr.w r1, r1, r4
 ; V7M-NEXT: it pl
-; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lslpl.w r1, r0, r12
 ; V7M-NEXT: lsl.w r0, r0, r2
-; V7M-NEXT: lsl.w r4, r1, r12
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r3, r1, r3
 ; V7M-NEXT: lsr.w r0, r0, r2
-; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: orr.w r0, r0, r3
 ; V7M-NEXT: it pl
-; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsrpl.w r0, r1, r12
 ; V7M-NEXT: lsr.w r1, r1, r2
 ; V7M-NEXT: it pl
 ; V7M-NEXT: movpl r1, #0
diff --git a/llvm/test/CodeGen/ARM/extract-lowbits.ll b/llvm/test/CodeGen/ARM/extract-lowbits.ll
index b483793..373d998 100644
--- a/llvm/test/CodeGen/ARM/extract-lowbits.ll
+++ b/llvm/test/CodeGen/ARM/extract-lowbits.ll
@@ -243,15 +243,15 @@ define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind {
 ; V7A: @ %bb.0:
 ; V7A-NEXT: .save {r11, lr}
 ; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
 ; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
 ; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
 ; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
 ; V7A-NEXT: and r0, r2, r0
 ; V7A-NEXT: and r1, r3, r1
 ; V7A-NEXT: pop {r11, pc}
@@ -323,15 +323,15 @@ define i64 @bzhi64_a0_masked(i64 %val, i64 %numlowbits) nounwind {
 ; V7A-NEXT: .save {r11, lr}
 ; V7A-NEXT: push {r11, lr}
 ; V7A-NEXT: and r2, r2, #63
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: rsb r12, r2, #32
 ; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
 ; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
 ; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
 ; V7A-NEXT: and r0, r2, r0
 ; V7A-NEXT: and r1, r3, r1
 ; V7A-NEXT: pop {r11, pc}
@@ -404,15 +404,15 @@ define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 ; V7A: @ %bb.0:
 ; V7A-NEXT: .save {r11, lr}
 ; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
 ; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
 ; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
 ; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
 ; V7A-NEXT: and r0, r2, r0
 ; V7A-NEXT: and r1, r3, r1
 ; V7A-NEXT: pop {r11, pc}
@@ -644,15 +644,15 @@ define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; V7A: @ %bb.0:
 ; V7A-NEXT: .save {r11, lr}
 ; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
 ; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
 ; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
 ; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
 ; V7A-NEXT: and r0, r0, r2
 ; V7A-NEXT: and r1, r1, r3
 ; V7A-NEXT: pop {r11, pc}
@@ -2144,23 +2144,23 @@ define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind {
 ;
 ; V7A-LABEL: bzhi64_d2_load:
 ; V7A: @ %bb.0:
-; V7A-NEXT: .save {r5, r7, r11, lr}
-; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: .save {r5, lr}
+; V7A-NEXT: push {r5, lr}
 ; V7A-NEXT: rsb r3, r2, #64
-; V7A-NEXT: ldm r0, {r0, r7}
-; V7A-NEXT: rsb r1, r3, #32
+; V7A-NEXT: ldm r0, {r0, r5}
+; V7A-NEXT: rsb r12, r3, #32
 ; V7A-NEXT: rsbs r2, r2, #32
-; V7A-NEXT: lsr r5, r0, r1
-; V7A-NEXT: orr r7, r5, r7, lsl r3
-; V7A-NEXT: lslpl r7, r0, r2
+; V7A-NEXT: lsr r1, r0, r12
+; V7A-NEXT: orr r1, r1, r5, lsl r3
+; V7A-NEXT: lslpl r1, r0, r2
 ; V7A-NEXT: lsl r0, r0, r3
 ; V7A-NEXT: movwpl r0, #0
 ; V7A-NEXT: lsr r0, r0, r3
-; V7A-NEXT: orr r0, r0, r7, lsl r1
-; V7A-NEXT: lsr r1, r7, r3
-; V7A-NEXT: lsrpl r0, r7, r2
+; V7A-NEXT: orr r0, r0, r1, lsl r12
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: lsr r1, r1, r3
 ; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: pop {r5, r7, r11, pc}
+; V7A-NEXT: pop {r5, pc}
 ;
 ; V7A-T-LABEL: bzhi64_d2_load:
 ; V7A-T: @ %bb.0:
@@ -2237,26 +2237,26 @@ define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ;
 ; V7A-LABEL: bzhi64_d3_load_indexzext:
 ; V7A: @ %bb.0:
-; V7A-NEXT: .save {r5, r7, r11, lr}
-; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: .save {r5, lr}
+; V7A-NEXT: push {r5, lr}
 ; V7A-NEXT: rsb r1, r1, #64
-; V7A-NEXT: ldm r0, {r0, r7}
+; V7A-NEXT: ldm r0, {r0, r5}
 ; V7A-NEXT: uxtb r2, r1
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: lsr r5, r0, r3
-; V7A-NEXT: orr r7, r5, r7, lsl r2
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: lsr r3, r0, r12
+; V7A-NEXT: orr r3, r3, r5, lsl r2
 ; V7A-NEXT: mvn r5, #31
 ; V7A-NEXT: uxtab r1, r5, r1
 ; V7A-NEXT: cmp r1, #0
-; V7A-NEXT: lslpl r7, r0, r1
+; V7A-NEXT: lslpl r3, r0, r1
 ; V7A-NEXT: lsl r0, r0, r2
 ; V7A-NEXT: movwpl r0, #0
 ; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: orr r0, r0, r7, lsl r3
-; V7A-NEXT: lsrpl r0, r7, r1
-; V7A-NEXT: lsr r1, r7, r2
+; V7A-NEXT: orr r0, r0, r3, lsl r12
+; V7A-NEXT: lsrpl r0, r3, r1
+; V7A-NEXT: lsr r1, r3, r2
 ; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: pop {r5, r7, r11, pc}
+; V7A-NEXT: pop {r5, pc}
 ;
 ; V7A-T-LABEL: bzhi64_d3_load_indexzext:
 ; V7A-T: @ %bb.0:
diff --git a/llvm/test/CodeGen/ARM/llround-conv.ll b/llvm/test/CodeGen/ARM/llround-conv.ll
index 0f57e4a..f734db8 100644
--- a/llvm/test/CodeGen/ARM/llround-conv.ll
+++ b/llvm/test/CodeGen/ARM/llround-conv.ll
@@ -1,25 +1,71 @@
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
+; RUN: llc < %s
-mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16 +; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16 + +define i64 @testmsxh_builtin(half %x) { +; CHECK-SOFT-LABEL: testmsxh_builtin: +; CHECK-SOFT: @ %bb.0: @ %entry +; CHECK-SOFT-NEXT: .save {r11, lr} +; CHECK-SOFT-NEXT: push {r11, lr} +; CHECK-SOFT-NEXT: bl __aeabi_h2f +; CHECK-SOFT-NEXT: bl llroundf +; CHECK-SOFT-NEXT: pop {r11, pc} +; +; CHECK-NOFP16-LABEL: testmsxh_builtin: +; CHECK-NOFP16: @ %bb.0: @ %entry +; CHECK-NOFP16-NEXT: .save {r11, lr} +; CHECK-NOFP16-NEXT: push {r11, lr} +; CHECK-NOFP16-NEXT: vmov r0, s0 +; CHECK-NOFP16-NEXT: bl __aeabi_h2f +; CHECK-NOFP16-NEXT: vmov s0, r0 +; CHECK-NOFP16-NEXT: bl llroundf +; CHECK-NOFP16-NEXT: pop {r11, pc} +; +; CHECK-FP16-LABEL: testmsxh_builtin: +; CHECK-FP16: @ %bb.0: @ %entry +; CHECK-FP16-NEXT: .save {r11, lr} +; CHECK-FP16-NEXT: push {r11, lr} +; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-FP16-NEXT: bl llroundf +; CHECK-FP16-NEXT: pop {r11, pc} +entry: + %0 = tail call i64 @llvm.llround.i64.f16(half %x) + ret i64 %0 +} -; SOFTFP-LABEL: testmsxs_builtin: -; SOFTFP: bl llroundf -; HARDFP-LABEL: testmsxs_builtin: -; HARDFP: bl llroundf define i64 @testmsxs_builtin(float %x) { +; CHECK-LABEL: testmsxs_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl llroundf +; CHECK-NEXT: pop {r11, pc} entry: - %0 = tail call i64 @llvm.llround.f32(float %x) + %0 = tail call i64 @llvm.llround.i64.f32(float %x) ret i64 %0 } -; SOFTFP-LABEL: testmsxd_builtin: -; SOFTFP: bl llround -; HARDFP-LABEL: testmsxd_builtin: -; HARDFP: bl llround define i64 @testmsxd_builtin(double %x) { +; CHECK-LABEL: testmsxd_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl llround +; CHECK-NEXT: pop {r11, pc} entry: - %0 = tail call i64 @llvm.llround.f64(double %x) + %0 = tail call i64 @llvm.llround.i64.f64(double %x) ret i64 %0 } -declare i64 @llvm.llround.f32(float) nounwind readnone -declare i64 @llvm.llround.f64(double) nounwind readnone +define i64 @testmsxq_builtin(fp128 %x) { +; CHECK-LABEL: testmsxq_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl llroundl +; CHECK-NEXT: pop {r11, pc} +entry: + %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x) + ret i64 %0 +} diff --git a/llvm/test/CodeGen/ARM/lround-conv.ll b/llvm/test/CodeGen/ARM/lround-conv.ll index 3aaed74..03f7a0d 100644 --- a/llvm/test/CodeGen/ARM/lround-conv.ll +++ b/llvm/test/CodeGen/ARM/lround-conv.ll @@ -1,25 +1,47 @@ -; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP -; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT +; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16 +; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FPv8 +; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16 + +;define i32 @testmswh_builtin(half %x) { 
+;entry: +; %0 = tail call i32 @llvm.lround.i32.f16(half %x) +; ret i32 %0 +;} -; SOFTFP-LABEL: testmsws_builtin: -; SOFTFP: bl lroundf -; HARDFP-LABEL: testmsws_builtin: -; HARDFP: bl lroundf define i32 @testmsws_builtin(float %x) { +; CHECK-LABEL: testmsws_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: b lroundf entry: %0 = tail call i32 @llvm.lround.i32.f32(float %x) ret i32 %0 } -; SOFTFP-LABEL: testmswd_builtin: -; SOFTFP: bl lround -; HARDFP-LABEL: testmswd_builtin: -; HARDFP: bl lround define i32 @testmswd_builtin(double %x) { +; CHECK-LABEL: testmswd_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: b lround entry: %0 = tail call i32 @llvm.lround.i32.f64(double %x) ret i32 %0 } -declare i32 @llvm.lround.i32.f32(float) nounwind readnone -declare i32 @llvm.lround.i32.f64(double) nounwind readnone +define i32 @testmswq_builtin(fp128 %x) { +; CHECK-LABEL: testmswq_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl lroundl +; CHECK-NEXT: pop {r11, pc} +entry: + %0 = tail call i32 @llvm.lround.i32.f128(fp128 %x) + ret i32 %0 +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-FP16: {{.*}} +; CHECK-FPv8: {{.*}} +; CHECK-NOFP16: {{.*}} +; CHECK-SOFT: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll index e761d3a..33b89a4 100644 --- a/llvm/test/CodeGen/RISCV/pr69586.ll +++ b/llvm/test/CodeGen/RISCV/pr69586.ll @@ -39,119 +39,118 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a2, a2, 1 ; NOREMAT-NEXT: sub sp, sp, a2 ; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb -; NOREMAT-NEXT: mv a7, a0 -; NOREMAT-NEXT: li a0, 32 -; NOREMAT-NEXT: addi a5, a7, 512 -; NOREMAT-NEXT: addi a4, a7, 1024 -; NOREMAT-NEXT: addi a6, a7, 1536 -; NOREMAT-NEXT: li t4, 1 -; NOREMAT-NEXT: li a2, 5 +; NOREMAT-NEXT: li a7, 32 +; NOREMAT-NEXT: addi s10, a0, 512 +; NOREMAT-NEXT: addi a4, a0, 1024 +; NOREMAT-NEXT: addi a6, a0, 1536 +; NOREMAT-NEXT: li t0, 1 +; NOREMAT-NEXT: li a3, 5 ; NOREMAT-NEXT: li t1, 3 -; NOREMAT-NEXT: li t0, 7 -; NOREMAT-NEXT: lui t5, 1 +; NOREMAT-NEXT: li a2, 7 +; NOREMAT-NEXT: lui t2, 1 ; NOREMAT-NEXT: li s4, 9 ; NOREMAT-NEXT: li s6, 11 ; NOREMAT-NEXT: li s9, 13 ; NOREMAT-NEXT: li ra, 15 -; NOREMAT-NEXT: lui t2, 2 +; NOREMAT-NEXT: lui a5, 2 ; NOREMAT-NEXT: lui s1, 3 ; NOREMAT-NEXT: lui t3, 4 ; NOREMAT-NEXT: lui s0, 5 ; NOREMAT-NEXT: lui s3, 6 ; NOREMAT-NEXT: lui s7, 7 -; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; NOREMAT-NEXT: slli t4, t4, 11 -; NOREMAT-NEXT: sd t4, 512(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli a3, a2, 9 -; NOREMAT-NEXT: sd a3, 504(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; NOREMAT-NEXT: slli t0, t0, 11 +; NOREMAT-NEXT: sd t0, 512(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli t4, a3, 9 +; NOREMAT-NEXT: sd t4, 504(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: slli t6, t1, 10 -; NOREMAT-NEXT: slli s2, t0, 9 -; NOREMAT-NEXT: add a0, a7, t5 +; NOREMAT-NEXT: slli s2, a2, 9 +; NOREMAT-NEXT: add a7, a0, t2 ; NOREMAT-NEXT: lui s11, 1 ; NOREMAT-NEXT: slli s4, s4, 9 -; NOREMAT-NEXT: slli s5, a2, 10 +; NOREMAT-NEXT: slli s5, a3, 10 ; NOREMAT-NEXT: slli s6, s6, 9 ; NOREMAT-NEXT: slli s8, t1, 11 -; NOREMAT-NEXT: vle32.v v8, (a5) +; NOREMAT-NEXT: vle32.v v8, (s10) ; NOREMAT-NEXT: slli s9, s9, 9 ; NOREMAT-NEXT: li t5, 13 ; NOREMAT-NEXT: vle32.v v10, (a4) ; 
NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: slli s10, t0, 10 +; NOREMAT-NEXT: slli s10, a2, 10 ; NOREMAT-NEXT: vle32.v v0, (a6) ; NOREMAT-NEXT: vle32.v v12, (a6) ; NOREMAT-NEXT: slli ra, ra, 9 -; NOREMAT-NEXT: vle32.v v4, (a0) -; NOREMAT-NEXT: vle32.v v20, (a0) -; NOREMAT-NEXT: add a4, a7, t2 +; NOREMAT-NEXT: vle32.v v4, (a7) +; NOREMAT-NEXT: vle32.v v20, (a7) +; NOREMAT-NEXT: add a4, a0, a5 ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: add a4, a7, s1 +; NOREMAT-NEXT: add a4, a0, s1 ; NOREMAT-NEXT: vle32.v v28, (a4) ; NOREMAT-NEXT: vle32.v v26, (a4) -; NOREMAT-NEXT: add a4, a7, t3 +; NOREMAT-NEXT: add a4, a0, t3 ; NOREMAT-NEXT: vle32.v v24, (a4) ; NOREMAT-NEXT: vle32.v v22, (a4) -; NOREMAT-NEXT: add a4, a7, s0 -; NOREMAT-NEXT: vle32.v v14, (a7) +; NOREMAT-NEXT: add a4, a0, s0 +; NOREMAT-NEXT: vle32.v v14, (a0) ; NOREMAT-NEXT: vle32.v v18, (a4) ; NOREMAT-NEXT: vle32.v v16, (a4) -; NOREMAT-NEXT: add a4, a7, s3 +; NOREMAT-NEXT: add a4, a0, s3 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8 ; NOREMAT-NEXT: vle32.v v14, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: addi a0, sp, 640 -; NOREMAT-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, t4 +; NOREMAT-NEXT: addi a4, sp, 640 +; NOREMAT-NEXT: vs2r.v v8, (a4) # vscale x 16-byte Folded Spill +; NOREMAT-NEXT: add a4, a0, t0 ; NOREMAT-NEXT: vle32.v v10, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, a3 +; NOREMAT-NEXT: add a4, a0, t4 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, t6 +; NOREMAT-NEXT: add a4, a0, t6 ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, s2 +; NOREMAT-NEXT: add a4, a0, s2 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s7 +; NOREMAT-NEXT: add a4, a0, s7 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, s4 +; NOREMAT-NEXT: add a4, a0, s4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s5 +; NOREMAT-NEXT: add a4, a0, s5 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, s6 +; NOREMAT-NEXT: add a4, a0, s6 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s8 +; NOREMAT-NEXT: add a4, a0, s8 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, s9 +; NOREMAT-NEXT: add a4, a0, s9 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s10 +; NOREMAT-NEXT: add a4, a0, s10 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, ra +; NOREMAT-NEXT: add a4, a0, ra ; NOREMAT-NEXT: vle32.v v2, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: lui t4, 8 -; NOREMAT-NEXT: add a5, a7, t4 +; NOREMAT-NEXT: add a5, a0, t4 ; NOREMAT-NEXT: vle32.v v20, (a5) ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2 @@ -159,14 
+158,14 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a4, a4, 9 ; NOREMAT-NEXT: li s1, 17 ; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6 ; NOREMAT-NEXT: li a5, 9 ; NOREMAT-NEXT: slli a4, a5, 10 ; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 @@ -174,256 +173,257 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a4, a4, 9 ; NOREMAT-NEXT: li t2, 19 ; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: slli a3, a2, 11 +; NOREMAT-NEXT: slli a3, a3, 11 ; NOREMAT-NEXT: sd a3, 600(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li s7, 21 ; NOREMAT-NEXT: slli a3, s7, 9 ; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 ; NOREMAT-NEXT: li a6, 11 ; NOREMAT-NEXT: slli a3, a6, 10 ; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 ; NOREMAT-NEXT: li s3, 23 ; NOREMAT-NEXT: slli a3, s3, 9 ; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 ; NOREMAT-NEXT: li s0, 25 ; NOREMAT-NEXT: slli a3, s0, 9 ; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 ; NOREMAT-NEXT: slli a3, t5, 10 ; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28 ; NOREMAT-NEXT: li t3, 27 ; NOREMAT-NEXT: slli a3, t3, 9 ; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v28, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) -; NOREMAT-NEXT: slli a2, t0, 11 +; NOREMAT-NEXT: slli a2, a2, 11 ; NOREMAT-NEXT: sd a2, 544(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li t0, 29 ; NOREMAT-NEXT: slli a2, t0, 9 ; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28 -; NOREMAT-NEXT: li a3, 15 -; NOREMAT-NEXT: slli a2, a3, 10 +; NOREMAT-NEXT: li 
a7, 15 +; NOREMAT-NEXT: slli a2, a7, 10 ; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 ; NOREMAT-NEXT: li t1, 31 ; NOREMAT-NEXT: slli a2, t1, 9 ; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8 -; NOREMAT-NEXT: lui a4, 4 -; NOREMAT-NEXT: addi a0, a4, 512 -; NOREMAT-NEXT: sd a0, 496(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a7, a0 -; NOREMAT-NEXT: vle32.v v8, (a0) -; NOREMAT-NEXT: vle32.v v26, (a0) +; NOREMAT-NEXT: lui a3, 4 +; NOREMAT-NEXT: addi a2, a3, 512 +; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28 ; NOREMAT-NEXT: slli a2, s1, 10 ; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: addi a2, a4, 1536 +; NOREMAT-NEXT: addi a2, a3, 1536 ; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: lui a4, 4 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: slli a2, a5, 11 ; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8 ; NOREMAT-NEXT: lui a5, 5 ; NOREMAT-NEXT: addi a2, a5, -1536 ; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28 ; NOREMAT-NEXT: slli a2, t2, 10 ; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li t2, 19 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: li a3, 19 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 ; NOREMAT-NEXT: addi a2, a5, -512 ; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24 ; NOREMAT-NEXT: addi a2, a5, 512 ; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: slli a2, s7, 10 ; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26 ; NOREMAT-NEXT: addi a2, a5, 1536 ; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: slli a2, a6, 11 ; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12 
-; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18 ; NOREMAT-NEXT: lui a6, 6 ; NOREMAT-NEXT: addi a2, a6, -1536 ; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: slli a2, s3, 10 ; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 ; NOREMAT-NEXT: addi a2, a6, -512 ; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22 ; NOREMAT-NEXT: addi a2, a6, 512 ; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: slli a2, s0, 10 ; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18 ; NOREMAT-NEXT: addi a2, a6, 1536 ; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: slli a2, t5, 11 ; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8 ; NOREMAT-NEXT: lui s0, 7 ; NOREMAT-NEXT: addi a2, s0, -1536 ; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: slli a2, t3, 10 ; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addi a0, sp, 640 -; NOREMAT-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload +; NOREMAT-NEXT: addi a2, sp, 640 +; NOREMAT-NEXT: vl2r.v v12, (a2) # vscale x 16-byte Folded Reload ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22 ; NOREMAT-NEXT: addi a2, s0, -512 ; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26 ; NOREMAT-NEXT: addi a2, s0, 512 ; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui t3, 7 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: slli a2, t0, 10 ; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16 ; NOREMAT-NEXT: 
addi a2, t3, 1536 ; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: slli a2, a3, 11 +; NOREMAT-NEXT: slli a2, a7, 11 ; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14 ; NOREMAT-NEXT: addi a2, t4, -1536 ; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: slli a2, t1, 10 ; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addi a0, t4, -512 -; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a7, a0 +; NOREMAT-NEXT: addi a2, t4, -512 +; NOREMAT-NEXT: sd a2, 280(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a0, a2 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 ; NOREMAT-NEXT: vle32.v v12, (a0) ; NOREMAT-NEXT: vle32.v v0, (a0) @@ -476,7 +476,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: addi s11, a0, 512 ; NOREMAT-NEXT: addi s7, a0, 1024 ; NOREMAT-NEXT: addi s3, a0, 1536 -; NOREMAT-NEXT: slli s1, t2, 11 +; NOREMAT-NEXT: slli s1, a3, 11 ; NOREMAT-NEXT: lui a0, 10 ; NOREMAT-NEXT: addi t2, a0, -1536 ; NOREMAT-NEXT: addi a7, a0, -1024 diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll index 34d4657..c68fa59 100644 --- a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll +++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll @@ -78,3 +78,10 @@ define i64 @test0(i64 %n, ptr %p) nounwind { ret i64 %ret } + +; Check for the explicitly emitted .note.GNU-stack section (ELF only) in the +; presence of trampolines. 
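+; The "x" section flag requests SHF_EXECINSTR, i.e. an executable stack, which
+; the backend must advertise here because trampolines are materialized as code
+; written into a stack allocation at run time. The UTC_ARGS markers below fence
+; these hand-written checks off from update_llc_test_checks.py. As a minimal
+; sketch, the IR pattern that triggers this looks like the following
+; (hypothetical @f and @make_tramp, not part of this test):
+;   declare i64 @f(ptr nest, i64)
+;   define ptr @make_tramp(ptr %env) {
+;     ; space for the target's trampoline code, filled in at run time
+;     %tramp = alloca [32 x i8], align 8
+;     call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %env)
+;     %p = call ptr @llvm.adjust.trampoline(ptr %tramp)
+;     ret ptr %p
+;   }
+;   declare void @llvm.init.trampoline(ptr, ptr, ptr)
+;   declare ptr @llvm.adjust.trampoline(ptr)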
+; UTC_ARGS: --disable +; RV64-LINUX: .section ".note.GNU-stack","x",@progbits +; RV64: .section ".note.GNU-stack","x",@progbits +; UTC_ARGS: --enable diff --git a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll index 9798077..e3ed31f 100644 --- a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll +++ b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll @@ -163,9 +163,9 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 { ; LINUX-NEXT: ld %f10, 8(%r3) ; LINUX-NEXT: ld %f0, 16(%r3) ; LINUX-NEXT: ld %f2, 24(%r3) -; LINUX-NEXT: la %r3, 16(%r2) -; LINUX-NEXT: la %r4, 48(%r2) ; LINUX-NEXT: la %r2, 176(%r15) +; LINUX-NEXT: la %r3, 16(%r13) +; LINUX-NEXT: la %r4, 48(%r13) ; LINUX-NEXT: std %f0, 176(%r15) ; LINUX-NEXT: std %f2, 184(%r15) ; LINUX-NEXT: brasl %r14, sincosl@PLT diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index 6f986ce..c418038 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -541,11 +541,11 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(ptr nocaptur ; CHECK-NEXT: cbz r2, .LBB7_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: add.w r12, r4, r3, lsr #3 ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov r4, r1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll index 4020709..fe06601 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll @@ -16,39 +16,40 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture ; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: ldrsh.w r7, [r2] ; CHECK-NEXT: cmp r7, #1 -; CHECK-NEXT: blt.w .LBB0_6 +; CHECK-NEXT: blt .LBB0_6 ; CHECK-NEXT: @ %bb.2: @ %for.cond3.preheader.us.preheader -; CHECK-NEXT: movs r2, #252 ; CHECK-NEXT: ldr r4, [sp, #152] +; CHECK-NEXT: movs r2, #252 ; CHECK-NEXT: and.w r6, r2, r3, lsr #3 ; CHECK-NEXT: movs r2, #120 ; CHECK-NEXT: and.w r5, r2, r3, lsr #9 ; CHECK-NEXT: lsls r3, r3, #3 -; CHECK-NEXT: uxtb r3, r3 ; CHECK-NEXT: muls r6, r4, r6 +; CHECK-NEXT: uxtb r3, r3 ; CHECK-NEXT: rsb.w r2, r4, #256 -; CHECK-NEXT: vmov.i16 q2, #0xfc +; CHECK-NEXT: vmov.i16 q1, #0xfc +; CHECK-NEXT: vdup.16 q0, r6 ; CHECK-NEXT: mul lr, r5, r4 -; CHECK-NEXT: vdup.16 q4, r6 ; CHECK-NEXT: mov.w r6, #2016 -; CHECK-NEXT: vmov.i16 q6, #0xf8 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: mul r5, r3, r4 ; CHECK-NEXT: adds r3, r7, #7 +; CHECK-NEXT: vdup.16 q0, r6 ; CHECK-NEXT: bic r3, r3, #7 -; CHECK-NEXT: vdup.16 q3, lr +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vdup.16 q0, r5 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vdup.16 q0, lr ; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: vdup.16 q0, r5 -; CHECK-NEXT: lsls r1, r1, #1 +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: add.w r3, r4, r3, lsr #3 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q0, #0xf800 +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload 
+; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: lsls r1, r1, #1 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vdup.16 q5, r6 -; CHECK-NEXT: vmov.i16 q7, #0x78 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.i16 q4, #0xf8 ; CHECK-NEXT: .LBB0_3: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB0_4 Depth 2 @@ -59,37 +60,31 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture ; CHECK-NEXT: @ Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vctp.16 r6 -; CHECK-NEXT: subs r6, #8 +; CHECK-NEXT: vmov.i16 q5, #0xf800 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrht.u16 q0, [r5] -; CHECK-NEXT: vshr.u16 q1, q0, #3 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmla.i16 q2, q1, r2 -; CHECK-NEXT: vshr.u16 q1, q2, #5 -; CHECK-NEXT: vshl.i16 q2, q0, #3 -; CHECK-NEXT: vand q3, q1, q5 -; CHECK-NEXT: vmov q1, q7 -; CHECK-NEXT: vand q2, q2, q6 -; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: subs r6, #8 +; CHECK-NEXT: vshr.u16 q3, q0, #3 +; CHECK-NEXT: vand q3, q3, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmla.i16 q1, q3, r2 +; CHECK-NEXT: vshl.i16 q3, q0, #3 +; CHECK-NEXT: vand q3, q3, q4 +; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vshr.u16 q1, q1, #5 +; CHECK-NEXT: vmla.i16 q4, q3, r2 +; CHECK-NEXT: vshr.u16 q3, q4, #11 +; CHECK-NEXT: vand q1, q1, q7 +; CHECK-NEXT: vorr q1, q1, q3 ; CHECK-NEXT: vshr.u16 q0, q0, #9 -; CHECK-NEXT: vmla.i16 q4, q2, r2 -; CHECK-NEXT: vshr.u16 q2, q4, #11 -; CHECK-NEXT: vmov q4, q5 -; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vmov q6, q7 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vorr q1, q3, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmla.i16 q2, q0, r2 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.i16 q3, #0x78 +; CHECK-NEXT: vmov.i16 q4, #0xf8 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmla.i16 q3, q0, r2 +; CHECK-NEXT: vand q0, q3, q5 ; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: vmov.i16 q1, #0xfc ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrht.16 q0, [r5], #16 ; CHECK-NEXT: le lr, .LBB0_4 @@ -190,7 +185,7 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrsh.w r12, [r2, #2] ; CHECK-NEXT: cmp.w r12, #1 -; CHECK-NEXT: blt.w .LBB1_7 +; CHECK-NEXT: blt .LBB1_7 ; CHECK-NEXT: @ %bb.1: @ %for.cond3.preheader.lr.ph ; CHECK-NEXT: ldrsh.w r2, [r2] ; CHECK-NEXT: cmp r2, #1 @@ -200,71 +195,70 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: sub sp, #80 -; CHECK-NEXT: ldr r7, [sp, #168] +; CHECK-NEXT: ldr r7, [sp, #88] ; CHECK-NEXT: movs r5, #120 ; CHECK-NEXT: lsls r6, r3, #3 ; CHECK-NEXT: movs r4, #252 ; CHECK-NEXT: and.w r5, r5, r3, lsr #9 ; CHECK-NEXT: uxtb r6, r6 ; CHECK-NEXT: and.w r3, r4, r3, lsr #3 +; CHECK-NEXT: adds r4, r2, #7 ; CHECK-NEXT: muls r6, r7, r6 +; CHECK-NEXT: bic r4, r4, #7 ; CHECK-NEXT: mul lr, r3, r7 -; CHECK-NEXT: vdup.16 q0, r6 -; 
CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vdup.16 q0, lr ; CHECK-NEXT: muls r5, r7, r5 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q0, #0xfc -; CHECK-NEXT: mov.w r6, #2016 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vdup.16 q0, r5 ; CHECK-NEXT: rsb.w r3, r7, #256 ; CHECK-NEXT: lsls r7, r1, #1 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vdup.16 q0, r6 +; CHECK-NEXT: sub.w r1, r4, #8 +; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: vmov.i16 q2, #0xf8 -; CHECK-NEXT: vmov.i16 q5, #0x78 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q6, #0xf800 +; CHECK-NEXT: add.w r1, r4, r1, lsr #3 +; CHECK-NEXT: vdup.16 q6, r6 +; CHECK-NEXT: mov.w r6, #2016 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vdup.16 q3, lr +; CHECK-NEXT: vdup.16 q5, r5 +; CHECK-NEXT: vdup.16 q7, r6 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB1_3: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_4 Depth 2 ; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: dlstp.16 lr, r2 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrh.u16 q0, [r5] +; CHECK-NEXT: vctp.16 r6 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r5] ; CHECK-NEXT: vshl.i16 q1, q0, #3 -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: subs r6, #8 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmla.i16 q3, q1, r3 -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.f64 d9, d5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vshr.u16 q2, q0, #9 +; CHECK-NEXT: vmov.i16 q2, #0x78 +; CHECK-NEXT: vshr.u16 q4, q0, #9 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov q2, q6 +; CHECK-NEXT: vmla.i16 q2, q1, r3 ; CHECK-NEXT: vshr.u16 q0, q0, #3 +; CHECK-NEXT: vmov.i16 q1, #0xfc ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: vmla.i16 q1, q0, r3 -; CHECK-NEXT: vand q2, q2, q5 -; CHECK-NEXT: vshr.u16 q0, q3, #11 -; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vshr.u16 q0, q2, #11 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmla.i16 q2, q4, r3 ; CHECK-NEXT: vshr.u16 q1, q1, #5 -; CHECK-NEXT: vmla.i16 q3, q2, r3 +; CHECK-NEXT: vmov.i16 q4, #0xf800 ; CHECK-NEXT: vand q1, q1, q7 ; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vand q1, q3, q6 +; CHECK-NEXT: vand q1, q2, q4 +; CHECK-NEXT: vmov.i16 q2, #0xf8 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vstrh.16 q0, [r5], #16 -; CHECK-NEXT: vmov.f64 d4, d8 -; CHECK-NEXT: vmov.f64 d5, d9 -; CHECK-NEXT: letp lr, .LBB1_4 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r5], #16 +; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1 ; CHECK-NEXT: adds r4, #1 @@ -272,7 +266,6 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK-NEXT: cmp r4, r12 ; CHECK-NEXT: bne .LBB1_3 ; CHECK-NEXT: @ %bb.6: -; CHECK-NEXT: add sp, #80 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, lr} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 
07c06e1..1769c5d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -17,17 +17,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr { ; ENABLED-LABEL: varying_outer_2d_reduction: ; ENABLED: @ %bb.0: @ %entry -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; ENABLED-NEXT: sub sp, #4 ; ENABLED-NEXT: cmp r3, #1 -; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill -; ENABLED-NEXT: blt .LBB0_8 -; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph -; ENABLED-NEXT: ldr r0, [sp, #36] -; ENABLED-NEXT: add.w r12, r2, #3 -; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; ENABLED-NEXT: mov.w r8, #0 -; ENABLED-NEXT: mov r9, r12 +; ENABLED-NEXT: it lt +; ENABLED-NEXT: bxlt lr +; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: mov r11, r0 +; ENABLED-NEXT: ldr r0, [sp, #32] +; ENABLED-NEXT: add.w r9, r2, #3 +; ENABLED-NEXT: mov.w r12, #0 +; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 ; ENABLED-NEXT: b .LBB0_4 @@ -37,31 +36,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: lsrs r0, r0, #16 ; ENABLED-NEXT: sub.w r9, r9, #1 -; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1] -; ENABLED-NEXT: add.w r8, r8, #1 +; ENABLED-NEXT: strh.w r0, [r1, r12, lsl #1] +; ENABLED-NEXT: add.w r12, r12, #1 ; ENABLED-NEXT: add.w r10, r10, #2 -; ENABLED-NEXT: cmp r8, r3 +; ENABLED-NEXT: cmp r12, r3 ; ENABLED-NEXT: beq .LBB0_8 ; ENABLED-NEXT: .LBB0_4: @ %for.body ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 ; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2 -; ENABLED-NEXT: cmp r2, r8 +; ENABLED-NEXT: cmp r2, r12 ; ENABLED-NEXT: ble .LBB0_2 ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: sub.w r4, r2, r8 +; ENABLED-NEXT: sub.w r4, r2, r12 ; ENABLED-NEXT: vmov.i32 q1, #0x0 ; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 -; ENABLED-NEXT: sub.w r0, r12, r8 +; ENABLED-NEXT: adds r0, r2, #3 +; ENABLED-NEXT: sub.w r0, r0, r12 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 ; ENABLED-NEXT: dls lr, r0 -; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload +; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 @@ -82,23 +82,22 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: vpsel q0, q1, q0 ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 -; ENABLED-NEXT: .LBB0_8: @ %for.end17 -; ENABLED-NEXT: add sp, #4 -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; ENABLED-NEXT: .LBB0_8: +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: bx lr ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: ; NOREDUCTIONS: @ %bb.0: @ %entry -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; NOREDUCTIONS-NEXT: sub sp, #4 ; NOREDUCTIONS-NEXT: cmp r3, #1 -; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill -; NOREDUCTIONS-NEXT: blt .LBB0_8 -; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: ldr r0, 
[sp, #36] -; NOREDUCTIONS-NEXT: add.w r12, r2, #3 -; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; NOREDUCTIONS-NEXT: mov.w r8, #0 -; NOREDUCTIONS-NEXT: mov r9, r12 +; NOREDUCTIONS-NEXT: it lt +; NOREDUCTIONS-NEXT: bxlt lr +; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: mov r11, r0 +; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] +; NOREDUCTIONS-NEXT: add.w r9, r2, #3 +; NOREDUCTIONS-NEXT: mov.w r12, #0 +; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 ; NOREDUCTIONS-NEXT: b .LBB0_4 @@ -108,31 +107,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: lsrs r0, r0, #16 ; NOREDUCTIONS-NEXT: sub.w r9, r9, #1 -; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1] -; NOREDUCTIONS-NEXT: add.w r8, r8, #1 +; NOREDUCTIONS-NEXT: strh.w r0, [r1, r12, lsl #1] +; NOREDUCTIONS-NEXT: add.w r12, r12, #1 ; NOREDUCTIONS-NEXT: add.w r10, r10, #2 -; NOREDUCTIONS-NEXT: cmp r8, r3 +; NOREDUCTIONS-NEXT: cmp r12, r3 ; NOREDUCTIONS-NEXT: beq .LBB0_8 ; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 -; NOREDUCTIONS-NEXT: cmp r2, r8 +; NOREDUCTIONS-NEXT: cmp r2, r12 ; NOREDUCTIONS-NEXT: ble .LBB0_2 ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: sub.w r4, r2, r8 +; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 ; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: sub.w r0, r12, r8 +; NOREDUCTIONS-NEXT: adds r0, r2, #3 +; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 ; NOREDUCTIONS-NEXT: dls lr, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload +; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 @@ -153,9 +153,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: vpsel q0, q1, q0 ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 -; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 -; NOREDUCTIONS-NEXT: add sp, #4 -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; NOREDUCTIONS-NEXT: .LBB0_8: +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: bx lr entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll index e0a61b1..78dc35b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll @@ -49,18 +49,17 @@ define i32 @vcmp_new_vpst_combination(i32 %len, ptr nocapture readonly %arr) { ; CHECK-NEXT: cmp r0, #1 ; CHECK-NEXT: blt .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vmov.i32 q1, #0x1 +; CHECK-NEXT: vmov.i32 q0, #0x1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.32 lr, r0 ; CHECK-NEXT: .LBB1_2: @ %vector.body 
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q2, q1 -; CHECK-NEXT: vaddva.u32 r2, q2 +; CHECK-NEXT: vmovt q1, q0 +; CHECK-NEXT: vaddva.u32 r2, q1 ; CHECK-NEXT: letp lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index c8dd949..a904347 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -993,10 +993,10 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, #20 ; CHECK-NEXT: cmp r3, #8 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: blo.w .LBB16_12 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: lsrs.w r12, r3, #2 @@ -1016,50 +1016,48 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r1, r7, #2 ; CHECK-NEXT: rsbs r7, r4, #0 -; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: add.w r7, r3, #16 -; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: b .LBB16_6 ; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r5, r0, lsl #1 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add.w r6, r6, r0, lsl #1 ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_4: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: wls lr, r0, .LBB16_5 ; CHECK-NEXT: b .LBB16_10 ; CHECK-NEXT: .LBB16_5: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #8 -; CHECK-NEXT: add.w r0, r5, r0, lsl #1 +; CHECK-NEXT: add.w r0, r6, r0, lsl #1 ; CHECK-NEXT: add.w r5, r0, #8 ; CHECK-NEXT: beq.w .LBB16_12 ; CHECK-NEXT: .LBB16_6: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_8 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_11 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldrh.w lr, [r3, #14] ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 -; CHECK-NEXT: ldrh.w r8, [r3, #12] +; CHECK-NEXT: ldrh.w r10, [r3, #12] ; CHECK-NEXT: ldrh r7, [r3, #10] ; CHECK-NEXT: ldrh r4, [r3, #8] ; CHECK-NEXT: ldrh r6, [r3, #6] ; CHECK-NEXT: ldrh.w r9, [r3, #4] ; CHECK-NEXT: ldrh.w r11, [r3, #2] -; CHECK-NEXT: ldrh.w r10, [r3] +; CHECK-NEXT: ldrh.w r8, [r3] ; CHECK-NEXT: vstrb.8 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; 
CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT: adds r0, r5, #2
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmul.f16 q0, q0, r10
+; CHECK-NEXT: vmul.f16 q0, q0, r8
 ; CHECK-NEXT: adds r0, r5, #6
 ; CHECK-NEXT: vfma.f16 q0, q1, r11
 ; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
@@ -1068,73 +1066,73 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT: add.w r0, r5, #10
 ; CHECK-NEXT: vfma.f16 q0, q1, r6
 ; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT: add.w r6, r5, #16
 ; CHECK-NEXT: vfma.f16 q0, q1, r4
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
 ; CHECK-NEXT: add.w r0, r5, #14
 ; CHECK-NEXT: vfma.f16 q0, q1, r7
 ; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
-; CHECK-NEXT: adds r5, #16
-; CHECK-NEXT: vfma.f16 q0, q1, r8
+; CHECK-NEXT: vfma.f16 q0, q1, r10
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: vfma.f16 q0, q1, lr
 ; CHECK-NEXT: cmp r0, #16
 ; CHECK-NEXT: blo .LBB16_9
 ; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r5, r3, #16
 ; CHECK-NEXT: dls lr, r0
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT: .LBB16_8: @ %for.body
 ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r0, [r6], #16
-; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: adds r4, r5, #2
+; CHECK-NEXT: ldrh r0, [r5], #16
+; CHECK-NEXT: vldrw.u32 q1, [r6]
+; CHECK-NEXT: adds r4, r6, #2
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
 ; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-14]
-; CHECK-NEXT: adds r4, r5, #6
+; CHECK-NEXT: ldrh r0, [r5, #-14]
+; CHECK-NEXT: adds r4, r6, #6
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-12]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
+; CHECK-NEXT: ldrh r0, [r5, #-12]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #4]
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
 ; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-10]
-; CHECK-NEXT: add.w r4, r5, #10
+; CHECK-NEXT: ldrh r0, [r5, #-10]
+; CHECK-NEXT: add.w r4, r6, #10
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-8]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT: ldrh r0, [r5, #-8]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #8]
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
 ; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-6]
-; CHECK-NEXT: ldrh r4, [r6, #-2]
+; CHECK-NEXT: ldrh r0, [r5, #-6]
+; CHECK-NEXT: ldrh r4, [r5, #-2]
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-4]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
+; CHECK-NEXT: ldrh r0, [r5, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #12]
 ; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: add.w r0, r5, #14
+; CHECK-NEXT: add.w r0, r6, #14
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: adds r5, #16
+; CHECK-NEXT: adds r6, #16
 ; CHECK-NEXT: vfma.f16 q0, q1, r4
 ; CHECK-NEXT: le lr, .LBB16_8
 ; CHECK-NEXT: b .LBB16_4
 ; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add.w r5, r3, #16
 ; CHECK-NEXT: b .LBB16_4
 ; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r0, r6
 ; CHECK-NEXT: .LBB16_11: @ %while.body76
 ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r4, [r6], #2
+; CHECK-NEXT: ldrh r4, [r5], #2
 ; CHECK-NEXT: vldrh.u16 q1, [r0], #2
 ; CHECK-NEXT: vfma.f16 q0, q1, r4
 ; CHECK-NEXT: le lr, .LBB16_11
 ; CHECK-NEXT: b .LBB16_3
 ; CHECK-NEXT: .LBB16_12: @ %if.end
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #20
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
 %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 28166e4..f7b4548 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -995,46 +995,44 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
 ; CHECK-NEXT: ldrh r6, [r0]
-; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: ldrd r4, r10, [r0, #4]
+; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: ldrd r7, r10, [r0, #4]
 ; CHECK-NEXT: sub.w r0, r6, #8
 ; CHECK-NEXT: add.w r3, r0, r0, lsr #29
 ; CHECK-NEXT: and r0, r0, #7
-; CHECK-NEXT: asrs r7, r3, #3
-; CHECK-NEXT: cmp r7, #1
+; CHECK-NEXT: asrs r5, r3, #3
+; CHECK-NEXT: cmp r5, #1
 ; CHECK-NEXT: it gt
-; CHECK-NEXT: asrgt r5, r3, #3
-; CHECK-NEXT: add.w r3, r4, r6, lsl #2
+; CHECK-NEXT: asrgt r4, r3, #3
+; CHECK-NEXT: add.w r3, r7, r6, lsl #2
 ; CHECK-NEXT: sub.w r9, r3, #4
 ; CHECK-NEXT: rsbs r3, r6, #0
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: add.w r3, r10, #32
-; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r4, [sp] @ 4-byte Spill
+; CHECK-NEXT: str r6, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: b .LBB16_6
 ; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add.w r4, r4, r0, lsl #2
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add.w r7, r7, r0, lsl #2
 ; CHECK-NEXT: b .LBB16_5
 ; CHECK-NEXT: .LBB16_4: @ %for.end
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldrd r0, r9, [sp, #12] @ 8-byte Folded Reload
 ; CHECK-NEXT: wls lr, r0, .LBB16_5
 ; CHECK-NEXT: b .LBB16_10
 ; CHECK-NEXT: .LBB16_5: @ %while.end
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT: subs.w r12, r12, #1
 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
-; CHECK-NEXT: add.w r0, r4, r0, lsl #2
-; CHECK-NEXT: add.w r4, r0, #16
+; CHECK-NEXT: add.w r0, r7, r0, lsl #2
+; CHECK-NEXT: add.w r7, r0, #16
 ; CHECK-NEXT: beq .LBB16_12
 ; CHECK-NEXT: .LBB16_6: @ %while.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -1042,76 +1040,76 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
 ; CHECK-NEXT: add.w lr, r10, #8
 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
-; CHECK-NEXT: ldrd r3, r7, [r10]
+; CHECK-NEXT: ldrd r3, r4, [r10]
 ; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr}
 ; CHECK-NEXT: ldrd r11, r8, [r10, #24]
 ; CHECK-NEXT: vstrb.8 q0, [r9], #16
-; CHECK-NEXT: vldrw.u32 q0, [r4], #32
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: str.w r9, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
+; CHECK-NEXT: vldrw.u32 q0, [r7], #32
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str.w r9, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-28]
 ; CHECK-NEXT: vmul.f32 q0, q0, r3
-; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
-; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
-; CHECK-NEXT: vfma.f32 q0, q1, r7
-; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
+; CHECK-NEXT: vldrw.u32 q6, [r7, #-24]
+; CHECK-NEXT: vldrw.u32 q4, [r7, #-20]
+; CHECK-NEXT: vfma.f32 q0, q1, r4
+; CHECK-NEXT: vldrw.u32 q5, [r7, #-16]
 ; CHECK-NEXT: vfma.f32 q0, q6, r0
-; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT: vldrw.u32 q2, [r7, #-12]
 ; CHECK-NEXT: vfma.f32 q0, q4, r5
-; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
+; CHECK-NEXT: vldrw.u32 q3, [r7, #-8]
 ; CHECK-NEXT: vfma.f32 q0, q5, r6
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-4]
 ; CHECK-NEXT: vfma.f32 q0, q2, lr
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: vfma.f32 q0, q3, r11
 ; CHECK-NEXT: vfma.f32 q0, q1, r8
 ; CHECK-NEXT: cmp r0, #16
 ; CHECK-NEXT: blo .LBB16_9
 ; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r10, #32
 ; CHECK-NEXT: dls lr, r0
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: .LBB16_8: @ %for.body
 ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11}
-; CHECK-NEXT: vldrw.u32 q1, [r4], #32
-; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
-; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
+; CHECK-NEXT: ldm.w r4, {r0, r3, r5, r6, r8, r11}
+; CHECK-NEXT: vldrw.u32 q1, [r7], #32
+; CHECK-NEXT: vldrw.u32 q6, [r7, #-24]
+; CHECK-NEXT: vldrw.u32 q4, [r7, #-20]
 ; CHECK-NEXT: vfma.f32 q0, q1, r0
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
-; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
-; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-28]
+; CHECK-NEXT: vldrw.u32 q5, [r7, #-16]
+; CHECK-NEXT: vldrw.u32 q2, [r7, #-12]
 ; CHECK-NEXT: vfma.f32 q0, q1, r3
-; CHECK-NEXT: ldrd r9, r1, [r7, #24]
+; CHECK-NEXT: ldrd r9, r1, [r4, #24]
 ; CHECK-NEXT: vfma.f32 q0, q6, r5
-; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
+; CHECK-NEXT: vldrw.u32 q3, [r7, #-8]
 ; CHECK-NEXT: vfma.f32 q0, q4, r6
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-4]
 ; CHECK-NEXT: vfma.f32 q0, q5, r8
-; CHECK-NEXT: adds r7, #32
+; CHECK-NEXT: adds r4, #32
 ; CHECK-NEXT: vfma.f32 q0, q2, r11
 ; CHECK-NEXT: vfma.f32 q0, q3, r9
 ; CHECK-NEXT: vfma.f32 q0, q1, r1
 ; CHECK-NEXT: le lr, .LBB16_8
 ; CHECK-NEXT: b .LBB16_4
 ; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r10, #32
 ; CHECK-NEXT: b .LBB16_4
 ; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
 ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: mov r3, r4
+; CHECK-NEXT: mov r3, r7
 ; CHECK-NEXT: .LBB16_11: @ %while.body76
 ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldr r0, [r7], #4
+; CHECK-NEXT: ldr r0, [r4], #4
 ; CHECK-NEXT: vldrw.u32 q1, [r3], #4
 ; CHECK-NEXT: vfma.f32 q0, q1, r0
 ; CHECK-NEXT: le lr, .LBB16_11
 ; CHECK-NEXT: b .LBB16_3
 ; CHECK-NEXT: .LBB16_12:
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #24
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index e8b49c1..0d86f22 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -711,8 +711,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #136
-; CHECK-NEXT: sub sp, #136
+; CHECK-NEXT: .pad #120
+; CHECK-NEXT: sub sp, #120
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill
 ; CHECK-NEXT: blt.w .LBB14_5
@@ -725,22 +725,20 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT: subs r1, #8
 ; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q2, #0x18
 ; CHECK-NEXT: add.w r1, r2, r1, lsr #3
 ; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill
 ; CHECK-NEXT: adr r1, .LCPI14_0
 ; CHECK-NEXT: adr r2, .LCPI14_1
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
 ; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
-; CHECK-NEXT: add r2, sp, #120
+; CHECK-NEXT: add r2, sp, #104
 ; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill
 ; CHECK-NEXT: .LBB14_2: @ %vector.ph
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB14_3 Depth 2
 ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, sp, #104
+; CHECK-NEXT: add.w r10, sp, #88
 ; CHECK-NEXT: dls lr, r1
 ; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload
 ; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload
@@ -762,7 +760,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT: vmov r6, r2, d4
 ; CHECK-NEXT: ldrh r1, [r1]
 ; CHECK-NEXT: ldrh.w r12, [r4]
-; CHECK-NEXT: add r4, sp, #88
+; CHECK-NEXT: add r4, sp, #72
 ; CHECK-NEXT: ldrh.w r11, [r5]
 ; CHECK-NEXT: ldrh r3, [r3]
 ; CHECK-NEXT: ldrh r5, [r6]
@@ -807,7 +805,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT: vmov.16 q3[0], r2
 ; CHECK-NEXT: vmov.16 q3[1], r5
 ; CHECK-NEXT: vmov r2, r5, d5
-; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vmov.i16 q2, #0x18
 ; CHECK-NEXT: vadd.i16 q6, q6, q2
 ; CHECK-NEXT: vadd.i16 q5, q5, q2
 ; CHECK-NEXT: vadd.i16 q4, q4, q2
@@ -849,7 +847,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT: cmp r1, r3
 ; CHECK-NEXT: bne.w .LBB14_2
 ; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #136
+; CHECK-NEXT: add sp, #120
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -950,7 +948,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT: vldrw.u32 q0, [r6]
 ; CHECK-NEXT: adr r6, .LCPI15_9
-; CHECK-NEXT: vmov.i32 q2, #0x30
 ; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT: vldrw.u32 q0, [r7]
 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
@@ -963,212 +960,213 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT: .LBB15_2: @ %vector.ph
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB15_3 Depth 2
+; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT: adr r1, .LCPI15_3
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: adr r1, .LCPI15_4
 ; CHECK-NEXT: vldrw.u32 q5, [r1]
+; CHECK-NEXT: adr r1, .LCPI15_4
+; CHECK-NEXT: vstrw.32 q2, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [r1]
 ; CHECK-NEXT: adr r1, .LCPI15_2
-; CHECK-NEXT: vldrw.u32 q3, [r1]
+; CHECK-NEXT: vstrw.32 q2, [sp, #280] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q1, [r1]
 ; CHECK-NEXT: adr r1, .LCPI15_10
-; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT: vldrw.u32 q3, [r1]
 ; CHECK-NEXT: adr r1, .LCPI15_11
 ; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT: vldrw.u32 q7, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
 ; CHECK-NEXT: mov r11, r10
-; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q0, [sp, #216] @ 16-byte Spill
 ; CHECK-NEXT: .LBB15_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vadd.i32 q4, q1, r0
-; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill
-; CHECK-NEXT: vmov r1, lr, d8
-; CHECK-NEXT: vadd.i32 q7, q7, r0
-; CHECK-NEXT: vmov r5, r4, d15
-; CHECK-NEXT: vadd.i32 q6, q0, r0
-; CHECK-NEXT: vmov r6, r7, d13
+; CHECK-NEXT: vmov q0, q7
+; CHECK-NEXT: vstrw.32 q7, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q7, q5, r0
+; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q5, q0, r0
+; CHECK-NEXT: vmov q0, q6
+; CHECK-NEXT: vadd.i32 q6, q4, r0
+; CHECK-NEXT: vmov r5, r4, d11
+; CHECK-NEXT: vmov r1, lr, d12
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vmov r6, r7, d15
 ; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vmov q1, q3
+; CHECK-NEXT: vstrw.32 q4, [sp, #168] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q4, [sp, #248] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #120] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q3, [sp, #136] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q3, [sp, #184] @ 16-byte Reload
 ; CHECK-NEXT: subs.w r11, r11, #16
-; CHECK-NEXT: ldrb.w r9, [r1]
-; CHECK-NEXT: vmov r1, r3, d14
 ; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: ldrb.w r9, [r1]
+; CHECK-NEXT: vmov r1, r3, d10
 ; CHECK-NEXT: ldrb r7, [r7]
 ; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[0], r1
+; CHECK-NEXT: vmov.8 q5[0], r1
 ; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[1], r1
-; CHECK-NEXT: vmov r1, r3, d12
-; CHECK-NEXT: vmov.8 q7[2], r5
+; CHECK-NEXT: vmov.8 q5[1], r1
+; CHECK-NEXT: vmov r1, r3, d14
+; CHECK-NEXT: vmov.8 q5[2], r5
 ; CHECK-NEXT: ldrb r5, [r6]
 ; CHECK-NEXT: ldrb r6, [r4]
-; CHECK-NEXT: vmov.8 q7[3], r6
+; CHECK-NEXT: vmov.8 q5[3], r6
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[0], r1
-; CHECK-NEXT: vmov r6, r1, d2
-; CHECK-NEXT: vmov.8 q6[1], r3
-; CHECK-NEXT: vmov.8 q6[2], r5
-; CHECK-NEXT: vmov.8 q6[3], r7
+; CHECK-NEXT: vmov.8 q7[0], r1
+; CHECK-NEXT: vmov r6, r1, d4
+; CHECK-NEXT: vmov.8 q7[1], r3
+; CHECK-NEXT: vmov.8 q7[2], r5
+; CHECK-NEXT: vmov.8 q7[3], r7
 ; CHECK-NEXT: ldrb.w r7, [lr]
-; CHECK-NEXT: vmov.8 q6[4], r9
-; CHECK-NEXT: vmov.8 q6[5], r7
+; CHECK-NEXT: vmov.8 q7[4], r9
+; CHECK-NEXT: vmov.8 q7[5], r7
 ; CHECK-NEXT: ldrb r4, [r1]
-; CHECK-NEXT: vmov r1, r5, d3
-; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload
+; CHECK-NEXT: vmov r1, r5, d5
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vldrw.u32 q1, [sp, #280] @ 16-byte Reload
 ; CHECK-NEXT: ldrb.w r12, [r1]
-; CHECK-NEXT: vmov r1, r3, d9
+; CHECK-NEXT: vmov r1, r3, d13
 ; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #232] @ 16-byte Reload
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[6], r1
-; CHECK-NEXT: vmov r1, r7, d0
-; CHECK-NEXT: vmov.8 q6[7], r3
+; CHECK-NEXT: vmov.8 q7[6], r1
+; CHECK-NEXT: vmov r1, r7, d4
+; CHECK-NEXT: vmov.8 q7[7], r3
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: vmov.8 q7[4], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[5], r7
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vmov.8 q5[4], r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vmov.8 q5[5], r7
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q7[6], r1
+; CHECK-NEXT: vmov.8 q5[6], r1
 ; CHECK-NEXT: ldrb r1, [r6]
-; CHECK-NEXT: vmov r7, r6, d0
-; CHECK-NEXT: vmov.8 q7[7], r3
-; CHECK-NEXT: vmov r3, lr, d1
-; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[8], r1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov.8 q7[9], r4
-; CHECK-NEXT: vmov r4, r1, d0
-; CHECK-NEXT: vmov.8 q7[10], r12
-; CHECK-NEXT: vmov.8 q7[11], r5
+; CHECK-NEXT: vmov.8 q5[7], r3
+; CHECK-NEXT: vmov r7, r6, d4
+; CHECK-NEXT: vmov r3, lr, d5
+; CHECK-NEXT: vmov.8 q5[8], r1
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vmov.8 q5[9], r4
+; CHECK-NEXT: vmov r4, r1, d4
+; CHECK-NEXT: vmov.8 q5[10], r12
+; CHECK-NEXT: vmov.8 q5[11], r5
+; CHECK-NEXT: vldrw.u32 q1, [sp, #264] @ 16-byte Reload
 ; CHECK-NEXT: ldrb r7, [r7]
 ; CHECK-NEXT: ldrb r6, [r6]
 ; CHECK-NEXT: ldrb r3, [r3]
 ; CHECK-NEXT: ldrb r4, [r4]
 ; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q6[8], r4
-; CHECK-NEXT: vmov r5, r4, d1
-; CHECK-NEXT: vmov.8 q6[9], r1
-; CHECK-NEXT: vadd.i32 q0, q5, r0
-; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
+; CHECK-NEXT: vmov.8 q7[8], r4
+; CHECK-NEXT: vmov r5, r4, d5
+; CHECK-NEXT: vmov.8 q7[9], r1
+; CHECK-NEXT: vadd.i32 q2, q0, r0
+; CHECK-NEXT: vldrw.u32 q0, [sp, #216] @ 16-byte Reload
 ; CHECK-NEXT: ldrb r5, [r5]
 ; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: vmov.8 q6[10], r5
-; CHECK-NEXT: vmov.8 q6[11], r4
-; CHECK-NEXT: vmov.8 q6[12], r7
-; CHECK-NEXT: vmov.8 q6[13], r6
-; CHECK-NEXT: vmov.8 q6[14], r3
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov.8 q7[10], r5
+; CHECK-NEXT: vmov.8 q7[11], r4
+; CHECK-NEXT: vmov.8 q7[12], r7
+; CHECK-NEXT: vmov.8 q7[13], r6
+; CHECK-NEXT: vmov.8 q7[14], r3
+; CHECK-NEXT: vmov r1, r3, d4
 ; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[12], r1
+; CHECK-NEXT: vmov.8 q5[12], r1
 ; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q1, r0
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vmov.8 q5[13], r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q1, r0
 ; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[14], r1
+; CHECK-NEXT: vmov.8 q5[14], r1
 ; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[15], r1
+; CHECK-NEXT: vmov.8 q5[15], r1
 ; CHECK-NEXT: ldrb.w r1, [lr]
-; CHECK-NEXT: vmov.8 q6[15], r1
-; CHECK-NEXT: vmov r1, r3, d0
-; CHECK-NEXT: vadd.i8 q6, q6, q7
+; CHECK-NEXT: vmov.8 q7[15], r1
+; CHECK-NEXT: vmov r1, r3, d4
+; CHECK-NEXT: vadd.i8 q5, q7, q5
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: ldrb r3, [r3]
 ; CHECK-NEXT: vmov.8 q7[0], r1
 ; CHECK-NEXT: vmov.8 q7[1], r3
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q3, r0
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q4, r0
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[2], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[3], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[4], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[5], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q5, r0
-; CHECK-NEXT: vadd.i32 q5, q5, q2
-; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q5, q5, q2
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q6, r0
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[6], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[7], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[8], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[9], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q4, r0
-; CHECK-NEXT: vadd.i32 q4, q4, q2
-; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q0, r0
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[10], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[11], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[12], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d1
+; CHECK-NEXT: vmov r1, r3, d5
 ; CHECK-NEXT: ldrb r1, [r1]
 ; CHECK-NEXT: vmov.8 q7[14], r1
 ; CHECK-NEXT: ldrb r1, [r3]
 ; CHECK-NEXT: vmov.8 q7[15], r1
-; CHECK-NEXT: vadd.i8 q0, q6, q7
-; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload
-; CHECK-NEXT: vstrb.8 q0, [r8], #16
-; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q7, q7, q2
+; CHECK-NEXT: vadd.i8 q2, q5, q7
+; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
+; CHECK-NEXT: vstrb.8 q2, [r8], #16
+; CHECK-NEXT: vmov.i32 q2, #0x30
+; CHECK-NEXT: vadd.i32 q6, q6, q2
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [sp, #296] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vadd.i32 q4, q4, q2
+; CHECK-NEXT: vadd.i32 q6, q6, q2
 ; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vmov q7, q3
+; CHECK-NEXT: vldrw.u32 q3, [sp, #136] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q1, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q4, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q4, [sp, #168] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q6, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [sp, #120] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #216] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q5, q5, q2
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vadd.i32 q4, q4, q2
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vadd.i32 q6, q6, q2
+; CHECK-NEXT: vstrw.32 q0, [sp, #280] @ 16-byte Spill
 ; CHECK-NEXT: bne.w .LBB15_3
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1
@@ -1501,14 +1499,14 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: blt .LBB18_3
 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: adr.w lr, .LCPI18_0
+; CHECK-NEXT: adr r3, .LCPI18_0
 ; CHECK-NEXT: adr r4, .LCPI18_1
 ; CHECK-NEXT: adr r5, .LCPI18_2
 ; CHECK-NEXT: adr r6, .LCPI18_3
 ; CHECK-NEXT: vldrw.u32 q0, [r6]
 ; CHECK-NEXT: vldrw.u32 q1, [r5]
 ; CHECK-NEXT: vldrw.u32 q2, [r4]
-; CHECK-NEXT: vldrw.u32 q3, [lr]
+; CHECK-NEXT: vldrw.u32 q3, [r3]
 ; CHECK-NEXT: vadd.i32 q0, q0, r1
 ; CHECK-NEXT: vadd.i32 q1, q1, r1
 ; CHECK-NEXT: vadd.i32 q2, q2, r1
diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index dad856c..00a998c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -38,7 +38,7 @@ define arm_aapcs_vfpcc void @k() {
 ; CHECK-NEXT: vmov.i32 q5, #0x0
 ; CHECK-NEXT: vpsel q6, q4, q3
 ; CHECK-NEXT: vstrh.16 q6, [r0]
-; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vmov.i32 q6, #0x0
 ; CHECK-NEXT: cbz r1, .LBB0_2
 ; CHECK-NEXT: le .LBB0_1
 ; CHECK-NEXT: .LBB0_2: @ %for.cond4.preheader
@@ -135,12 +135,12 @@ vector.body115: ; preds = %vector.body115, %ve
 define dso_local i32 @e() #0 {
 ; CHECK-LABEL: e:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #408
-; CHECK-NEXT: sub sp, #408
+; CHECK-NEXT: .pad #392
+; CHECK-NEXT: sub sp, #392
 ; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals
 ; CHECK-NEXT: vldr s15, .LCPI1_1
 ; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals
@@ -148,18 +148,16 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT: mov r4, r7
 ; CHECK-NEXT: mov r3, r7
 ; CHECK-NEXT: ldr r6, [r4, #8]!
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: ldr r0, [r3, #4]!
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT: movt r2, :upper16:e
+; CHECK-NEXT: ldr r0, [r3, #4]!
 ; CHECK-NEXT: vmov r5, s15
 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r4
-; CHECK-NEXT: vmov s13, r3
 ; CHECK-NEXT: vldr s12, .LCPI1_0
+; CHECK-NEXT: vmov s13, r3
 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r2
 ; CHECK-NEXT: vdup.32 q7, r3
 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r5
-; CHECK-NEXT: vstrw.32 q0, [sp, #92]
+; CHECK-NEXT: vstrw.32 q0, [sp, #76]
 ; CHECK-NEXT: vmov q0, q7
 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT: vmov q4, q7
@@ -168,7 +166,7 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT: vmov s21, r2
 ; CHECK-NEXT: movs r1, #64
 ; CHECK-NEXT: vmov.f32 s20, s12
-; CHECK-NEXT: str r0, [sp, #40]
+; CHECK-NEXT: str r0, [sp, #24]
 ; CHECK-NEXT: vmov.f32 s22, s13
 ; CHECK-NEXT: str r6, [r0]
 ; CHECK-NEXT: vmov.f32 s23, s15
@@ -186,12 +184,12 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r5
 ; CHECK-NEXT: vmov.32 q4[0], r8
 ; CHECK-NEXT: @ implicit-def: $r2
-; CHECK-NEXT: str.w r8, [sp, #44]
-; CHECK-NEXT: vstrw.32 q3, [sp, #60]
-; CHECK-NEXT: strh.w r12, [sp, #406]
+; CHECK-NEXT: str.w r8, [sp, #28]
+; CHECK-NEXT: vstrw.32 q3, [sp, #44]
+; CHECK-NEXT: strh.w r12, [sp, #390]
 ; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2
 ; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
 ; CHECK-NEXT: letp lr, .LBB1_1
 ; CHECK-NEXT: .LBB1_2: @ %entry
@@ -199,7 +197,7 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT: str.w r8, [r7]
 ; CHECK-NEXT: vstrw.32 q4, [r0]
 ; CHECK-NEXT: vstrw.32 q2, [r0]
-; CHECK-NEXT: str.w r12, [sp, #324]
+; CHECK-NEXT: str.w r12, [sp, #308]
 ; CHECK-NEXT: .LBB1_3: @ %for.cond
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: b .LBB1_3
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index f90af3c..2587a0bb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -115,17 +115,17 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: movs r4, #1
 ; CHECK-NEXT: ldr r3, [r0]
 ; CHECK-NEXT: add.w r11, r3, r12, lsl #2
-; CHECK-NEXT: add.w r7, r3, r12, lsl #3
-; CHECK-NEXT: lsl.w r9, r12, #3
+; CHECK-NEXT: add.w r6, r3, r12, lsl #3
+; CHECK-NEXT: lsl.w r10, r12, #3
 ; CHECK-NEXT: .LBB1_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
 ; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r9, r4, #1
 ; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: add.w r10, r4, #1
 ; CHECK-NEXT: mov r3, r11
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: dlstp.32 lr, r12
 ; CHECK-NEXT: .LBB1_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1
@@ -139,11 +139,11 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT: vadd.f32 s2, s2, s3
-; CHECK-NEXT: add.w r0, r2, r10, lsl #2
+; CHECK-NEXT: add.w r0, r2, r9, lsl #2
 ; CHECK-NEXT: vadd.f32 s0, s0, s1
-; CHECK-NEXT: add r11, r9
+; CHECK-NEXT: add r11, r10
 ; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: add r7, r9
+; CHECK-NEXT: add r6, r10
 ; CHECK-NEXT: vadd.f32 s4, s4, s5
 ; CHECK-NEXT: vadd.f32 s0, s0, s2
 ; CHECK-NEXT: vadd.f32 s2, s4, s6
@@ -228,46 +228,40 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9}
 ; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: .pad #24
-; CHECK-NEXT: sub sp, #24
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
-; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: subs r1, #3
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
 ; CHECK-NEXT: blo .LBB2_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r9, [r0, #8]
 ; CHECK-NEXT: movs r5, #1
 ; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r0, r3, r3, lsl #1
-; CHECK-NEXT: add.w r9, r1, r3, lsl #2
-; CHECK-NEXT: add.w r12, r1, r3, lsl #3
-; CHECK-NEXT: adds r3, #3
+; CHECK-NEXT: add.w r3, r9, #3
 ; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, r1, r0, lsl #2
+; CHECK-NEXT: add.w r0, r9, r9, lsl #1
 ; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: add.w r10, r1, r9, lsl #2
+; CHECK-NEXT: add.w r12, r1, r9, lsl #3
+; CHECK-NEXT: add.w r1, r1, r0, lsl #2
+; CHECK-NEXT: add.w r3, r5, r3, lsr #2
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT: lsl.w r11, r0, #2
-; CHECK-NEXT: add.w r1, r5, r3, lsr #2
-; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB2_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
-; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: adds r0, r5, #2
-; CHECK-NEXT: adds r2, r5, #1
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: mov r3, r9
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: mov r3, r10
 ; CHECK-NEXT: mov r0, r12
-; CHECK-NEXT: mov r4, r10
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: mov r4, r1
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: dlstp.32 lr, r9
 ; CHECK-NEXT: .LBB2_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
@@ -282,31 +276,31 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
 ; CHECK-NEXT: vadd.f32 s10, s10, s11
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r5, #1
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
-; CHECK-NEXT: add r9, r11
+; CHECK-NEXT: add r10, r11
 ; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: add.w r0, r1, r2, lsl #2
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s4, s4, s5
 ; CHECK-NEXT: add r12, r11
 ; CHECK-NEXT: vadd.f32 s2, s2, s3
-; CHECK-NEXT: add r10, r11
+; CHECK-NEXT: add r1, r11
 ; CHECK-NEXT: vadd.f32 s0, s0, s1
 ; CHECK-NEXT: vadd.f32 s8, s8, s10
 ; CHECK-NEXT: vadd.f32 s4, s4, s6
 ; CHECK-NEXT: vadd.f32 s0, s0, s2
 ; CHECK-NEXT: vstr s8, [r0]
-; CHECK-NEXT: add.w r0, r1, r5, lsl #2
-; CHECK-NEXT: adds r5, #3
+; CHECK-NEXT: add.w r0, r2, r5, lsl #2
 ; CHECK-NEXT: vstr s4, [r0]
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add.w r0, r1, r0, lsl #2
+; CHECK-NEXT: adds r0, r5, #2
+; CHECK-NEXT: adds r5, #3
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
 ; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: cmp r5, r0
 ; CHECK-NEXT: blo .LBB2_2
 ; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -394,15 +388,15 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: .pad #40
-; CHECK-NEXT: sub sp, #40
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
-; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: subs r1, #4
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
-; CHECK-NEXT: blo.w .LBB3_5
+; CHECK-NEXT: blo .LBB3_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT: ldr r2, [r0, #8]
 ; CHECK-NEXT: movs r6, #1
@@ -410,34 +404,28 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: add.w r0, r2, r2, lsl #1
 ; CHECK-NEXT: add.w r12, r1, r2, lsl #2
 ; CHECK-NEXT: add.w r8, r1, r2, lsl #3
-; CHECK-NEXT: add.w r9, r1, r2, lsl #4
-; CHECK-NEXT: add.w r11, r1, r0, lsl #2
+; CHECK-NEXT: add.w r10, r1, r2, lsl #4
+; CHECK-NEXT: add.w r9, r1, r0, lsl #2
 ; CHECK-NEXT: adds r0, r2, #3
 ; CHECK-NEXT: bic r0, r0, #3
 ; CHECK-NEXT: subs r0, #4
 ; CHECK-NEXT: add.w r0, r6, r0, lsr #2
-; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r0, r2, [sp, #4] @ 8-byte Folded Spill
 ; CHECK-NEXT: lsls r0, r2, #4
-; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldrd r2, r7, [sp, #4] @ 8-byte Folded Reload
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB3_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
-; CHECK-NEXT: adds r0, r6, #3
-; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r0, r6, #2
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r0, r6, #1
-; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT: mov r3, r12
 ; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: mov r5, r11
-; CHECK-NEXT: mov r4, r9
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: mov r5, r9
+; CHECK-NEXT: mov r4, r10
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
 ; CHECK-NEXT: dlstp.32 lr, r7
 ; CHECK-NEXT: .LBB3_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
@@ -455,9 +443,9 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
 ; CHECK-NEXT: vadd.f32 s14, s14, s15
-; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: vadd.f32 s12, s12, s13
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #1
 ; CHECK-NEXT: vadd.f32 s10, s10, s11
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2
@@ -471,24 +459,24 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vadd.f32 s0, s0, s2
 ; CHECK-NEXT: vstr s12, [r0]
 ; CHECK-NEXT: add.w r0, r1, r6, lsl #2
-; CHECK-NEXT: adds r6, #4
 ; CHECK-NEXT: vstr s8, [r0]
-; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #2
 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2
 ; CHECK-NEXT: vstr s4, [r0]
-; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #3
+; CHECK-NEXT: adds r6, #4
 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2
 ; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT: add r12, r0
 ; CHECK-NEXT: add r8, r0
-; CHECK-NEXT: add r11, r0
 ; CHECK-NEXT: add r9, r0
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add r10, r0
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT: cmp r6, r0
 ; CHECK-NEXT: blo .LBB3_2
 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #40
+; CHECK-NEXT: add sp, #24
 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -588,60 +576,53 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
 ; CHECK-NEXT: subs r1, #5
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
 ; CHECK-NEXT: blo.w .LBB4_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r12, [r0, #8]
 ; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r12, #3
 ; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r8, r1, r3, lsl #2
+; CHECK-NEXT: add.w r8, r1, r12, lsl #2
 ; CHECK-NEXT: subs r1, r0, #4
 ; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r12, #2
 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r3, r3, lsl #2
-; CHECK-NEXT: lsls r1, r1, #2
 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: add.w r1, r12, r12, lsl #2
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB4_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: add.w r10, r0, #2
+; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: add.w r11, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: dlstp.32 lr, r12
 ; CHECK-NEXT: .LBB4_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT: add.w r9, r3, r5
 ; CHECK-NEXT: vldrw.u32 q5, [r4], #16
 ; CHECK-NEXT: vldrw.u32 q6, [r3], #16
-; CHECK-NEXT: add.w r12, r9, r5
+; CHECK-NEXT: add.w r10, r9, r5
 ; CHECK-NEXT: vfma.f32 q3, q6, q5
 ; CHECK-NEXT: vldrw.u32 q6, [r9]
-; CHECK-NEXT: add.w r6, r12, r5
+; CHECK-NEXT: add.w r6, r10, r5
 ; CHECK-NEXT: vfma.f32 q4, q6, q5
-; CHECK-NEXT: vldrw.u32 q6, [r12]
+; CHECK-NEXT: vldrw.u32 q6, [r10]
 ; CHECK-NEXT: adds r7, r6, r5
 ; CHECK-NEXT: vfma.f32 q2, q6, q5
 ; CHECK-NEXT: vldrw.u32 q6, [r6]
@@ -662,30 +643,31 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
 ; CHECK-NEXT: vadd.f32 s0, s0, s1
 ; CHECK-NEXT: vadd.f32 s1, s16, s18
-; CHECK-NEXT: vadd.f32 s2, s2, s3
 ; CHECK-NEXT: vadd.f32 s12, s12, s14
+; CHECK-NEXT: vadd.f32 s2, s2, s3
 ; CHECK-NEXT: vadd.f32 s4, s4, s6
 ; CHECK-NEXT: vadd.f32 s6, s8, s10
 ; CHECK-NEXT: vstr s1, [r1]
 ; CHECK-NEXT: add.w r1, r2, r0, lsl #2
-; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: adds r0, #5
 ; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: add.w r1, r2, r10, lsl #2
+; CHECK-NEXT: adds r1, r0, #2
+; CHECK-NEXT: vadd.f32 s0, s0, s2
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #3
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: adds r0, #5
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT: add r8, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: blo.w .LBB4_2
+; CHECK-NEXT: blo .LBB4_2
 ; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -797,63 +779,54 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
 ; CHECK-NEXT: subs r1, #6
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
 ; CHECK-NEXT: blo.w .LBB5_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r12, [r0, #8]
 ; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r12, #3
 ; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r8, r1, r3, lsl #2
+; CHECK-NEXT: add.w r8, r1, r12, lsl #2
 ; CHECK-NEXT: subs r1, r0, #4
 ; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r12, #2
 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r3, r3, lsl #1
+; CHECK-NEXT: add.w r1, r12, r12, lsl #1
 ; CHECK-NEXT: lsls r1, r1, #3
 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB5_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: add.w r11, r0, #2
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: adds r4, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmov q5, q1
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: dlstp.32 lr, r12
 ; CHECK-NEXT: .LBB5_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: add.w r12, r3, r5
+; CHECK-NEXT: add.w r10, r3, r5
 ; CHECK-NEXT: vldrw.u32 q6, [r1], #16
 ; CHECK-NEXT: vldrw.u32 q7, [r3], #16
-; CHECK-NEXT: add.w r10, r12, r5
+; CHECK-NEXT: add.w r11, r10, r5
 ; CHECK-NEXT: vfma.f32 q4, q7, q6
-; CHECK-NEXT: vldrw.u32 q7, [r12]
-; CHECK-NEXT: add.w r6, r10, r5
-; CHECK-NEXT: vfma.f32 q5, q7, q6
 ; CHECK-NEXT: vldrw.u32 q7, [r10]
+; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vfma.f32 q5, q7, q6
+; CHECK-NEXT: vldrw.u32 q7, [r11]
 ; CHECK-NEXT: adds r7, r6, r5
 ; CHECK-NEXT: vfma.f32 q2, q7, q6
 ; CHECK-NEXT: vldrw.u32 q7, [r6]
@@ -885,28 +858,29 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vstr s1, [r1]
 ; CHECK-NEXT: add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: adds r0, #6
 ; CHECK-NEXT: vstr s3, [r1]
-; CHECK-NEXT: add.w r1, r2, r11, lsl #2
+; CHECK-NEXT: adds r1, r0, #2
 ; CHECK-NEXT: vadd.f32 s4, s4, s6
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #3
 ; CHECK-NEXT: vadd.f32 s6, s12, s14
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: adds r0, #6
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s4, [r1]
 ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT: add r8, r1
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: cmp r0, r1
 ; CHECK-NEXT: blo.w .LBB5_2
 ; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1030,73 +1004,64 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #72
-; CHECK-NEXT: sub sp, #72
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #48
+; CHECK-NEXT: sub sp, #48
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
 ; CHECK-NEXT: subs r1, #7
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
 ; CHECK-NEXT: blo.w .LBB6_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r10, [r0, #8]
 ; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r10, #3
 ; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r9, r1, r3, lsl #2
+; CHECK-NEXT: add.w r9, r1, r10, lsl #2
 ; CHECK-NEXT: subs r1, r0, #4
 ; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r10, #2
 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: rsb r1, r3, r3, lsl #3
-; CHECK-NEXT: lsls r1, r1, #2
 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: rsb r1, r10, r10, lsl #3
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB6_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #6
-; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vmov.i32 q2, #0x0
-; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: adds r4, r0, #2
 ; CHECK-NEXT: add.w r8, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
 ; CHECK-NEXT: mov r3, r9
-; CHECK-NEXT: vmov q4, q2
-; CHECK-NEXT: vmov q5, q2
-; CHECK-NEXT: vmov q3, q2
-; CHECK-NEXT: vmov q6, q2
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: mov r12, r7
-; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r6
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q6, #0x0
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: mov r12, r10
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r7
 ; CHECK-NEXT: .LBB6_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: add.w r10, r3, r5
+; CHECK-NEXT: add.w r11, r3, r5
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
-; CHECK-NEXT: add.w r11, r10, r5
+; CHECK-NEXT: add.w r6, r11, r5
 ; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q5, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r10]
-; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vldrwt.u32 q0, [r11]
+; CHECK-NEXT: adds r7, r6, r5
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q6, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r11]
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT: vmov q6, q5
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q1, q0, q7
@@ -1104,26 +1069,26 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vmov q4, q3
 ; CHECK-NEXT: vmov q3, q1
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q0, [r6]
-; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: adds r7, r6, r5
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q1, q0, q7
 ; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT: adds r6, r7, r5
-; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q1, q0, q7
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT: vmov q1, q3
 ; CHECK-NEXT: vmov q3, q4
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q3, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vldrwt.u32 q0, [r7]
 ; CHECK-NEXT: vmov q4, q5
-; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: adds r6, r7, r5
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q4, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
 ; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q2, q0, q7
 ; CHECK-NEXT: le lr, .LBB6_3
@@ -1138,45 +1103,45 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vadd.f32 s4, s4, s5
 ; CHECK-NEXT: vadd.f32 s10, s10, s11
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
-; CHECK-NEXT: vadd.f32 s0, s2, s0
 ; CHECK-NEXT: vadd.f32 s9, s18, s19
 ; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: vadd.f32 s2, s3, s1
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vadd.f32 s0, s2, s0
 ; CHECK-NEXT: vadd.f32 s5, s18, s19
 ; CHECK-NEXT: vadd.f32 s7, s16, s17
+; CHECK-NEXT: vadd.f32 s2, s3, s1
 ; CHECK-NEXT: vadd.f32 s4, s4, s6
-; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s14, s14, s15
-; CHECK-NEXT: adds r0, #7
 ; CHECK-NEXT: vadd.f32 s12, s12, s13
-; CHECK-NEXT: vstr s2, [r1]
-; CHECK-NEXT: add.w r1, r2, r4, lsl #2
+; CHECK-NEXT: vstr s0, [r1]
+; CHECK-NEXT: add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s8, s8, s10
 ; CHECK-NEXT: vadd.f32 s6, s7, s5
-; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: vstr s2, [r1]
+; CHECK-NEXT: add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT: vadd.f32 s10, s11, s9
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: adds r1, r0, #3
 ; CHECK-NEXT: vadd.f32 s12, s12, s14
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s10, [r1]
-; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #6
+; CHECK-NEXT: adds r0, #7
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT: add r9, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: cmp r0, r1
 ; CHECK-NEXT: blo.w .LBB6_2
 ; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #72
+; CHECK-NEXT: add sp, #48
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1312,107 +1277,99 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #88
-; CHECK-NEXT: sub sp, #88
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #64
+; CHECK-NEXT: sub sp, #64
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT: ldr r1, [r0, #4]
 ; CHECK-NEXT: subs r1, #8
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT: cmp r1, #2
 ; CHECK-NEXT: blo.w .LBB7_5
 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r11, [r0, #8]
 ; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r11, #3
 ; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r12, r1, r3, lsl #2
+; CHECK-NEXT: add.w r12, r1, r11, lsl #2
 ; CHECK-NEXT: subs r1, r0, #4
 ; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r6, r3, #2
+; CHECK-NEXT: lsl.w r6, r11, #2
 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: lsls r1, r3, #5
 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: lsl.w r1, r11, #5
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB7_2: @ %for.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #7
-; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #6
-; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: vmov.i32 q3, #0x0
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: adds r4, r0, #3
 ; CHECK-NEXT: add.w r8, r0, #2
 ; CHECK-NEXT: adds r1, r0, #1
-; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
 ; CHECK-NEXT: mov r3, r12
-; CHECK-NEXT: vmov q5, q3
-; CHECK-NEXT: vmov q6, q3
-; CHECK-NEXT: vmov q4, q3
-; CHECK-NEXT: vmov q7, q3
-; CHECK-NEXT: vmov q2, q3
-; CHECK-NEXT: mov r10, r7
-; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r5
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q6, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q7, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: mov r10, r11
+; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r7
 ; CHECK-NEXT: .LBB7_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT: vctp.32 r10
-; CHECK-NEXT: add.w r11, r3, r6
+; CHECK-NEXT: adds r5, r3, r6
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
-; CHECK-NEXT: add.w r5, r11, r6
+; CHECK-NEXT: adds r7, r5, r6
 ; CHECK-NEXT: sub.w r10, r10, #4
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q6, q1, q0
-; CHECK-NEXT: vldrwt.u32 q1, [r11]
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT: vmov q6, q5
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q7, q1, q0
-; CHECK-NEXT: vmov q5, q3
-; CHECK-NEXT: vmov q3, q4
-; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: vmov q3, q2
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: adds r7, r5, r6
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q2, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r7]
-; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT: adds r5, r7, r6
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT: adds r7, r5, r6
-; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT: vmov q2, q4
-; CHECK-NEXT: vmov q4, q3
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r7]
 ; CHECK-NEXT: adds r5, r7, r6
-; CHECK-NEXT: vmov q3, q5
+; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vmov q2, q3
+; CHECK-NEXT: vmov q3, q4
 ; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vfmat.f32 q2, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vmov q4, q5
+; CHECK-NEXT: adds r7, r5, r6
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vldrwt.u32 q1, [r7]
 ; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: add r5, r6
+; CHECK-NEXT: adds r5, r7, r6
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q5, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q3, q1, q0
 ; CHECK-NEXT: le lr, .LBB7_3
@@ -1425,12 +1382,12 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vadd.f32 s6, s24, s25
 ; CHECK-NEXT: vadd.f32 s5, s18, s19
 ; CHECK-NEXT: vadd.f32 s7, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT: vadd.f32 s10, s10, s11
 ; CHECK-NEXT: vadd.f32 s8, s8, s9
 ; CHECK-NEXT: vadd.f32 s9, s18, s19
 ; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT: vadd.f32 s14, s14, s15
 ; CHECK-NEXT: vadd.f32 s12, s12, s13
 ; CHECK-NEXT: vadd.f32 s13, s18, s19
@@ -1445,33 +1402,33 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT: vstr s0, [r1]
 ; CHECK-NEXT: add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT: vadd.f32 s3, s20, s21
-; CHECK-NEXT: adds r0, #8
 ; CHECK-NEXT: vstr s2, [r1]
 ; CHECK-NEXT: add.w r1, r2, r8, lsl #2
 ; CHECK-NEXT: vadd.f32 s12, s7, s5
 ; CHECK-NEXT: vstr s10, [r1]
 ; CHECK-NEXT: add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT: vstr s14, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: vadd.f32 s4, s3, s1
+; CHECK-NEXT: adds r1, r0, #4
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vadd.f32 s4, s3, s1
 ; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #6
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #7
+; CHECK-NEXT: adds r0, #8
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT: add r12, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: cmp r0, r1
 ; CHECK-NEXT: blo.w .LBB7_2
 ; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #88
+; CHECK-NEXT: add sp, #64
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: add sp, #4
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
index 29c4fb9..413c4a1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
@@ -1496,15 +1496,14 @@ define void @vfmasq(ptr %x, ptr %y, i32 %n) {
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
 ; CHECK-NEXT: .LBB34_1: @ %for.body.preheader
-; CHECK-NEXT: vmov.f32 q0, #1.000000e+01
 ; CHECK-NEXT: dlstp.32 lr, r2
 ; CHECK-NEXT: .LBB34_2: @ %for.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r1], #16
+; CHECK-NEXT: vmov.f32 q2, #1.000000e+01
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vstrw.32 q2, [r1], #16
 ; CHECK-NEXT: letp lr, .LBB34_2
 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}
@@ -1542,15 +1541,14 @@ define void @vfmas(ptr %s1, ptr %s2, i32 %N) {
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
 ; CHECK-NEXT: .LBB35_1: @ %while.body.lr.ph
-; CHECK-NEXT: vmov.f32 q0, #1.000000e+01
 ; CHECK-NEXT: dlstp.32 lr, r2
 ; CHECK-NEXT: .LBB35_2: @ %while.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r0], #16
+; CHECK-NEXT: vmov.f32 q2, #1.000000e+01
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vstrw.32 q2, [r0], #16
 ; CHECK-NEXT: letp lr, .LBB35_2
 ; CHECK-NEXT: @ %bb.3: @ %while.end
 ; CHECK-NEXT: pop {r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
index e845070..62482c1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -287,17 +287,17 @@ define void @shlor(ptr nocapture readonly %x, ptr noalias nocapture %y, i32 %n)
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: blt .LBB5_3
 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: adr.w lr, .LCPI5_0
-; CHECK-NEXT: adr r4, .LCPI5_1
+; CHECK-NEXT: adr r4, .LCPI5_0
+; CHECK-NEXT: adr r3, .LCPI5_1
 ; CHECK-NEXT: adr r5, .LCPI5_2
 ; CHECK-NEXT: adr r6, .LCPI5_3
-; CHECK-NEXT: vldrw.u32 q2, [r4]
+; CHECK-NEXT: vldrw.u32 q2, [r3]
+; CHECK-NEXT: vldrw.u32 q3, [r4]
 ; CHECK-NEXT: vldrw.u32 q0, [r6]
 ; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q3, [lr]
+; CHECK-NEXT: vadd.i32 q2, q2, r1
 ; CHECK-NEXT: vadd.i32 q0, q0, r1
 ; CHECK-NEXT: vadd.i32 q1, q1, r1
-; CHECK-NEXT: vadd.i32 q2, q2, r1
 ; CHECK-NEXT: vadd.i32 q3, q3, r1
 ; CHECK-NEXT: mov.w r12, #1
 ; CHECK-NEXT: movs r4, #3
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index f9948db..c92c2be 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -656,14 +656,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
 ; CHECK-NEXT: vmov q2, q0
 ; CHECK-NEXT: vcmp.i8 eq, q1, zr
 ; CHECK-NEXT: vmov.i8 q0, #0x0
 ; CHECK-NEXT: vmov.i8 q1, #0xff
 ; CHECK-NEXT: vpsel q5, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q0, #0x0
 ; CHECK-NEXT: vmov.u8 r0, q5[0]
 ; CHECK-NEXT: vmov.16 q3[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[1]
@@ -706,7 +704,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT: orrs r1, r3
 ; CHECK-NEXT: add r0, r2
 ; CHECK-NEXT: vmov r2, r3, d15
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0x0
 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
 ; CHECK-NEXT: vmov.u8 r2, q2[3]
@@ -785,6 +783,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
 ; CHECK-NEXT: vpsel q6, q1, q7
+; CHECK-NEXT: vmov.i8 q7, #0x0
 ; CHECK-NEXT: vmov r2, r3, d12
 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -853,7 +852,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT: vmov r2, r3, d1
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
@@ -2065,14 +2063,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT: push {r7, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
 ; CHECK-NEXT: vmov q2, q0
 ; CHECK-NEXT: vcmp.i8 eq, q1, zr
 ; CHECK-NEXT: vmov.i8 q0, #0x0
 ; CHECK-NEXT: vmov.i8 q1, #0xff
 ; CHECK-NEXT: vpsel q5, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q0, #0x0
 ; CHECK-NEXT: vmov.u8 r2, q5[0]
 ; CHECK-NEXT: vmov.16 q3[0], r2
 ; CHECK-NEXT: vmov.u8 r2, q5[1]
@@ -2115,7 +2111,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT: orr.w lr, lr, r3
 ; CHECK-NEXT: add
r12, r2 ; CHECK-NEXT: vmov r3, r2, d15 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vmov.u8 r2, q2[3] @@ -2194,6 +2190,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpsel q6, q1, q7 +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -2264,7 +2261,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: adc.w r3, r3, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll index 63b1431..9f55183 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -817,16 +817,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q6, q2, q0 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vmov.u8 r0, q6[0] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov.16 q0[0], r0 ; CHECK-NEXT: vmov.u8 r0, q6[1] ; CHECK-NEXT: vmov.16 q0[1], r0 @@ -842,9 +840,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q0[6], r0 ; CHECK-NEXT: vmov.u8 r0, q6[7] ; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vmov.u8 r2, q3[0] +; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vpsel q7, q2, q4 ; CHECK-NEXT: vmov.u16 r0, q7[2] ; CHECK-NEXT: vmov.u16 r1, q7[0] @@ -895,7 +892,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 @@ -916,8 +913,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.u16 r3, q7[5] ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q4, q0 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 @@ -932,7 +929,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r0, s30 ; CHECK-NEXT: vmov r1, s28 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0xff ; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, 
r3 @@ -960,7 +957,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 @@ -1041,7 +1038,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 @@ -1062,7 +1059,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.u16 r3, q6[5] ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q0, #0xff ; CHECK-NEXT: vpsel q0, q0, q4 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 @@ -1117,7 +1114,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: @@ -1137,16 +1133,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q5, q2, q0 -; CHECK-NEXT: vmov.s8 r2, q1[0] +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.u8 r0, q5[0] -; CHECK-NEXT: vmov.s8 r3, q3[0] +; CHECK-NEXT: vmov.s8 r2, q1[0] ; CHECK-NEXT: vmov.16 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q5[1] ; CHECK-NEXT: vmov.16 q4[1], r0 @@ -1162,9 +1156,9 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q4[6], r0 ; CHECK-NEXT: vmov.u8 r0, q5[7] ; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.s8 r3, q3[0] ; CHECK-NEXT: vcmp.i16 ne, q4, zr -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vpsel q6, q2, q0 ; CHECK-NEXT: vmov.u16 r0, q6[2] ; CHECK-NEXT: vmov.u16 r1, q6[0] @@ -1198,7 +1192,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.s8 r3, q3[3] ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 @@ -1219,7 +1213,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q6, q2, q0 ; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -1273,17 +1268,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> 
%y, <1 ; CHECK-NEXT: vmov.16 q6[7], r2 ; CHECK-NEXT: vmov.s8 r0, q1[8] ; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vmov.i8 q6, #0x0 +; CHECK-NEXT: vpsel q5, q2, q6 ; CHECK-NEXT: vmov.s8 r1, q3[8] -; CHECK-NEXT: vpsel q5, q2, q7 -; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov.u16 r2, q5[2] ; CHECK-NEXT: vmov.u16 r3, q5[0] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q5[3] ; CHECK-NEXT: vmov.u16 r3, q5[1] +; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vpsel q6, q2, q6 ; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -1365,7 +1361,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: @@ -2296,16 +2291,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q6, q2, q0 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vmov.u8 r2, q6[0] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: vmov.u8 r2, q6[1] ; CHECK-NEXT: vmov.16 q0[1], r2 @@ -2321,9 +2314,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.16 q0[6], r2 ; CHECK-NEXT: vmov.u8 r2, q6[7] ; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vmov.u8 r4, q3[2] +; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vpsel q7, q2, q4 ; CHECK-NEXT: vmov.u16 r2, q7[2] ; CHECK-NEXT: vmov.u16 r3, q7[0] @@ -2374,7 +2366,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: umull r2, r3, r3, r2 ; CHECK-NEXT: umull r4, r5, r5, r4 @@ -2395,8 +2387,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.u16 r4, q7[5] ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q4, q0 ; CHECK-NEXT: vmov r5, r4, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 ; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 @@ -2411,7 +2403,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r5, s18 ; CHECK-NEXT: vmov r2, s30 ; CHECK-NEXT: vmov r3, s28 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0xff ; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 @@ -2439,7 +2431,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x 
i8> %y ; CHECK-NEXT: vmov r5, s2 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 @@ -2520,7 +2512,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r5, s2 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 @@ -2541,7 +2533,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.u16 r4, q6[5] ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q0, #0xff ; CHECK-NEXT: vpsel q0, q0, q4 ; CHECK-NEXT: vmov r5, r4, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 @@ -2598,7 +2590,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -2619,14 +2610,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q5, q2, q0 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.u8 r2, q5[0] ; CHECK-NEXT: vmov.s8 r4, q1[2] ; CHECK-NEXT: vmov.16 q4[0], r2 @@ -2676,7 +2665,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov r2, r3, d15 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov.s8 r2, q1[3] ; CHECK-NEXT: vmov.s8 r3, q3[3] @@ -2701,7 +2690,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q6, q2, q0 ; CHECK-NEXT: vmov r5, r4, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 @@ -2755,17 +2745,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.16 q6[7], r5 ; CHECK-NEXT: vmov.s8 r2, q1[8] ; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vmov.i8 q6, #0x0 +; CHECK-NEXT: vpsel q5, q2, q6 ; CHECK-NEXT: vmov.s8 r3, q3[8] -; CHECK-NEXT: vpsel q5, q2, q7 -; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov.u16 r5, q5[2] ; CHECK-NEXT: vmov.u16 r4, q5[0] ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 ; CHECK-NEXT: vmov.u16 r5, q5[3] ; CHECK-NEXT: vmov.u16 r4, q5[1] +; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vpsel q6, q2, q6 ; CHECK-NEXT: vmov r5, r4, d12 ; CHECK-NEXT: 
vmov q0[2], q0[0], r5, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 @@ -2849,7 +2840,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll index 0c349c3..cba394f 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll @@ -59,18 +59,18 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: [[MOVSX64rm32_:%[0-9]+]]:gr64_nosp = MOVSX64rm32 %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[MOV32rm2]].sub_32bit ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16) - ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_1]] :: (store (s64) into %stack.5) ; CHECK-NEXT: [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[MOV32rm3]] - ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[COPY6]] + ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm3]] ; CHECK-NEXT: [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]] ; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8) + ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[COPY1]] :: (store (s64) into %stack.5) ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY7]], [[MOVSX64rr32_2]], implicit-def dead $eflags - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY7]], [[MOVSX64rm32_]], implicit-def dead $eflags - ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY7]], 0, $noreg + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY6]], [[MOVSX64rr32_2]], implicit-def dead $eflags + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY6]], [[MOVSX64rm32_]], implicit-def dead $eflags + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY6]], 0, $noreg ; CHECK-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[MOV32rm2]] ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = IMUL64rr [[COPY8]], [[MOVSX64rr32_2]], implicit-def dead $eflags @@ -87,8 +87,11 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) ; CHECK-NEXT: CMP32rm [[MOV32rm4]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16) - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load 
(s64) from %stack.3) - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2) + ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64_nosp = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) + ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm6]] ; CHECK-NEXT: JCC_1 %bb.5, 13, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.3 ; CHECK-NEXT: {{ $}} @@ -98,9 +101,8 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm2]], [[MOVSX64rr32_]], implicit-def dead $eflags ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm2]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1) - ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[MOV64rm2]] :: (store (s64) into %stack.13) - ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12) - ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm5]] + ; CHECK-NEXT: [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12) + ; CHECK-NEXT: undef [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm7]] ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) ; CHECK-NEXT: JMP_1 %bb.6 @@ -123,40 +125,30 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: bb.6.for.body17: ; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit - ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13) - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm7]], 1, [[MOVSX64rr32_]], 0, $noreg - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOV64rm1]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64 = COPY [[MOV32rm2]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64 = COPY [[COPY1]] - ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY18]], 1, [[COPY9]], 0, $noreg - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]] - ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2) - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, 
[[COPY10]].sub_16bit, [[LEA64r2]], 1, [[COPY19]], 0, $noreg - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = COPY [[COPY19]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY18]] - ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr64_nosp = COPY [[COPY17]] - ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY16]] - ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY15]] - ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY14]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[COPY13]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[COPY12]] - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = COPY [[COPY11]] - ; CHECK-NEXT: [[MOV64rm8:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) - ; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = COPY [[COPY10]] - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY6]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]] - ; CHECK-NEXT: PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]] + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY9]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm2]], 1, [[MOVSX64rr32_]], 0, $noreg + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY10]].sub_32bit + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[COPY7]] + ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[MOV64rm7]], 1, [[COPY10]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm]], 0, $noreg + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[COPY15]] + ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY14]] + ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY13]] + ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY12]] + ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = COPY [[COPY11]] + ; CHECK-NEXT: [[MOV32rm8:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm8]] + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY9]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]] + ; CHECK-NEXT: PTILESTOREDV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]] ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm4]], [[MOVSX64rr32_3]], implicit-def dead $eflags - ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm8]], implicit-def dead $eflags - ; CHECK-NEXT: [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY9]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags - ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOV64rm1]], implicit-def $eflags + ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm1]], implicit-def dead $eflags + ; CHECK-NEXT: [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr 
[[COPY10]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags + ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOVSX64rm32_1]], implicit-def $eflags ; CHECK-NEXT: JCC_1 %bb.6, 12, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.5 entry: diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll index bf6b096..b428ce4 100644 --- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll +++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll @@ -136,8 +136,6 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: movl (%r8), %edx ; CHECK-NEXT: leal 8(,%rbx,8), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: leaq 8(%rsi), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: leaq 32(%rsi), %r11 ; CHECK-NEXT: leaq 8(,%rbx,8), %rbx ; CHECK-NEXT: xorl %r14d, %r14d @@ -189,7 +187,8 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: jae .LBB1_7 ; CHECK-NEXT: # %bb.6: # %vector.memcheck ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; CHECK-NEXT: leaq 8(%rsi), %r9 +; CHECK-NEXT: addq %r9, %rax ; CHECK-NEXT: leaq (%rax,%r10,8), %rax ; CHECK-NEXT: cmpq %r15, %rax ; CHECK-NEXT: ja .LBB1_14 diff --git a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir index 10ee445..d355374 100644 --- a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir +++ b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir @@ -7,8 +7,8 @@ # CHECK: jne # CHECK: andl $-16, %edx # CHECK: xorl %ebx, %ebx -# CHECK: movl -16(%ebp), %esi -# CHECK: xorl %eax, %eax +# CHECK: xorl %esi, %esi +# CHECK: movl %eax, %ecx name: test tracksRegLiveness: true diff --git a/llvm/test/CodeGen/X86/inalloca-invoke.ll b/llvm/test/CodeGen/X86/inalloca-invoke.ll index c2728f7..68cb24d 100644 --- a/llvm/test/CodeGen/X86/inalloca-invoke.ll +++ b/llvm/test/CodeGen/X86/inalloca-invoke.ll @@ -23,7 +23,6 @@ blah: ; CHECK: pushl %eax ; CHECK: subl $20, %esp ; CHECK: movl %esp, %[[beg:[^ ]*]] -; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]] call void @begin(ptr sret(%Iter) %temp.lvalue) ; CHECK: calll _begin @@ -32,6 +31,7 @@ blah: to label %invoke.cont unwind label %lpad ; Uses end as sret param. +; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]] ; CHECK: pushl %[[end]] ; CHECK: calll _plus diff --git a/llvm/test/CodeGen/X86/licm-regpressure.ll b/llvm/test/CodeGen/X86/licm-regpressure.ll index 72a4832..26ed2a3 100644 --- a/llvm/test/CodeGen/X86/licm-regpressure.ll +++ b/llvm/test/CodeGen/X86/licm-regpressure.ll @@ -1,14 +1,64 @@ -; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -; This tests currently fails as MachineLICM does not compute register pressure -; correctly. More details: llvm.org/PR23143 -; XFAIL: * +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-linux -stop-after=early-machinelicm -o - | FileCheck %s -; MachineLICM should take register pressure into account. -; CHECK-NOT: Spill +; FIXME: MachineLICM does not compute register pressure correctly and we end up +; emitting too many ADD64ri32s. 
More details: llvm.org/PR23143 %struct.A = type { i32, i32, i32, i32, i32, i32, i32 } define void @test(i1 %b, ptr %a) nounwind { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $edi, $rsi + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $edi + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr8 = COPY [[COPY1]].sub_8bit + ; CHECK-NEXT: [[ADD64ri32_:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 4, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_1:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 8, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_2:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 12, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_3:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 16, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_4:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 20, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_5:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 24, implicit-def dead $eflags + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop-body: + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[COPY]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_1]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_2]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_3]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def 
dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_4]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_5]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: TEST8ri [[COPY2]], 1, implicit-def $eflags + ; CHECK-NEXT: JCC_1 %bb.1, 5, implicit $eflags + ; CHECK-NEXT: JMP_1 %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.loop-exit: + ; CHECK-NEXT: RET 0 entry: br label %loop-header diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll index 7f32766..0ccaa9c 100644 --- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll +++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll @@ -172,10 +172,8 @@ define float @clamp_negative_wrong_const(float %x) { ; Like @clamp_test_1 but both are min define float @clamp_negative_same_op(float %x) { ; CHECK-LABEL: @clamp_negative_same_op( -; CHECK-NEXT: [[INNER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[INNER_SEL:%.*]] = select nnan ninf i1 [[INNER_CMP_INV]], float 2.550000e+02, float [[X]] -; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ult float [[X]], 1.000000e+00 -; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 1.000000e+00 +; CHECK-NEXT: [[OUTER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 1.000000e+00 +; CHECK-NEXT: [[R:%.*]] = select nnan ninf i1 [[OUTER_CMP_INV]], float 1.000000e+00, float [[X]] ; CHECK-NEXT: ret float [[R]] ; %inner_cmp = fcmp fast ult float %x, 255.0 diff --git a/llvm/test/Transforms/InstCombine/select-gep.ll b/llvm/test/Transforms/InstCombine/select-gep.ll index dd8dffb..7181336 100644 --- a/llvm/test/Transforms/InstCombine/select-gep.ll +++ b/llvm/test/Transforms/InstCombine/select-gep.ll @@ -286,3 +286,35 @@ define <2 x ptr> @test7(<2 x ptr> %p1, i64 %idx, <2 x i1> %cc) { %select = select <2 x i1> %cc, <2 x ptr> %p1, <2 x ptr> %gep ret <2 x ptr> %select } + +define ptr @ptr_eq_replace_freeze1(ptr %p, ptr %q) { +; CHECK-LABEL: @ptr_eq_replace_freeze1( +; CHECK-NEXT: [[Q_FR:%.*]] = freeze ptr [[Q:%.*]] +; CHECK-NEXT: [[Q_FR1:%.*]] = freeze ptr [[Q1:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[Q_FR]], [[Q_FR1]] +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], ptr [[Q_FR]], ptr [[Q_FR1]] +; CHECK-NEXT: ret ptr [[SELECT]] +; + %p.fr = freeze ptr %p + %q.fr = freeze ptr %q + %cmp = icmp eq ptr %p.fr, %q.fr + %select = select i1 %cmp, ptr %p.fr, ptr %q.fr + ret ptr %select +} + +define ptr @ptr_eq_replace_freeze2(ptr %p, ptr %q) { +; CHECK-LABEL: @ptr_eq_replace_freeze2( +; CHECK-NEXT: [[P_FR:%.*]] = freeze ptr [[P:%.*]] +; CHECK-NEXT: [[P_FR1:%.*]] = freeze ptr [[P1:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr 
[[P_FR1]], [[P_FR]] +; CHECK-NEXT: [[SELECT_V:%.*]] = select i1 [[CMP]], ptr [[P_FR1]], ptr [[P_FR]] +; CHECK-NEXT: [[SELECT:%.*]] = getelementptr i8, ptr [[SELECT_V]], i64 16 +; CHECK-NEXT: ret ptr [[SELECT]] +; + %gep1 = getelementptr i32, ptr %p, i64 4 + %gep2 = getelementptr i32, ptr %q, i64 4 + %cmp = icmp eq ptr %p, %q + %cmp.fr = freeze i1 %cmp + %select = select i1 %cmp.fr, ptr %gep1, ptr %gep2 + ret ptr %select +} diff --git a/llvm/test/Transforms/InstSimplify/domcondition.ll b/llvm/test/Transforms/InstSimplify/domcondition.ll index 43be5de..2893bb1 100644 --- a/llvm/test/Transforms/InstSimplify/domcondition.ll +++ b/llvm/test/Transforms/InstSimplify/domcondition.ll @@ -278,3 +278,210 @@ end: } declare void @foo(i32) + + +define i1 @simplify_fcmp_implied_by_dom_cond_range_true(float %x) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_range_true( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 true +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, 0.0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp olt float %x, 1.0 + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_in_else_implied_by_dom_cond_range_true(float %x) { +; CHECK-LABEL: @simplify_fcmp_in_else_implied_by_dom_cond_range_true( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 1.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 true +; CHECK: if.else: +; CHECK-NEXT: ret i1 true +; + %cmp = fcmp olt float %x, 1.0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + ret i1 true + +if.else: + %cmp2 = fcmp uge float %x, 0.5 + ret i1 %cmp2 +} + +define i1 @simplify_fcmp_implied_by_dom_cond_range_false(float %x) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_range_false( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 false +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, 0.0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp ogt float %x, 1.0 + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_pred_true(float %x, float %y) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_true( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 true +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp ole float %x, %y + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_pred_false(float %x, float %y) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_false( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 false +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp ogt float %x, %y + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_pred_commuted(float %x, float %y) { +; 
CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_commuted( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 true +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp oge float %y, %x + ret i1 %cmp2 + +if.else: + ret i1 false +} + +; Negative tests + +define i1 @simplify_fcmp_implied_by_dom_cond_wrong_range(float %x) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_wrong_range( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[X]], -1.000000e+00 +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, 0.0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp olt float %x, -1.0 + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_range_mismatched_operand(float %x, float %y) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_range_mismatched_operand( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[Y:%.*]], 1.000000e+00 +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, 0.0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp olt float %y, 1.0 + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_wrong_pred(float %x, float %y) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_wrong_pred( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ole float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP2:%.*]] = fcmp olt float [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp ole float %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp olt float %x, %y + ret i1 %cmp2 + +if.else: + ret i1 false +} + +define i1 @simplify_fcmp_implied_by_dom_cond_pred_mismatched_operand(float %x, float %y, float %z) { +; CHECK-LABEL: @simplify_fcmp_implied_by_dom_cond_pred_mismatched_operand( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP2:%.*]] = fcmp ole float [[X]], [[Z:%.*]] +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cmp = fcmp olt float %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = fcmp ole float %x, %z + ret i1 %cmp2 + +if.else: + ret i1 false +} diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c index 0f09c74..e376d82 100644 --- a/llvm/tools/llvm-c-test/debuginfo.c +++ b/llvm/tools/llvm-c-test/debuginfo.c @@ -325,6 +325,13 @@ int llvm_test_dibuilder(void) { LLVMValueRef Phi2 = LLVMBuildPhi(Builder, I64, "p2"); LLVMAddIncoming(Phi2, &Zero, &FooEntryBlock, 1); + // Test that LLVMGetFirstDbgRecord and LLVMGetLastDbgRecord return NULL for + // instructions without debug info. 
+  LLVMDbgRecordRef Phi1FirstDbgRecord = LLVMGetFirstDbgRecord(Phi1);
+  assert(Phi1FirstDbgRecord == NULL);
+  LLVMDbgRecordRef Phi1LastDbgRecord = LLVMGetLastDbgRecord(Phi1);
+  assert(Phi1LastDbgRecord == NULL);
+
   // Insert a non-phi before the `ret` but not before the debug records to
   // test that works as expected.
   LLVMPositionBuilder(Builder, FooVarBlock, Ret);
diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp
index ba55079..2a89961 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -642,7 +642,8 @@ int main(int argc, char **argv) {
                     : MAB->createObjectWriter(*OS),
                 std::unique_ptr<MCCodeEmitter>(CE), *STI));
     if (NoExecStack)
-      Str->switchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx));
+      Str->switchSection(
+          Ctx.getAsmInfo()->getStackSection(Ctx, /*Exec=*/false));
     Str->emitVersionForTarget(TheTriple, VersionTuple(), nullptr,
                               VersionTuple());
   }
diff --git a/llvm/unittests/ADT/TypeSwitchTest.cpp b/llvm/unittests/ADT/TypeSwitchTest.cpp
index c54b798..a7d9342 100644
--- a/llvm/unittests/ADT/TypeSwitchTest.cpp
+++ b/llvm/unittests/ADT/TypeSwitchTest.cpp
@@ -114,3 +114,31 @@ TEST(TypeSwitchTest, CasesOptional) {
   EXPECT_EQ(std::nullopt, translate(DerivedC()));
   EXPECT_EQ(-1, translate(DerivedD()));
 }
+
+TEST(TypeSwitchTest, DefaultUnreachableWithValue) {
+  auto translate = [](auto value) {
+    return TypeSwitch<Base *, int>(&value)
+        .Case([](DerivedA *) { return 0; })
+        .DefaultUnreachable("Unhandled type");
+  };
+  EXPECT_EQ(0, translate(DerivedA()));
+
+#if defined(GTEST_HAS_DEATH_TEST) && !defined(NDEBUG)
+  EXPECT_DEATH((void)translate(DerivedD()), "Unhandled type");
+#endif
+}
+
+TEST(TypeSwitchTest, DefaultUnreachableWithVoid) {
+  auto translate = [](auto value) {
+    int result = -1;
+    TypeSwitch<Base *>(&value)
+        .Case([&result](DerivedA *) { result = 0; })
+        .DefaultUnreachable("Unhandled type");
+    return result;
+  };
+  EXPECT_EQ(0, translate(DerivedA()));
+
+#if defined(GTEST_HAS_DEATH_TEST) && !defined(NDEBUG)
+  EXPECT_DEATH((void)translate(DerivedD()), "Unhandled type");
+#endif
+}
diff --git a/mlir/docs/Tutorials/CreatingADialect.md b/mlir/docs/Tutorials/CreatingADialect.md
index af709fc..e30b80bc 100644
--- a/mlir/docs/Tutorials/CreatingADialect.md
+++ b/mlir/docs/Tutorials/CreatingADialect.md
@@ -22,7 +22,7 @@ typically defined in FooDialect.cpp, which includes FooOps.cpp.inc and
 FooOpsInterfaces.h.inc.
 
 The 'Transforms' directory contains rewrite rules for the dialect,
-typically described in TableGen file using the [DDR
+typically described in TableGen file using the [DRR
 format](../DeclarativeRewrites.md).
 
 Note that dialect names should not generally be suffixed with “Ops”,
diff --git a/mlir/include/mlir-c/Rewrite.h b/mlir/include/mlir-c/Rewrite.h
index 77be1f4..5dd285e 100644
--- a/mlir/include/mlir-c/Rewrite.h
+++ b/mlir/include/mlir-c/Rewrite.h
@@ -101,6 +101,12 @@ mlirRewriterBaseGetInsertionBlock(MlirRewriterBase rewriter);
 
 MLIR_CAPI_EXPORTED MlirBlock
 mlirRewriterBaseGetBlock(MlirRewriterBase rewriter);
 
+/// Returns the operation right after the current insertion point
+/// of the rewriter. A null MlirOperation will be returned
+/// if the current insertion point is at the end of the block.
+MLIR_CAPI_EXPORTED MlirOperation
+mlirRewriterBaseGetOperationAfterInsertion(MlirRewriterBase rewriter);
+
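For orientation, here is a minimal sketch of how a client might consume the two insertion-point queries declared above. The helper below is invented for illustration and is not part of this patch; it assumes a valid `MlirRewriterBase` obtained elsewhere:

```cpp
#include "mlir-c/IR.h"
#include "mlir-c/Rewrite.h"

// Illustrative only: report where the rewriter will place new operations.
// mlirRewriterBaseGetOperationAfterInsertion returns a null MlirOperation
// when the insertion point sits at the end of its block.
static void dumpOperationAfterInsertion(MlirRewriterBase rewriter) {
  MlirOperation next = mlirRewriterBaseGetOperationAfterInsertion(rewriter);
  if (mlirOperationIsNull(next)) {
    // At block end: new operations are appended to the insertion block.
    MlirBlock block = mlirRewriterBaseGetInsertionBlock(rewriter);
    (void)block;
    return;
  }
  // Otherwise, new operations are created immediately before `next`.
  mlirOperationDump(next);
}
```

This mirrors what `PyPatternRewriter::getInsertionPoint` in the Python bindings below does: a null result maps to a block-anchored insertion point, a non-null result to an operation-anchored one.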
 //===----------------------------------------------------------------------===//
 /// Block and operation creation/insertion/cloning
 //===----------------------------------------------------------------------===//
@@ -311,6 +317,14 @@ MLIR_CAPI_EXPORTED MlirLogicalResult mlirApplyPatternsAndFoldGreedily(
     MlirGreedyRewriteDriverConfig);
 
 //===----------------------------------------------------------------------===//
+/// PatternRewriter API
+//===----------------------------------------------------------------------===//
+
+/// Cast the PatternRewriter to a RewriterBase
+MLIR_CAPI_EXPORTED MlirRewriterBase
+mlirPatternRewriterAsBase(MlirPatternRewriter rewriter);
+
+//===----------------------------------------------------------------------===//
 /// PDLPatternModule API
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 32b2b0c..7b17106 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -2046,6 +2046,9 @@ PyInsertionPoint::PyInsertionPoint(PyOperationBase &beforeOperationBase)
     : refOperation(beforeOperationBase.getOperation().getRef()),
       block((*refOperation)->getBlock()) {}
 
+PyInsertionPoint::PyInsertionPoint(PyOperationRef beforeOperationRef)
+    : refOperation(beforeOperationRef), block((*refOperation)->getBlock()) {}
+
 void PyInsertionPoint::insert(PyOperationBase &operationBase) {
   PyOperation &operation = operationBase.getOperation();
   if (operation.isAttached())
diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h
index edbd73e..e706be3b 100644
--- a/mlir/lib/Bindings/Python/IRModule.h
+++ b/mlir/lib/Bindings/Python/IRModule.h
@@ -841,6 +841,8 @@ public:
   PyInsertionPoint(const PyBlock &block);
   /// Creates an insertion point positioned before a reference operation.
   PyInsertionPoint(PyOperationBase &beforeOperationBase);
+  /// Creates an insertion point positioned before a reference operation.
+  PyInsertionPoint(PyOperationRef beforeOperationRef);
   /// Shortcut to create an insertion point at the beginning of the block.
static PyInsertionPoint atBlockBegin(PyBlock &block); diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp index 836f44fd..9e3d970 100644 --- a/mlir/lib/Bindings/Python/Rewrite.cpp +++ b/mlir/lib/Bindings/Python/Rewrite.cpp @@ -26,6 +26,30 @@ using namespace mlir::python; namespace { +class PyPatternRewriter { +public: + PyPatternRewriter(MlirPatternRewriter rewriter) + : base(mlirPatternRewriterAsBase(rewriter)), + ctx(PyMlirContext::forContext(mlirRewriterBaseGetContext(base))) {} + + PyInsertionPoint getInsertionPoint() const { + MlirBlock block = mlirRewriterBaseGetInsertionBlock(base); + MlirOperation op = mlirRewriterBaseGetOperationAfterInsertion(base); + + if (mlirOperationIsNull(op)) { + MlirOperation owner = mlirBlockGetParentOperation(block); + auto parent = PyOperation::forOperation(ctx, owner); + return PyInsertionPoint(PyBlock(parent, block)); + } + + return PyInsertionPoint(PyOperation::forOperation(ctx, op)); + } + +private: + MlirRewriterBase base; + PyMlirContextRef ctx; +}; + #if MLIR_ENABLE_PDL_IN_PATTERNMATCH static nb::object objectFromPDLValue(MlirPDLValue value) { if (MlirValue v = mlirPDLValueAsValue(value); !mlirValueIsNull(v)) @@ -84,7 +108,8 @@ public: void *userData) -> MlirLogicalResult { nb::handle f = nb::handle(static_cast<PyObject *>(userData)); return logicalResultFromObject( - f(rewriter, results, objectsFromPDLValues(nValues, values))); + f(PyPatternRewriter(rewriter), results, + objectsFromPDLValues(nValues, values))); }, fn.ptr()); } @@ -98,7 +123,8 @@ public: void *userData) -> MlirLogicalResult { nb::handle f = nb::handle(static_cast<PyObject *>(userData)); return logicalResultFromObject( - f(rewriter, results, objectsFromPDLValues(nValues, values))); + f(PyPatternRewriter(rewriter), results, + objectsFromPDLValues(nValues, values))); }, fn.ptr()); } @@ -143,7 +169,9 @@ private: /// Create the `mlir.rewrite` here. 
void mlir::python::populateRewriteSubmodule(nb::module_ &m) { - nb::class_<MlirPatternRewriter>(m, "PatternRewriter"); + nb::class_<PyPatternRewriter>(m, "PatternRewriter") + .def_prop_ro("ip", &PyPatternRewriter::getInsertionPoint, + "The current insertion point of the PatternRewriter."); //---------------------------------------------------------------------------- // Mapping of the PDLResultList and PDLModule //---------------------------------------------------------------------------- diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp index 0d56259..c15a73b 100644 --- a/mlir/lib/CAPI/Transforms/Rewrite.cpp +++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp @@ -70,6 +70,17 @@ MlirBlock mlirRewriterBaseGetBlock(MlirRewriterBase rewriter) { return wrap(unwrap(rewriter)->getBlock()); } +MlirOperation +mlirRewriterBaseGetOperationAfterInsertion(MlirRewriterBase rewriter) { + mlir::RewriterBase *base = unwrap(rewriter); + mlir::Block *block = base->getInsertionBlock(); + mlir::Block::iterator it = base->getInsertionPoint(); + if (it == block->end()) + return {nullptr}; + + return wrap(std::addressof(*it)); +} + //===----------------------------------------------------------------------===// /// Block and operation creation/insertion/cloning //===----------------------------------------------------------------------===// @@ -317,6 +328,10 @@ inline MlirPatternRewriter wrap(mlir::PatternRewriter *rewriter) { return {rewriter}; } +MlirRewriterBase mlirPatternRewriterAsBase(MlirPatternRewriter rewriter) { + return wrap(static_cast<mlir::RewriterBase *>(unwrap(rewriter))); +} + //===----------------------------------------------------------------------===// /// PDLPatternModule API //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp b/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp index 7b9ed1d..79c2f23 100644 --- a/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp +++ b/mlir/lib/Conversion/VectorToAMX/VectorToAMX.cpp @@ -363,7 +363,7 @@ static TypedValue<VectorType> storeTile(PatternRewriter &rewriter, } struct ContractionToAMX : public OpRewritePattern<vector::ContractionOp> { - using OpRewritePattern::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ContractionOp contractOp, PatternRewriter &rewriter) const override { diff --git a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp index 4e1da39..363685a 100644 --- a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp +++ b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp @@ -44,7 +44,7 @@ namespace { /// arm_sme.tile_load ... layout<vertical> struct TransferReadToArmSMELowering : public OpRewritePattern<vector::TransferReadOp> { - using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransferReadOp transferReadOp, PatternRewriter &rewriter) const final { @@ -120,7 +120,7 @@ struct TransferReadToArmSMELowering /// : memref<?x?xi8>, vector<[16]x[16]xi8> struct TransferWriteToArmSMELowering : public OpRewritePattern<vector::TransferWriteOp> { - using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp, PatternRewriter &rewriter) const final { @@ -157,7 +157,7 @@ struct TransferWriteToArmSMELowering /// Conversion pattern for vector.load. 
struct VectorLoadToArmSMELowering : public OpRewritePattern<vector::LoadOp> { - using OpRewritePattern<vector::LoadOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::LoadOp load, PatternRewriter &rewriter) const override { @@ -173,7 +173,7 @@ struct VectorLoadToArmSMELowering : public OpRewritePattern<vector::LoadOp> { /// Conversion pattern for vector.store. struct VectorStoreToArmSMELowering : public OpRewritePattern<vector::StoreOp> { - using OpRewritePattern<vector::StoreOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::StoreOp store, PatternRewriter &rewriter) const override { @@ -208,7 +208,7 @@ struct VectorStoreToArmSMELowering : public OpRewritePattern<vector::StoreOp> { /// Supports scalar, 0-d vector, and 1-d vector broadcasts. struct BroadcastOpToArmSMELowering : public OpRewritePattern<vector::BroadcastOp> { - using OpRewritePattern<vector::BroadcastOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::BroadcastOp broadcastOp, PatternRewriter &rewriter) const final { @@ -279,7 +279,7 @@ struct BroadcastOpToArmSMELowering /// implementation, perhaps with tile <-> vector (MOVA) ops. struct TransposeOpToArmSMELowering : public OpRewritePattern<vector::TransposeOp> { - using OpRewritePattern<vector::TransposeOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransposeOp transposeOp, PatternRewriter &rewriter) const final { @@ -372,7 +372,7 @@ struct TransposeOpToArmSMELowering struct VectorOuterProductToArmSMELowering : public OpRewritePattern<vector::OuterProductOp> { - using OpRewritePattern<vector::OuterProductOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::OuterProductOp outerProductOp, PatternRewriter &rewriter) const override { @@ -451,7 +451,7 @@ struct VectorOuterProductToArmSMELowering /// ``` struct VectorExtractToArmSMELowering : public OpRewritePattern<vector::ExtractOp> { - using OpRewritePattern<vector::ExtractOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ExtractOp extractOp, PatternRewriter &rewriter) const override { @@ -507,7 +507,7 @@ struct VectorExtractToArmSMELowering /// ``` struct VectorInsertToArmSMELowering : public OpRewritePattern<vector::InsertOp> { - using OpRewritePattern<vector::InsertOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::InsertOp insertOp, PatternRewriter &rewriter) const override { @@ -568,7 +568,7 @@ struct VectorInsertToArmSMELowering /// } /// ``` struct VectorPrintToArmSMELowering : public OpRewritePattern<vector::PrintOp> { - using OpRewritePattern<vector::PrintOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::PrintOp printOp, PatternRewriter &rewriter) const override { @@ -623,7 +623,7 @@ struct VectorPrintToArmSMELowering : public OpRewritePattern<vector::PrintOp> { /// ``` struct FoldTransferWriteOfExtractTileSlice : public OpRewritePattern<vector::TransferWriteOp> { - using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp, PatternRewriter &rewriter) const final { @@ -679,7 +679,7 @@ struct FoldTransferWriteOfExtractTileSlice /// ``` struct ExtractFromCreateMaskToPselLowering : public OpRewritePattern<vector::ExtractOp> { - using OpRewritePattern<vector::ExtractOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ExtractOp extractOp, PatternRewriter &rewriter) const 
override { @@ -734,7 +734,7 @@ struct ExtractFromCreateMaskToPselLowering // Convert all `vector.splat` to `vector.broadcast`. There is a path from // `vector.broadcast` to ArmSME via another pattern. struct ConvertSplatToBroadcast : public OpRewritePattern<vector::SplatOp> { - using OpRewritePattern<vector::SplatOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::SplatOp splatOp, PatternRewriter &rewriter) const final { diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp index d6a2622..9843435 100644 --- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp +++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp @@ -386,7 +386,7 @@ namespace { // to MMA matmul. struct PrepareContractToGPUMMA : public OpRewritePattern<vector::ContractionOp> { - using OpRewritePattern<vector::ContractionOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ContractionOp op, PatternRewriter &rewriter) const override { @@ -450,7 +450,7 @@ struct PrepareContractToGPUMMA // Shared Memory to registers. struct CombineTransferReadOpTranspose final : public OpRewritePattern<vector::TransposeOp> { - using OpRewritePattern<vector::TransposeOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransposeOp op, PatternRewriter &rewriter) const override { diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index e0b1a88..5461646 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1342,7 +1342,7 @@ struct VectorScalableExtractOpLowering /// ``` class VectorFMAOpNDRewritePattern : public OpRewritePattern<FMAOp> { public: - using OpRewritePattern<FMAOp>::OpRewritePattern; + using Base::Base; void initialize() { // This pattern recursively unpacks one dimension at a time. 
The recursion @@ -2127,7 +2127,7 @@ FailureOr<Value> ContractionOpToMatmulOpLowering::matchAndRewriteMaskableOp( class TransposeOpToMatrixTransposeOpLowering : public OpRewritePattern<vector::TransposeOp> { public: - using OpRewritePattern<TransposeOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransposeOp op, PatternRewriter &rewriter) const override { diff --git a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp index 5061a44..311ff6f 100644 --- a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp +++ b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp @@ -838,7 +838,7 @@ struct VectorStoreOpConverter final struct VectorReductionToIntDotProd final : OpRewritePattern<vector::ReductionOp> { - using OpRewritePattern::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ReductionOp op, PatternRewriter &rewriter) const override { diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index 9f5585a..e2c7d80 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -475,7 +475,7 @@ static LogicalResult lowerToScatteredStoreOp(vector::TransferWriteOp writeOp, } struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> { - using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransferReadOp readOp, PatternRewriter &rewriter) const override { @@ -546,7 +546,7 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> { struct TransferWriteLowering : public OpRewritePattern<vector::TransferWriteOp> { - using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp, PatternRewriter &rewriter) const override { @@ -597,7 +597,7 @@ struct TransferWriteLowering }; struct GatherLowering : public OpRewritePattern<vector::GatherOp> { - using OpRewritePattern<vector::GatherOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::GatherOp gatherOp, PatternRewriter &rewriter) const override { @@ -632,7 +632,7 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> { }; struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> { - using OpRewritePattern<vector::ScatterOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::ScatterOp scatterOp, PatternRewriter &rewriter) const override { @@ -662,7 +662,7 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> { }; struct LoadLowering : public OpRewritePattern<vector::LoadOp> { - using OpRewritePattern<vector::LoadOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::LoadOp loadOp, PatternRewriter &rewriter) const override { @@ -694,7 +694,7 @@ struct LoadLowering : public OpRewritePattern<vector::LoadOp> { }; struct StoreLowering : public OpRewritePattern<vector::StoreOp> { - using OpRewritePattern<vector::StoreOp>::OpRewritePattern; + using Base::Base; LogicalResult matchAndRewrite(vector::StoreOp storeOp, PatternRewriter &rewriter) const override { @@ -727,7 +727,7 @@ struct StoreLowering : public OpRewritePattern<vector::StoreOp> { }; struct ContractionLowering : public OpRewritePattern<vector::ContractionOp> { - using OpRewritePattern<vector::ContractionOp>::OpRewritePattern; + using Base::Base; 
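// Note on the recurring change above: OpRewritePattern<SourceOp> is assumed
// to declare a member alias `using Base = OpRewritePattern<SourceOp>;`, so
// `using Base::Base;` inherits exactly the constructors that the longer
// `using OpRewritePattern<SourceOp>::OpRewritePattern;` spelling did, while
// naming the base class only once per pattern.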
LogicalResult matchAndRewrite(vector::ContractionOp contractOp, PatternRewriter &rewriter) const override { diff --git a/mlir/test/python/integration/dialects/pdl.py b/mlir/test/python/integration/dialects/pdl.py index c8e6197..fe27dd4 100644 --- a/mlir/test/python/integration/dialects/pdl.py +++ b/mlir/test/python/integration/dialects/pdl.py @@ -16,6 +16,7 @@ def construct_and_print_in_module(f): print(module) return f + def get_pdl_patterns(): # Create a rewrite from add to mul. This will match # - operation name is arith.addi @@ -121,8 +122,10 @@ def load_myint_dialect(): # This PDL pattern is to fold constant additions, -# i.e. add(constant0, constant1) -> constant2 -# where constant2 = constant0 + constant1. +# via two patterns: +# 1. add(constant0, constant1) -> constant2 +# where constant2 = constant0 + constant1; +# 2. add(x, 0) or add(0, x) -> x. def get_pdl_pattern_fold(): m = Module.create() i32 = IntegerType.get_signless(32) @@ -237,3 +240,87 @@ def test_pdl_register_function_constraint(module_): apply_patterns_and_fold_greedily(module_, frozen) return module_ + + +# This pattern expands a constant into additions +# unless the constant is no more than 1, +# e.g. 3 -> 1 + 2 -> 1 + (1 + 1). +def get_pdl_pattern_expand(): + m = Module.create() + i32 = IntegerType.get_signless(32) + with InsertionPoint(m.body): + + @pdl.pattern(benefit=1, sym_name="myint_constant_expand") + def pat(): + t = pdl.TypeOp(i32) + cst = pdl.AttributeOp() + pdl.apply_native_constraint([], "is_one", [cst]) + op0 = pdl.OperationOp( + name="myint.constant", attributes={"value": cst}, types=[t] + ) + + @pdl.rewrite() + def rew(): + expanded = pdl.apply_native_rewrite( + [pdl.OperationType.get()], "expand", [cst] + ) + pdl.ReplaceOp(op0, with_op=expanded) + + def is_one(rewriter, results, values): + cst = values[0].value + return cst <= 1 + + def expand(rewriter, results, values): + cst = values[0].value + c1 = cst // 2 + c2 = cst - c1 + with rewriter.ip: + op1 = Operation.create( + "myint.constant", + results=[i32], + attributes={"value": IntegerAttr.get(i32, c1)}, + ) + op2 = Operation.create( + "myint.constant", + results=[i32], + attributes={"value": IntegerAttr.get(i32, c2)}, + ) + res = Operation.create( + "myint.add", results=[i32], operands=[op1.result, op2.result] + ) + results.append(res) + + pdl_module = PDLModule(m) + pdl_module.register_constraint_function("is_one", is_one) + pdl_module.register_rewrite_function("expand", expand) + return pdl_module.freeze() + + +# CHECK-LABEL: TEST: test_pdl_register_function_expand +# CHECK: %0 = "myint.constant"() {value = 1 : i32} : () -> i32 +# CHECK: %1 = "myint.constant"() {value = 1 : i32} : () -> i32 +# CHECK: %2 = "myint.add"(%0, %1) : (i32, i32) -> i32 +# CHECK: %3 = "myint.constant"() {value = 1 : i32} : () -> i32 +# CHECK: %4 = "myint.constant"() {value = 1 : i32} : () -> i32 +# CHECK: %5 = "myint.constant"() {value = 1 : i32} : () -> i32 +# CHECK: %6 = "myint.add"(%4, %5) : (i32, i32) -> i32 +# CHECK: %7 = "myint.add"(%3, %6) : (i32, i32) -> i32 +# CHECK: %8 = "myint.add"(%2, %7) : (i32, i32) -> i32 +# CHECK: return %8 : i32 +@construct_and_print_in_module +def test_pdl_register_function_expand(module_): + load_myint_dialect() + + module_ = Module.parse( + """ + func.func @f() -> i32 { + %0 = "myint.constant"() { value = 5 }: () -> (i32) + return %0 : i32 + } + """ + ) + + frozen = get_pdl_pattern_expand() + apply_patterns_and_fold_greedily(module_, frozen) + + return module_
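For intuition, the `expand` rewrite above halves a constant and recurses via the greedy driver: each `myint.constant` it creates is matched again until every summand is 1. A minimal standalone sketch of that decomposition (plain C++, no MLIR; the function name is illustrative):

```cpp
#include <cstdio>

// Expand n into nested additions of 1s by the same halving rule the PDL
// rewrite uses: c1 = n / 2, c2 = n - c1, recursing while a part exceeds 1.
void expand(int n) {
  if (n <= 1) {
    std::printf("%d", n);
    return;
  }
  int c1 = n / 2;  // first summand
  int c2 = n - c1; // second summand (absorbs the remainder for odd n)
  std::printf("(");
  expand(c1);
  std::printf(" + ");
  expand(c2);
  std::printf(")");
}

int main() {
  expand(5); // mirrors the test input %0 = "myint.constant"() { value = 5 }
  std::printf("\n");
}
```

For the test input 5 this prints `((1 + 1) + (1 + (1 + 1)))`, matching the CHECK lines: 5 splits into 2 + 3, and 3 into 1 + 2.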
diff --git a/orc-rt/include/orc-rt/SPSWrapperFunction.h b/orc-rt/include/orc-rt/SPSWrapperFunction.h index 14a3d8e..3ed3295 100644 --- a/orc-rt/include/orc-rt/SPSWrapperFunction.h +++ b/orc-rt/include/orc-rt/SPSWrapperFunction.h @@ -57,8 +57,8 @@ private: template <typename... Ts> using DeserializableTuple_t = typename DeserializableTuple<Ts...>::type; - template <typename T> static T fromSerializable(T &&Arg) noexcept { - return Arg; + template <typename T> static T &&fromSerializable(T &&Arg) noexcept { + return std::forward<T>(Arg); } static Error fromSerializable(SPSSerializableError Err) noexcept { @@ -86,7 +86,10 @@ public: decltype(Args)>::deserialize(IB, Args)) return std::nullopt; return std::apply( - [](auto &&...A) { return ArgTuple(fromSerializable(A)...); }, + [](auto &&...A) { + return std::optional<ArgTuple>(std::in_place, + std::move(fromSerializable(A))...); + }, std::move(Args)); } }; diff --git a/orc-rt/include/orc-rt/WrapperFunction.h b/orc-rt/include/orc-rt/WrapperFunction.h index ca165db..47e770f 100644 --- a/orc-rt/include/orc-rt/WrapperFunction.h +++ b/orc-rt/include/orc-rt/WrapperFunction.h @@ -111,7 +111,23 @@ struct WFHandlerTraitsImpl { static_assert(std::is_void_v<RetT>, "Async wrapper function handler must return void"); typedef ReturnT YieldType; - typedef std::tuple<ArgTs...> ArgTupleType; + typedef std::tuple<std::decay_t<ArgTs>...> ArgTupleType; + + // Forwards arguments based on the parameter types of the handler. + template <typename FnT> class ForwardArgsAsRequested { + public: + ForwardArgsAsRequested(FnT &&Fn) : Fn(std::move(Fn)) {} + void operator()(ArgTs &...Args) { Fn(std::forward<ArgTs>(Args)...); } + + private: + FnT Fn; + }; + + template <typename FnT> + static ForwardArgsAsRequested<std::decay_t<FnT>> + forwardArgsAsRequested(FnT &&Fn) { + return ForwardArgsAsRequested<std::decay_t<FnT>>(std::forward<FnT>(Fn)); + } }; template <typename C> @@ -244,10 +260,11 @@ struct WrapperFunction { if (auto Args = S.arguments().template deserialize<ArgTuple>(std::move(ArgBytes))) - std::apply(bind_front(std::forward<Handler>(H), - detail::StructuredYield<RetTupleType, Serializer>( - Session, CallCtx, Return, std::move(S))), - std::move(*Args)); + std::apply(HandlerTraits::forwardArgsAsRequested(bind_front( + std::forward<Handler>(H), + detail::StructuredYield<RetTupleType, Serializer>( + Session, CallCtx, Return, std::move(S)))), + *Args); else Return(Session, CallCtx, WrapperFunctionBuffer::createOutOfBandError( diff --git a/orc-rt/unittests/CMakeLists.txt b/orc-rt/unittests/CMakeLists.txt index 54c453d..4d3da68 100644 --- a/orc-rt/unittests/CMakeLists.txt +++ b/orc-rt/unittests/CMakeLists.txt @@ -15,7 +15,6 @@ add_orc_rt_unittest(CoreTests AllocActionTest.cpp BitmaskEnumTest.cpp CallableTraitsHelperTest.cpp - CommonTestUtils.cpp ErrorTest.cpp ExecutorAddressTest.cpp IntervalMapTest.cpp diff --git a/orc-rt/unittests/CommonTestUtils.cpp b/orc-rt/unittests/CommonTestUtils.cpp deleted file mode 100644 index d9f9433..0000000 --- a/orc-rt/unittests/CommonTestUtils.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===- CommonTestUtils.cpp ------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Common test utilities.
-// -//===----------------------------------------------------------------------===// - -#include "CommonTestUtils.h" - -size_t OpCounter::DefaultConstructions = 0; -size_t OpCounter::CopyConstructions = 0; -size_t OpCounter::CopyAssignments = 0; -size_t OpCounter::MoveConstructions = 0; -size_t OpCounter::MoveAssignments = 0; -size_t OpCounter::Destructions = 0; diff --git a/orc-rt/unittests/CommonTestUtils.h b/orc-rt/unittests/CommonTestUtils.h index 5ff2c8e..1c66bdd 100644 --- a/orc-rt/unittests/CommonTestUtils.h +++ b/orc-rt/unittests/CommonTestUtils.h @@ -11,7 +11,7 @@ #include <cstddef> -class OpCounter { +template <size_t Idx = 0> class OpCounter { public: OpCounter() { ++DefaultConstructions; } OpCounter(const OpCounter &Other) { ++CopyConstructions; } @@ -57,4 +57,11 @@ private: static size_t Destructions; }; +template <size_t Idx> size_t OpCounter<Idx>::DefaultConstructions = 0; +template <size_t Idx> size_t OpCounter<Idx>::CopyConstructions = 0; +template <size_t Idx> size_t OpCounter<Idx>::CopyAssignments = 0; +template <size_t Idx> size_t OpCounter<Idx>::MoveConstructions = 0; +template <size_t Idx> size_t OpCounter<Idx>::MoveAssignments = 0; +template <size_t Idx> size_t OpCounter<Idx>::Destructions = 0; + #endif // ORC_RT_UNITTEST_COMMONTESTUTILS_H diff --git a/orc-rt/unittests/SPSWrapperFunctionTest.cpp b/orc-rt/unittests/SPSWrapperFunctionTest.cpp index c0c86ff..32aaa61 100644 --- a/orc-rt/unittests/SPSWrapperFunctionTest.cpp +++ b/orc-rt/unittests/SPSWrapperFunctionTest.cpp @@ -10,6 +10,8 @@ // //===----------------------------------------------------------------------===// +#include "CommonTestUtils.h" + #include "orc-rt/SPSWrapperFunction.h" #include "orc-rt/WrapperFunction.h" #include "orc-rt/move_only_function.h" @@ -218,3 +220,80 @@ TEST(SPSWrapperFunctionUtilsTest, TestFunctionReturningExpectedFailureCase) { EXPECT_EQ(ErrMsg, "N is not a multiple of 2"); } + +template <size_t N> struct SPSOpCounter {}; + +namespace orc_rt { +template <size_t N> +class SPSSerializationTraits<SPSOpCounter<N>, OpCounter<N>> { +public: + static size_t size(const OpCounter<N> &O) { return 0; } + static bool serialize(SPSOutputBuffer &OB, const OpCounter<N> &O) { + return true; + } + static bool deserialize(SPSInputBuffer &OB, OpCounter<N> &O) { return true; } +}; +} // namespace orc_rt + +static void +handle_with_reference_types_sps_wrapper(orc_rt_SessionRef Session, + void *CallCtx, + orc_rt_WrapperFunctionReturn Return, + orc_rt_WrapperFunctionBuffer ArgBytes) { + SPSWrapperFunction<void( + SPSOpCounter<0>, SPSOpCounter<1>, SPSOpCounter<2>, + SPSOpCounter<3>)>::handle(Session, CallCtx, Return, ArgBytes, + [](move_only_function<void()> Return, + OpCounter<0>, OpCounter<1> &, + const OpCounter<2> &, + OpCounter<3> &&) { Return(); }); +} + +TEST(SPSWrapperFunctionUtilsTest, TestHandlerWithReferences) { + // Test that we can handle by-value, by-ref, by-const-ref, and by-rvalue-ref + // arguments, and that we generate the expected number of moves. 
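// OpCounter<Idx> (templated in CommonTestUtils.h above) keeps an independent
// set of static construction/copy/move/destruction counters per Idx, which is
// what lets the four arguments below be tracked separately. The empty
// SPSSerializationTraits specialization above serializes each OpCounter<N> as
// zero bytes, so the moves and copies asserted below come from argument
// forwarding, not from encoding the payload.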
+ OpCounter<0>::reset(); + OpCounter<1>::reset(); + OpCounter<2>::reset(); + OpCounter<3>::reset(); + + bool DidRun = false; + SPSWrapperFunction<void(SPSOpCounter<0>, SPSOpCounter<1>, SPSOpCounter<2>, + SPSOpCounter<3>)>:: + call( + DirectCaller(nullptr, handle_with_reference_types_sps_wrapper), + [&](Error R) { + cantFail(std::move(R)); + DidRun = true; + }, + OpCounter<0>(), OpCounter<1>(), OpCounter<2>(), OpCounter<3>()); + + EXPECT_TRUE(DidRun); + + // We expect two default constructions for each parameter: one for the + // argument to call, and one for the object to deserialize into. + EXPECT_EQ(OpCounter<0>::defaultConstructions(), 2U); + EXPECT_EQ(OpCounter<1>::defaultConstructions(), 2U); + EXPECT_EQ(OpCounter<2>::defaultConstructions(), 2U); + EXPECT_EQ(OpCounter<3>::defaultConstructions(), 2U); + + // Pass-by-value: we expect two moves (one for SPS transparent conversion, + // one to move the value into the by-value parameter), and no copies. + EXPECT_EQ(OpCounter<0>::moves(), 2U); + EXPECT_EQ(OpCounter<0>::copies(), 0U); + + // Pass-by-lvalue-reference: we expect one move (for SPS transparent + // conversion), no copies. + EXPECT_EQ(OpCounter<1>::moves(), 1U); + EXPECT_EQ(OpCounter<1>::copies(), 0U); + + // Pass-by-const-lvalue-reference: we expect one move (for SPS transparent + // conversion), no copies. + EXPECT_EQ(OpCounter<2>::moves(), 1U); + EXPECT_EQ(OpCounter<2>::copies(), 0U); + + // Pass-by-rvalue-reference: we expect one move (for SPS transparent + // conversion), no copies. + EXPECT_EQ(OpCounter<3>::moves(), 1U); + EXPECT_EQ(OpCounter<3>::copies(), 0U); +} diff --git a/orc-rt/unittests/bind-test.cpp b/orc-rt/unittests/bind-test.cpp index bfaef4e..93a61e63 100644 --- a/orc-rt/unittests/bind-test.cpp +++ b/orc-rt/unittests/bind-test.cpp @@ -47,28 +47,28 @@ TEST(BindTest, LambdaCapture) { } TEST(BindTest, MinimalMoves) { - OpCounter::reset(); + OpCounter<>::reset(); { - auto B = bind_front([](OpCounter &O, int) {}, OpCounter()); + auto B = bind_front([](OpCounter<> &O, int) {}, OpCounter<>()); B(0); } - EXPECT_EQ(OpCounter::defaultConstructions(), 1U); - EXPECT_EQ(OpCounter::copies(), 0U); - EXPECT_EQ(OpCounter::moves(), 1U); - EXPECT_EQ(OpCounter::destructions(), 2U); + EXPECT_EQ(OpCounter<>::defaultConstructions(), 1U); + EXPECT_EQ(OpCounter<>::copies(), 0U); + EXPECT_EQ(OpCounter<>::moves(), 1U); + EXPECT_EQ(OpCounter<>::destructions(), 2U); } TEST(BindTest, MinimalCopies) { - OpCounter::reset(); + OpCounter<>::reset(); { - OpCounter O; - auto B = bind_front([](OpCounter &O, int) {}, O); + OpCounter<> O; + auto B = bind_front([](OpCounter<> &O, int) {}, O); B(0); } - EXPECT_EQ(OpCounter::defaultConstructions(), 1U); - EXPECT_EQ(OpCounter::copies(), 1U); - EXPECT_EQ(OpCounter::moves(), 0U); - EXPECT_EQ(OpCounter::destructions(), 2U); + EXPECT_EQ(OpCounter<>::defaultConstructions(), 1U); + EXPECT_EQ(OpCounter<>::copies(), 1U); + EXPECT_EQ(OpCounter<>::moves(), 0U); + EXPECT_EQ(OpCounter<>::destructions(), 2U); } TEST(BindTest, ForwardUnboundArgs) { diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 026664b..e91e7c8 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2157,24 +2157,13 @@ libc_function( ########################### math support library ############################### libc_support_library( - name = "common_constants", - srcs = ["src/math/generic/common_constants.cpp"], - hdrs = ["src/math/generic/common_constants.h"], - deps = [
- ":__support_math_acosh_float_constants", - ":__support_math_exp_constants", - ":__support_number_pair", - ], -) - -libc_support_library( name = "log_range_reduction", hdrs = ["src/math/generic/log_range_reduction.h"], deps = [ ":__support_common", ":__support_fputil_dyadic_float", + ":__support_math_common_constants", ":__support_uint128", - ":common_constants", ], ) @@ -2189,8 +2178,8 @@ libc_support_library( ":__support_fputil_polyeval", ":__support_fputil_rounding_mode", ":__support_macros_optimization", + ":__support_math_common_constants", ":__support_math_exp10f_utils", - ":common_constants", ], ) @@ -2558,6 +2547,16 @@ libc_support_library( ) libc_support_library( + name = "__support_math_common_constants", + hdrs = ["src/__support/math/common_constants.h"], + deps = [ + ":__support_math_acosh_float_constants", + ":__support_math_exp_constants", + ":__support_number_pair", + ], +) + +libc_support_library( name = "__support_math_cos", hdrs = ["src/__support/math/cos.h"], deps = [ @@ -2632,8 +2631,8 @@ libc_support_library( ":__support_fputil_polyeval", ":__support_fputil_rounding_mode", ":__support_macros_optimization", + ":__support_math_common_constants", ":__support_sincosf_utils", - ":common_constants", ], ) @@ -2879,6 +2878,24 @@ libc_support_library( ) libc_support_library( + name = "__support_math_exp2", + hdrs = ["src/__support/math/exp2.h"], + deps = [ + ":__support_fputil_double_double", + ":__support_fputil_dyadic_float", + ":__support_fputil_multiply_add", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_fputil_rounding_mode", + ":__support_fputil_triple_double", + ":__support_integer_literals", + ":__support_macros_optimization", + ":__support_math_common_constants", + ":__support_math_exp_utils", + ], +) + +libc_support_library( name = "__support_math_exp10", hdrs = ["src/__support/math/exp10.h"], deps = [ @@ -3652,17 +3669,7 @@ libc_math_function( libc_math_function( name = "exp2", additional_deps = [ - ":__support_fputil_double_double", - ":__support_fputil_dyadic_float", - ":__support_fputil_multiply_add", - ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", - ":__support_fputil_rounding_mode", - ":__support_fputil_triple_double", - ":__support_integer_literals", - ":__support_macros_optimization", - ":__support_math_exp_utils", - ":common_constants", + ":__support_math_exp2", ], ) @@ -3706,7 +3713,7 @@ libc_math_function( ":__support_fputil_triple_double", ":__support_integer_literals", ":__support_macros_optimization", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -3720,7 +3727,7 @@ libc_math_function( ":__support_fputil_rounding_mode", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4233,7 +4240,7 @@ libc_math_function( ":__support_integer_literals", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ":log_range_reduction", ], ) @@ -4246,7 +4253,7 @@ libc_math_function( ":__support_fputil_polyeval", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4268,7 +4275,7 @@ libc_math_function( ":__support_integer_literals", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ":log_range_reduction", ], ) @@ -4281,7 +4288,7 @@ 
libc_math_function( ":__support_fputil_polyeval", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4303,7 +4310,7 @@ libc_math_function( ":__support_integer_literals", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4315,7 +4322,7 @@ libc_math_function( ":__support_fputil_polyeval", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4330,7 +4337,7 @@ libc_math_function( ":__support_integer_literals", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", - ":common_constants", + ":__support_math_common_constants", ":log_range_reduction", ], ) @@ -4342,7 +4349,7 @@ libc_math_function( ":__support_fputil_multiply_add", ":__support_fputil_polyeval", ":__support_macros_optimization", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4488,7 +4495,7 @@ libc_math_function( ":__support_fputil_nearest_integer", ":__support_fputil_polyeval", ":__support_fputil_sqrt", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4503,7 +4510,7 @@ libc_math_function( ":__support_fputil_triple_double", ":__support_macros_optimization", ":__support_math_exp10f", - ":common_constants", + ":__support_math_common_constants", ":exp2f_impl", ], ) @@ -4664,7 +4671,7 @@ libc_math_function( ":__support_fputil_rounding_mode", ":__support_macros_optimization", ":__support_math_sinhfcoshf_utils", - ":common_constants", + ":__support_math_common_constants", ], ) @@ -4771,7 +4778,7 @@ libc_math_function( ":__support_macros_optimization", ":__support_macros_properties_cpu_features", ":__support_math_exp10f_utils", - ":common_constants", + ":__support_math_common_constants", ], )
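Stepping back to the orc-rt change above: the `ForwardArgsAsRequested` shim in WrapperFunction.h stores deserialized arguments as decayed values and re-casts each one to the value category the handler's signature declares, which is exactly what the move/copy counts in the new test pin down. A self-contained sketch of the same trick (illustrative names, not the orc-rt API):

```cpp
#include <cstdio>
#include <tuple>
#include <utility>

struct Probe {
  Probe() = default;
  Probe(const Probe &) { std::puts("copy"); }
  Probe(Probe &&) noexcept { std::puts("move"); }
};

// Call Fn with each stored lvalue re-cast to the category named in ArgTs:
// a plain T parameter is move-constructed from storage, while T&, const T&,
// and T&& parameters bind directly to the stored object.
template <typename... ArgTs, typename FnT>
void callAsRequested(FnT &&Fn, std::tuple<std::decay_t<ArgTs>...> &Args) {
  std::apply(
      [&](auto &...A) { std::forward<FnT>(Fn)(std::forward<ArgTs>(A)...); },
      Args);
}

int main() {
  std::tuple<Probe, Probe, Probe> Args; // stands in for deserialized storage
  // The handler asks for (by-value, const lvalue-ref, rvalue-ref): expect
  // exactly one "move" and no "copy".
  callAsRequested<Probe, const Probe &, Probe &&>(
      [](Probe, const Probe &, Probe &&) {}, Args);
}
```

Run, this prints a single `move`: only the by-value parameter is constructed, while both reference parameters bind without any copy or move, mirroring the counts asserted in TestHandlerWithReferences.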