diff options
33 files changed, 500 insertions, 648 deletions
diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp index 3dd0a50..ca85168 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp @@ -309,10 +309,9 @@ static void lengthExprHandle(const Expr *LengthExpr, // Try to obtain an 'IntegerLiteral' and adjust it. if (!IsMacroDefinition) { if (const auto *LengthIL = dyn_cast<IntegerLiteral>(LengthExpr)) { - size_t NewLength = LengthIL->getValue().getZExtValue() + - (LengthHandle == LengthHandleKind::Increase - ? (isInjectUL(Result) ? 1UL : 1) - : -1); + uint64_t NewLength = + LengthIL->getValue().getZExtValue() + + (LengthHandle == LengthHandleKind::Increase ? 1 : -1); const auto NewLengthFix = FixItHint::CreateReplacement( LengthIL->getSourceRange(), diff --git a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp index 0003429..77262eb 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp @@ -152,6 +152,8 @@ void UseIntegerSignComparisonCheck::check( if (const auto *RHSCast = llvm::dyn_cast<ExplicitCastExpr>(RHS)) { SubExprRHS = RHSCast->getSubExpr(); R2.setEnd(SubExprRHS->getBeginLoc().getLocWithOffset(-1)); + R3.setBegin(Lexer::getLocForEndOfToken( + SubExprRHS->getEndLoc(), 0, *Result.SourceManager, getLangOpts())); } DiagnosticBuilder Diag = diag(BinaryOp->getBeginLoc(), diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index a94dd97..43e4b61 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -276,6 +276,7 @@ Changes in existing checks - Improved :doc:`bugprone-not-null-terminated-result <clang-tidy/checks/bugprone/not-null-terminated-result>` check by fixing + bogus fix-its for ``strncmp`` and ``wcsncmp`` on Windows and a crash caused by certain value-dependent expressions. - Improved :doc:`bugprone-reserved-identifier @@ -341,7 +342,8 @@ Changes in existing checks - Improved :doc:`misc-const-correctness <clang-tidy/checks/misc/const-correctness>` check to avoid false positives when pointers is tranferred to non-const references - and avoid false positives of function pointer. + and avoid false positives of function pointer and fix false + positives on return of non-const pointer. - Improved :doc:`misc-header-include-cycle <clang-tidy/checks/misc/header-include-cycle>` check performance. @@ -363,6 +365,11 @@ Changes in existing checks <clang-tidy/checks/modernize/use-designated-initializers>` check to suggest using designated initializers for aliased aggregate types. +- Improved :doc:`modernize-use-integer-sign-comparison + <clang-tidy/checks/modernize/use-integer-sign-comparison>` by providing + correct fix-its when the right-hand side of a comparison contains a + non-C-style cast. + - Improved :doc:`modernize-use-nullptr <clang-tidy/checks/modernize/use-nullptr>` check by fixing a crash on Windows when the check was enabled with a 32-bit :program:`clang-tidy` diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c index dccf4ed..ca86986 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c @@ -1,11 +1,6 @@ // RUN: %check_clang_tidy %s bugprone-not-null-terminated-result %t -- \ // RUN: -- -I %S/Inputs/not-null-terminated-result -// FIXME: Something wrong with the APInt un/signed conversion on Windows: -// in 'strncmp(str6, "string", 7);' it tries to inject '4294967302' as length. - -// UNSUPPORTED: system-windows - #include "not-null-terminated-result-c.h" #define __STDC_LIB_EXT1__ 1 diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp index 8047db3..688e414 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp @@ -1,11 +1,6 @@ // RUN: %check_clang_tidy -std=c++11-or-later %s bugprone-not-null-terminated-result %t -- \ // RUN: -- -I %S/Inputs/not-null-terminated-result -// FIXME: Something wrong with the APInt un/signed conversion on Windows: -// in 'wcsncmp(wcs6, L"string", 7);' it tries to inject '4294967302' as length. - -// UNSUPPORTED: system-windows - #include "not-null-terminated-result-cxx.h" #define __STDC_LIB_EXT1__ 1 diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp index e20680c..4c847b5 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp @@ -48,6 +48,11 @@ void ignore_const_alias() { p_local0 = &a[1]; } +void *return_non_const() { + void *const a = nullptr; + return a; +} + void function_pointer_basic() { void (*const fp)() = nullptr; fp(); diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp index 1f26ff3..31a3677 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp @@ -92,8 +92,7 @@ int AllComparisons() { if (static_cast<unsigned int>(uArray[2]) < static_cast<int>(sArray[2])) return 0; // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] -// CHECK-FIXES: if (q20::cmp_less(uArray[2],sArray[2]))) -// FIXME: There should only be 2 closing braces. The fix-it inserts an unbalanced one. +// CHECK-FIXES: if (q20::cmp_less(uArray[2],sArray[2])) if ((unsigned int)uArray[3] < (int)sArray[3]) return 0; @@ -116,6 +115,11 @@ int AllComparisons() { // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] // CHECK-FIXES: if (q20::cmp_greater(uArray[6] , VALUE)) + if (unsigned(uArray[7]) >= int(sArray[7])) + return 0; +// CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] +// CHECK-FIXES: if (q20::cmp_greater_equal(uArray[7],sArray[7])) + FuncParameters(uVar); TemplateFuncParameter(sVar); diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp index 628cee0..e7981a6 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp @@ -91,8 +91,7 @@ int AllComparisons() { if (static_cast<unsigned int>(uArray[2]) < static_cast<int>(sArray[2])) return 0; // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] -// CHECK-FIXES: if (std::cmp_less(uArray[2],sArray[2]))) -// FIXME: There should only be 2 closing braces. The fix-it inserts an unbalanced one. +// CHECK-FIXES: if (std::cmp_less(uArray[2],sArray[2])) if ((unsigned int)uArray[3] < (int)sArray[3]) return 0; @@ -115,6 +114,11 @@ int AllComparisons() { // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] // CHECK-FIXES: if (std::cmp_greater(uArray[6] , VALUE)) + if (unsigned(uArray[7]) >= int(sArray[7])) + return 0; +// CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] +// CHECK-FIXES: if (std::cmp_greater_equal(uArray[7],sArray[7])) + FuncParameters(uVar); TemplateFuncParameter(sVar); diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index 1e376da..75b17c54 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -140,7 +140,8 @@ class ExprPointeeResolve { // explicit cast will be checked in `findPointeeToNonConst` const CastKind kind = ICE->getCastKind(); if (kind == CK_LValueToRValue || kind == CK_DerivedToBase || - kind == CK_UncheckedDerivedToBase) + kind == CK_UncheckedDerivedToBase || + (kind == CK_NoOp && (ICE->getType() == ICE->getSubExpr()->getType()))) return resolveExpr(ICE->getSubExpr()); return false; } @@ -788,13 +789,16 @@ ExprMutationAnalyzer::Analyzer::findPointeeToNonConst(const Expr *Exp) { // FIXME: false positive if the pointee does not change in lambda const auto CaptureNoConst = lambdaExpr(hasCaptureInit(Exp)); - const auto Matches = - match(stmt(anyOf(forEachDescendant( - stmt(anyOf(AssignToNonConst, PassAsNonConstArg, - CastToNonConst, CaptureNoConst)) - .bind("stmt")), - forEachDescendant(InitToNonConst))), - Stm, Context); + const auto ReturnNoConst = + returnStmt(hasReturnValue(canResolveToExprPointee(Exp))); + + const auto Matches = match( + stmt(anyOf(forEachDescendant( + stmt(anyOf(AssignToNonConst, PassAsNonConstArg, + CastToNonConst, CaptureNoConst, ReturnNoConst)) + .bind("stmt")), + forEachDescendant(InitToNonConst))), + Stm, Context); return selectFirst<Stmt>("stmt", Matches); } diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index 9261294..aae2f3e 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -432,7 +432,11 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, // right-justified. It is used to align compound assignments like `+=` and `=`. // When RightJustify and ACS.PadOperators are true, operators in each block to // be aligned will be padded on the left to the same length before aligning. -template <typename F> +// +// The simple check will not look at the indentaion and nesting level to recurse +// into the line for alignment. It will also not count the commas. This is e.g. +// for aligning macro definitions. +template <typename F, bool SimpleCheck = false> static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, SmallVector<WhitespaceManager::Change, 16> &Changes, unsigned StartAt, @@ -465,9 +469,9 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, // Measure the scope level (i.e. depth of (), [], {}) of the first token, and // abort when we hit any token in a higher scope than the starting one. - auto IndentAndNestingLevel = StartAt < Changes.size() - ? Changes[StartAt].indentAndNestingLevel() - : std::tuple<unsigned, unsigned, unsigned>(); + const auto IndentAndNestingLevel = + StartAt < Changes.size() ? Changes[StartAt].indentAndNestingLevel() + : std::tuple<unsigned, unsigned, unsigned>(); // Keep track of the number of commas before the matching tokens, we will only // align a sequence of matching tokens if they are preceded by the same number @@ -536,14 +540,17 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, if (CurrentChange.Tok->isNot(tok::comment)) LineIsComment = false; - if (CurrentChange.Tok->is(tok::comma)) { - ++CommasBeforeMatch; - } else if (CurrentChange.indentAndNestingLevel() > IndentAndNestingLevel) { - // Call AlignTokens recursively, skipping over this scope block. - unsigned StoppedAt = - AlignTokens(Style, Matches, Changes, i, ACS, RightJustify); - i = StoppedAt - 1; - continue; + if (!SimpleCheck) { + if (CurrentChange.Tok->is(tok::comma)) { + ++CommasBeforeMatch; + } else if (CurrentChange.indentAndNestingLevel() > + IndentAndNestingLevel) { + // Call AlignTokens recursively, skipping over this scope block. + const auto StoppedAt = + AlignTokens(Style, Matches, Changes, i, ACS, RightJustify); + i = StoppedAt - 1; + continue; + } } if (!Matches(CurrentChange)) @@ -683,61 +690,8 @@ void WhitespaceManager::alignConsecutiveMacros() { return Current->Next->SpacesRequiredBefore == SpacesRequiredBefore; }; - unsigned MinColumn = 0; - - // Start and end of the token sequence we're processing. - unsigned StartOfSequence = 0; - unsigned EndOfSequence = 0; - - // Whether a matching token has been found on the current line. - bool FoundMatchOnLine = false; - - // Whether the current line consists only of comments - bool LineIsComment = true; - - unsigned I = 0; - for (unsigned E = Changes.size(); I != E; ++I) { - if (Changes[I].NewlinesBefore != 0) { - EndOfSequence = I; - - // Whether to break the alignment sequence because of an empty line. - bool EmptyLineBreak = (Changes[I].NewlinesBefore > 1) && - !Style.AlignConsecutiveMacros.AcrossEmptyLines; - - // Whether to break the alignment sequence because of a line without a - // match. - bool NoMatchBreak = - !FoundMatchOnLine && - !(LineIsComment && Style.AlignConsecutiveMacros.AcrossComments); - - if (EmptyLineBreak || NoMatchBreak) { - AlignMatchingTokenSequence(StartOfSequence, EndOfSequence, MinColumn, - AlignMacrosMatches, Changes); - } - - // A new line starts, re-initialize line status tracking bools. - FoundMatchOnLine = false; - LineIsComment = true; - } - - if (Changes[I].Tok->isNot(tok::comment)) - LineIsComment = false; - - if (!AlignMacrosMatches(Changes[I])) - continue; - - FoundMatchOnLine = true; - - if (StartOfSequence == 0) - StartOfSequence = I; - - unsigned ChangeMinColumn = Changes[I].StartOfTokenColumn; - MinColumn = std::max(MinColumn, ChangeMinColumn); - } - - EndOfSequence = I; - AlignMatchingTokenSequence(StartOfSequence, EndOfSequence, MinColumn, - AlignMacrosMatches, Changes); + AlignTokens<decltype(AlignMacrosMatches) &, /*SimpleCheck=*/true>( + Style, AlignMacrosMatches, Changes, 0, Style.AlignConsecutiveMacros); } void WhitespaceManager::alignConsecutiveAssignments() { diff --git a/clang/test/Frontend/rewrite-includes-bom.c b/clang/test/Frontend/rewrite-includes-bom.c index caa431a..27bf470 100644 --- a/clang/test/Frontend/rewrite-includes-bom.c +++ b/clang/test/Frontend/rewrite-includes-bom.c @@ -1,8 +1,8 @@ -// RUN: grep -q $'^\xEF\xBB\xBF' %S/Inputs/rewrite-includes-bom.h +// RUN: cat %S/Inputs/rewrite-includes-bom.h | od -t x1 | grep -q 'ef\s*bb\s*bf' // RUN: %clang_cc1 -E -frewrite-includes -I %S/Inputs %s -o %t.c -// RUN: ! grep -q $'\xEF\xBB\xBF' %t.c +// RUN: cat %t.c | od -t x1 | not grep -q 'ef\s*bb\s*bf' // RUN: %clang_cc1 -fsyntax-only -verify %t.c // expected-no-diagnostics -// REQUIRES: shell +// UNSUPPORTED: system-windows #include "rewrite-includes-bom.h" diff --git a/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c b/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c index 46aba91..6f574ac 100644 --- a/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c +++ b/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c @@ -1,5 +1,5 @@ // Test UTF8 BOM at start of file -// RUN: printf '\xef\xbb\xbf' > %t.c +// RUN: printf '\357\273\277' > %t.c // RUN: echo '#ifdef TEST\n' >> %t.c // RUN: echo '#include <string>' >> %t.c // RUN: echo '#endif' >> %t.c diff --git a/clang/test/Modules/crash-vfs-relative-incdir.m b/clang/test/Modules/crash-vfs-relative-incdir.m index c0407f7..46c3413c 100644 --- a/clang/test/Modules/crash-vfs-relative-incdir.m +++ b/clang/test/Modules/crash-vfs-relative-incdir.m @@ -53,4 +53,4 @@ // RUN: cd %t // RUN: chmod 755 crash-vfs-*.sh -// RUN: ./crash-vfs-*.sh +// RUN: bash ./crash-vfs-*.sh diff --git a/clang/test/Modules/crash-vfs-run-reproducer.m b/clang/test/Modules/crash-vfs-run-reproducer.m index fd861fe..fa06fd9 100644 --- a/clang/test/Modules/crash-vfs-run-reproducer.m +++ b/clang/test/Modules/crash-vfs-run-reproducer.m @@ -53,4 +53,4 @@ // RUN: cd %t // RUN: chmod 755 crash-vfs-*.sh -// RUN: ./crash-vfs-*.sh +// RUN: bash ./crash-vfs-*.sh diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp index 95f8ae2..ef22960 100644 --- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp +++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp @@ -2038,4 +2038,42 @@ TEST(ExprMutationAnalyzerTest, PointeeMutatedByConditionOperator) { EXPECT_TRUE(isPointeeMutated(Results, AST.get())); } +TEST(ExprMutationAnalyzerTest, PointeeMutatedByReturn) { + { + const std::string Code = R"( + int * f() { + int *const x = nullptr; + return x; + })"; + auto AST = buildASTFromCodeWithArgs(Code, {"-Wno-everything"}); + auto Results = + match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_TRUE(isPointeeMutated(Results, AST.get())); + } + { + const std::string Code = R"( + int * f() { + int *const x = nullptr; + return x; + })"; + // in C++23, AST will have NoOp cast. + auto AST = + buildASTFromCodeWithArgs(Code, {"-Wno-everything", "-std=c++23"}); + auto Results = + match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_TRUE(isPointeeMutated(Results, AST.get())); + } + { + const std::string Code = R"( + int const* f() { + int *const x = nullptr; + return x; + })"; + auto AST = buildASTFromCodeWithArgs(Code, {"-Wno-everything"}); + auto Results = + match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_FALSE(isPointeeMutated(Results, AST.get())); + } +} + } // namespace clang diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 0fb8139..a3ad978 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -18559,6 +18559,11 @@ TEST_F(FormatTest, AlignConsecutiveMacros) { "#define bbbb 4\n" "#define ccc (5)", Style); + + Style.ColumnLimit = 30; + verifyFormat("#define MY_FUNC(x) callMe(X)\n" + "#define MY_LONG_CONSTANT 17", + Style); } TEST_F(FormatTest, AlignConsecutiveAssignmentsAcrossEmptyLines) { diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 088edc0..9a81f26 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -77,6 +77,7 @@ if( LIBCLC_STANDALONE_BUILD OR CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DI # Setup the paths where libclc runtimes should be stored. set( LIBCLC_OUTPUT_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR} ) + set( LIBCLC_INSTALL_DIR ${CMAKE_INSTALL_DATADIR}/clc ) else() # In-tree configuration set( LIBCLC_STANDALONE_BUILD FALSE ) @@ -100,10 +101,12 @@ else() # Setup the paths where libclc runtimes should be stored. By default, in an # in-tree build we place the libraries in clang's resource driectory. include(GetClangResourceDir) - get_clang_resource_dir( LIBCLC_OUTPUT_DIR PREFIX ${LLVM_LIBRARY_OUTPUT_INTDIR}/.. ) + get_clang_resource_dir( LIBCLC_INSTALL_DIR ) + cmake_path( APPEND LIBCLC_INSTALL_DIR "lib" "libclc" ) # Note we do not adhere to LLVM_ENABLE_PER_TARGET_RUNTIME_DIR. - set( LIBCLC_OUTPUT_LIBRARY_DIR ${LIBCLC_OUTPUT_DIR}/lib/libclc ) + cmake_path( GET LLVM_LIBRARY_OUTPUT_INTDIR PARENT_PATH LIBCLC_OUTPUT_LIBRARY_DIR ) + cmake_path( APPEND LIBCLC_OUTPUT_LIBRARY_DIR ${LIBCLC_INSTALL_DIR} ) file( MAKE_DIRECTORY ${LIBCLC_OUTPUT_LIBRARY_DIR} ) endif() diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers.inc b/libclc/clc/include/clc/math/clc_sincos_helpers.inc index 4daff92..0a3b816 100644 --- a/libclc/clc/include/clc/math/clc_sincos_helpers.inc +++ b/libclc/clc/include/clc/math/clc_sincos_helpers.inc @@ -10,6 +10,11 @@ _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_sinf_piby4(__CLC_FLOATN x, __CLC_FLOATN y); _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x, __CLC_FLOATN y); + +_CLC_DECL _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_FLOATN x, + private __CLC_FLOATN *sinval, + private __CLC_FLOATN *cosval); + _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x, __CLC_INTN regn); diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc index 09c6e1c..15934ca 100644 --- a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc +++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc @@ -6,6 +6,15 @@ // //===----------------------------------------------------------------------===// +_CLC_DECL _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x, + __CLC_DOUBLEN xx, + private __CLC_DOUBLEN *sinval, + private __CLC_DOUBLEN *cosval); + +_CLC_DECL _CLC_OVERLOAD void __clc_tan_piby4(__CLC_DOUBLEN x, __CLC_DOUBLEN xx, + private __CLC_DOUBLEN *leadval, + private __CLC_DOUBLEN *tailval); + _CLC_DECL _CLC_OVERLOAD void __clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, private __CLC_DOUBLEN *rr, diff --git a/libclc/clc/include/clc/math/clc_sincos_piby4.h b/libclc/clc/include/clc/math/clc_sincos_piby4.h deleted file mode 100644 index 50608ae..0000000 --- a/libclc/clc/include/clc/math/clc_sincos_piby4.h +++ /dev/null @@ -1,14 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include <clc/math/clc_fma.h> -#include <clc/math/clc_mad.h> -#include <clc/math/math.h> - -#define __CLC_BODY <clc/math/clc_sincos_piby4.inc> -#include <clc/math/gentype.inc> diff --git a/libclc/clc/include/clc/math/clc_sincos_piby4.inc b/libclc/clc/include/clc/math/clc_sincos_piby4.inc deleted file mode 100644 index 91ec518..0000000 --- a/libclc/clc/include/clc/math/clc_sincos_piby4.inc +++ /dev/null @@ -1,174 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#if __CLC_FPSIZE == 32 - -// Evaluate single precisions in and cos of value in interval [-pi/4, pi/4] -_CLC_INLINE _CLC_OVERLOAD void -__clc_sincos_piby4(__CLC_GENTYPE x, private __CLC_GENTYPE *sinval, - private __CLC_GENTYPE *cosval) { - // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... - // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... - // = x * f(w) - // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... - // We use a minimax approximation of (f(w) - 1) / w - // because this produces an expansion in even powers of x. - - // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... - // = f(w) - // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... - // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) - // because this produces an expansion in even powers of x. - - const __CLC_GENTYPE sc1 = -0.166666666638608441788607926e0F; - const __CLC_GENTYPE sc2 = 0.833333187633086262120839299e-2F; - const __CLC_GENTYPE sc3 = -0.198400874359527693921333720e-3F; - const __CLC_GENTYPE sc4 = 0.272500015145584081596826911e-5F; - - const __CLC_GENTYPE cc1 = 0.41666666664325175238031e-1F; - const __CLC_GENTYPE cc2 = -0.13888887673175665567647e-2F; - const __CLC_GENTYPE cc3 = 0.24800600878112441958053e-4F; - const __CLC_GENTYPE cc4 = -0.27301013343179832472841e-6F; - - __CLC_GENTYPE x2 = x * x; - - *sinval = __clc_mad( - x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1), - x); - *cosval = __clc_mad( - x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1), - __clc_mad(x2, -0.5f, 1.0f)); -} - -#elif __CLC_FPSIZE == 64 - -_CLC_INLINE _CLC_OVERLOAD void -__clc_sincos_piby4(__CLC_GENTYPE x, __CLC_GENTYPE xx, - private __CLC_GENTYPE *sinval, - private __CLC_GENTYPE *cosval) { - // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... - // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... - // = x * f(w) - // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... - // We use a minimax approximation of (f(w) - 1) / w - // because this produces an expansion in even powers of x. - // If xx (the tail of x) is non-zero, we add a correction - // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) - // is an approximation to cos(x)*sin(xx) valid because - // xx is tiny relative to x. - - // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... - // = f(w) - // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... - // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) - // because this produces an expansion in even powers of x. - // If xx (the tail of x) is non-zero, we subtract a correction - // term g(x,xx) = x*xx to the result, where g(x,xx) - // is an approximation to sin(x)*sin(xx) valid because - // xx is tiny relative to x. - - const __CLC_GENTYPE sc1 = -0.166666666666666646259241729; - const __CLC_GENTYPE sc2 = 0.833333333333095043065222816e-2; - const __CLC_GENTYPE sc3 = -0.19841269836761125688538679e-3; - const __CLC_GENTYPE sc4 = 0.275573161037288022676895908448e-5; - const __CLC_GENTYPE sc5 = -0.25051132068021699772257377197e-7; - const __CLC_GENTYPE sc6 = 0.159181443044859136852668200e-9; - - const __CLC_GENTYPE cc1 = 0.41666666666666665390037e-1; - const __CLC_GENTYPE cc2 = -0.13888888888887398280412e-2; - const __CLC_GENTYPE cc3 = 0.248015872987670414957399e-4; - const __CLC_GENTYPE cc4 = -0.275573172723441909470836e-6; - const __CLC_GENTYPE cc5 = 0.208761463822329611076335e-8; - const __CLC_GENTYPE cc6 = -0.113826398067944859590880e-10; - - __CLC_GENTYPE x2 = x * x; - __CLC_GENTYPE x3 = x2 * x; - __CLC_GENTYPE r = (__CLC_GENTYPE)0.5 * x2; - __CLC_GENTYPE t = (__CLC_GENTYPE)1.0 - r; - - __CLC_GENTYPE sp = __clc_fma( - __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); - - __CLC_GENTYPE cp = - t + - __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5), - x2, cc4), - x2, cc3), - x2, cc2), - x2, cc1), - x2 * x2, __clc_fma(x, xx, (1.0 - t) - r)); - - *sinval = - x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx)); - *cosval = cp; -} - -_CLC_INLINE _CLC_OVERLOAD void __clc_tan_piby4(__CLC_GENTYPE x, - __CLC_GENTYPE xx, - private __CLC_GENTYPE *leadval, - private __CLC_GENTYPE *tailval) { - // 0x3fe921fb54442d18 - const __CLC_GENTYPE piby4_lead = 7.85398163397448278999e-01; - // 0x3c81a62633145c06 - const __CLC_GENTYPE piby4_tail = 3.06161699786838240164e-17; - - // In order to maintain relative precision transform using the identity: - // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4. - // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4. - - __CLC_LONGN ca = x > 0.68; - __CLC_LONGN cb = x < -0.68; - __CLC_GENTYPE transform = ca ? 1.0 : 0.0; - transform = cb ? -1.0 : transform; - - __CLC_GENTYPE tx = __clc_fma(-transform, x, piby4_lead) + - __clc_fma(-transform, xx, piby4_tail); - __CLC_LONGN c = ca | cb; - x = c ? tx : x; - xx = c ? 0.0 : xx; - - // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68]. - __CLC_GENTYPE t1 = x; - __CLC_GENTYPE r = __clc_fma(2.0, x * xx, x * x); - - __CLC_GENTYPE a = __clc_fma(r, - __clc_fma(r, 0.224044448537022097264602535574e-3, - -0.229345080057565662883358588111e-1), - 0.372379159759792203640806338901e0); - - __CLC_GENTYPE b = - __clc_fma(r, - __clc_fma(r, - __clc_fma(r, -0.232371494088563558304549252913e-3, - 0.260656620398645407524064091208e-1), - -0.515658515729031149329237816945e0), - 0.111713747927937668539901657944e1); - - __CLC_GENTYPE t2 = __clc_fma(MATH_DIVIDE(a, b), x * r, xx); - - __CLC_GENTYPE tp = t1 + t2; - - // Compute -1.0/(t1 + t2) accurately - __CLC_GENTYPE z1 = - __CLC_AS_GENTYPE(__CLC_AS_ULONGN(tp) & 0xffffffff00000000L); - __CLC_GENTYPE z2 = t2 - (z1 - t1); - __CLC_GENTYPE trec = -MATH_RECIP(tp); - __CLC_GENTYPE trec_top = - __CLC_AS_GENTYPE(__CLC_AS_ULONGN(trec) & 0xffffffff00000000L); - - __CLC_GENTYPE tpr = __clc_fma( - __clc_fma(trec_top, z2, __clc_fma(trec_top, z1, 1.0)), trec, trec_top); - - __CLC_GENTYPE tpt = transform * (1.0 - MATH_DIVIDE(2.0 * tp, 1.0 + tp)); - __CLC_GENTYPE tptr = transform * (MATH_DIVIDE(2.0 * tp, tp - 1.0) - 1.0); - - *leadval = c ? tpt : tp; - *tailval = c ? tptr : tpr; -} - -#endif diff --git a/libclc/clc/lib/generic/math/clc_cos.cl b/libclc/clc/lib/generic/math/clc_cos.cl index e7e4d6a..5529ec4 100644 --- a/libclc/clc/lib/generic/math/clc_cos.cl +++ b/libclc/clc/lib/generic/math/clc_cos.cl @@ -10,7 +10,6 @@ #include <clc/float/definitions.h> #include <clc/math/clc_fabs.h> #include <clc/math/clc_sincos_helpers.h> -#include <clc/math/clc_sincos_piby4.h> #include <clc/math/math.h> #include <clc/relational/clc_isinf.h> #include <clc/relational/clc_isnan.h> diff --git a/libclc/clc/lib/generic/math/clc_cospi.cl b/libclc/clc/lib/generic/math/clc_cospi.cl index 07e1b49..6a10171 100644 --- a/libclc/clc/lib/generic/math/clc_cospi.cl +++ b/libclc/clc/lib/generic/math/clc_cospi.cl @@ -11,7 +11,6 @@ #include <clc/internal/clc.h> #include <clc/math/clc_fabs.h> #include <clc/math/clc_sincos_helpers.h> -#include <clc/math/clc_sincos_piby4.h> #include <clc/math/math.h> #define __CLC_BODY <clc_cospi.inc> diff --git a/libclc/clc/lib/generic/math/clc_sin.cl b/libclc/clc/lib/generic/math/clc_sin.cl index 741383f..99338c9 100644 --- a/libclc/clc/lib/generic/math/clc_sin.cl +++ b/libclc/clc/lib/generic/math/clc_sin.cl @@ -11,7 +11,6 @@ #include <clc/internal/clc.h> #include <clc/math/clc_fabs.h> #include <clc/math/clc_sincos_helpers.h> -#include <clc/math/clc_sincos_piby4.h> #include <clc/math/clc_trunc.h> #include <clc/math/math.h> #include <clc/math/tables.h> diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc index 9a46170..2a71b56 100644 --- a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc +++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc @@ -74,6 +74,43 @@ _CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x, return ret; } +// Evaluate single precisions sin and cos of value in interval [-pi/4, pi/4] +_CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_FLOATN x, + private __CLC_FLOATN *sinval, + private __CLC_FLOATN *cosval) { + // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... + // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... + // = x * f(w) + // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... + // We use a minimax approximation of (f(w) - 1) / w + // because this produces an expansion in even powers of x. + + // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... + // = f(w) + // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... + // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) + // because this produces an expansion in even powers of x. + + const __CLC_FLOATN sc1 = -0.166666666638608441788607926e0F; + const __CLC_FLOATN sc2 = 0.833333187633086262120839299e-2F; + const __CLC_FLOATN sc3 = -0.198400874359527693921333720e-3F; + const __CLC_FLOATN sc4 = 0.272500015145584081596826911e-5F; + + const __CLC_FLOATN cc1 = 0.41666666664325175238031e-1F; + const __CLC_FLOATN cc2 = -0.13888887673175665567647e-2F; + const __CLC_FLOATN cc3 = 0.24800600878112441958053e-4F; + const __CLC_FLOATN cc4 = -0.27301013343179832472841e-6F; + + __CLC_FLOATN x2 = x * x; + + *sinval = __clc_mad( + x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1), + x); + *cosval = __clc_mad( + x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1), + __clc_mad(x2, -0.5f, 1.0f)); +} + _CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x, __CLC_INTN regn) { // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]. diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc index 8fae90c..e029c6d 100644 --- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc +++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc @@ -6,6 +6,129 @@ // //===----------------------------------------------------------------------===// +_CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x, + __CLC_DOUBLEN xx, + private __CLC_DOUBLEN *sinval, + private __CLC_DOUBLEN *cosval) { + // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... + // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... + // = x * f(w) + // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... + // We use a minimax approximation of (f(w) - 1) / w + // because this produces an expansion in even powers of x. + // If xx (the tail of x) is non-zero, we add a correction + // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) + // is an approximation to cos(x)*sin(xx) valid because + // xx is tiny relative to x. + + // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... + // = f(w) + // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... + // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) + // because this produces an expansion in even powers of x. + // If xx (the tail of x) is non-zero, we subtract a correction + // term g(x,xx) = x*xx to the result, where g(x,xx) + // is an approximation to sin(x)*sin(xx) valid because + // xx is tiny relative to x. + + const __CLC_DOUBLEN sc1 = -0.166666666666666646259241729; + const __CLC_DOUBLEN sc2 = 0.833333333333095043065222816e-2; + const __CLC_DOUBLEN sc3 = -0.19841269836761125688538679e-3; + const __CLC_DOUBLEN sc4 = 0.275573161037288022676895908448e-5; + const __CLC_DOUBLEN sc5 = -0.25051132068021699772257377197e-7; + const __CLC_DOUBLEN sc6 = 0.159181443044859136852668200e-9; + + const __CLC_DOUBLEN cc1 = 0.41666666666666665390037e-1; + const __CLC_DOUBLEN cc2 = -0.13888888888887398280412e-2; + const __CLC_DOUBLEN cc3 = 0.248015872987670414957399e-4; + const __CLC_DOUBLEN cc4 = -0.275573172723441909470836e-6; + const __CLC_DOUBLEN cc5 = 0.208761463822329611076335e-8; + const __CLC_DOUBLEN cc6 = -0.113826398067944859590880e-10; + + __CLC_DOUBLEN x2 = x * x; + __CLC_DOUBLEN x3 = x2 * x; + __CLC_DOUBLEN r = (__CLC_DOUBLEN)0.5 * x2; + __CLC_DOUBLEN t = (__CLC_DOUBLEN)1.0 - r; + + __CLC_DOUBLEN sp = __clc_fma( + __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); + + __CLC_DOUBLEN cp = + t + + __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5), + x2, cc4), + x2, cc3), + x2, cc2), + x2, cc1), + x2 * x2, __clc_fma(x, xx, (1.0 - t) - r)); + + *sinval = + x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx)); + *cosval = cp; +} + +_CLC_DEF _CLC_OVERLOAD void __clc_tan_piby4(__CLC_DOUBLEN x, __CLC_DOUBLEN xx, + private __CLC_DOUBLEN *leadval, + private __CLC_DOUBLEN *tailval) { + // 0x3fe921fb54442d18 + const __CLC_DOUBLEN piby4_lead = 7.85398163397448278999e-01; + // 0x3c81a62633145c06 + const __CLC_DOUBLEN piby4_tail = 3.06161699786838240164e-17; + + // In order to maintain relative precision transform using the identity: + // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4. + // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4. + + __CLC_LONGN ca = x > 0.68; + __CLC_LONGN cb = x < -0.68; + __CLC_DOUBLEN transform = ca ? 1.0 : 0.0; + transform = cb ? -1.0 : transform; + + __CLC_DOUBLEN tx = __clc_fma(-transform, x, piby4_lead) + + __clc_fma(-transform, xx, piby4_tail); + __CLC_LONGN c = ca | cb; + x = c ? tx : x; + xx = c ? 0.0 : xx; + + // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68]. + __CLC_DOUBLEN t1 = x; + __CLC_DOUBLEN r = __clc_fma(2.0, x * xx, x * x); + + __CLC_DOUBLEN a = __clc_fma(r, + __clc_fma(r, 0.224044448537022097264602535574e-3, + -0.229345080057565662883358588111e-1), + 0.372379159759792203640806338901e0); + + __CLC_DOUBLEN b = + __clc_fma(r, + __clc_fma(r, + __clc_fma(r, -0.232371494088563558304549252913e-3, + 0.260656620398645407524064091208e-1), + -0.515658515729031149329237816945e0), + 0.111713747927937668539901657944e1); + + __CLC_DOUBLEN t2 = __clc_fma(MATH_DIVIDE(a, b), x * r, xx); + + __CLC_DOUBLEN tp = t1 + t2; + + // Compute -1.0/(t1 + t2) accurately + __CLC_DOUBLEN z1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(tp) & 0xffffffff00000000L); + __CLC_DOUBLEN z2 = t2 - (z1 - t1); + __CLC_DOUBLEN trec = -MATH_RECIP(tp); + __CLC_DOUBLEN trec_top = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(trec) & 0xffffffff00000000L); + + __CLC_DOUBLEN tpr = __clc_fma( + __clc_fma(trec_top, z2, __clc_fma(trec_top, z1, 1.0)), trec, trec_top); + + __CLC_DOUBLEN tpt = transform * (1.0 - MATH_DIVIDE(2.0 * tp, 1.0 + tp)); + __CLC_DOUBLEN tptr = transform * (MATH_DIVIDE(2.0 * tp, tp - 1.0) - 1.0); + + *leadval = c ? tpt : tp; + *tailval = c ? tptr : tpr; +} + // Reduction for medium sized arguments _CLC_DEF _CLC_OVERLOAD void __clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, diff --git a/libclc/clc/lib/generic/math/clc_sinpi.cl b/libclc/clc/lib/generic/math/clc_sinpi.cl index 6cff247..bb5de09f0 100644 --- a/libclc/clc/lib/generic/math/clc_sinpi.cl +++ b/libclc/clc/lib/generic/math/clc_sinpi.cl @@ -11,7 +11,6 @@ #include <clc/internal/clc.h> #include <clc/math/clc_fabs.h> #include <clc/math/clc_sincos_helpers.h> -#include <clc/math/clc_sincos_piby4.h> #include <clc/math/math.h> #define __CLC_BODY <clc_sinpi.inc> diff --git a/libclc/clc/lib/generic/math/clc_tan.cl b/libclc/clc/lib/generic/math/clc_tan.cl index adf42c4..7e68216 100644 --- a/libclc/clc/lib/generic/math/clc_tan.cl +++ b/libclc/clc/lib/generic/math/clc_tan.cl @@ -11,7 +11,6 @@ #include <clc/internal/clc.h> #include <clc/math/clc_fabs.h> #include <clc/math/clc_sincos_helpers.h> -#include <clc/math/clc_sincos_piby4.h> #include <clc/math/math.h> #include <clc/math/tables.h> #include <clc/relational/clc_isinf.h> diff --git a/libclc/clc/lib/generic/math/clc_tanpi.cl b/libclc/clc/lib/generic/math/clc_tanpi.cl index f126589..099457c1 100644 --- a/libclc/clc/lib/generic/math/clc_tanpi.cl +++ b/libclc/clc/lib/generic/math/clc_tanpi.cl @@ -12,7 +12,6 @@ #include <clc/math/clc_fabs.h> #include <clc/math/clc_native_recip.h> #include <clc/math/clc_sincos_helpers.h> -#include <clc/math/clc_sincos_piby4.h> #include <clc/math/math.h> #define __CLC_BODY <clc_tanpi.inc> diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 614f9e3..d8c2219 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -261,7 +261,7 @@ function(libclc_install) install( FILES ${files} - DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" + DESTINATION ${LIBCLC_INSTALL_DIR} ) endfunction() diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll index d4cc154..52ca22b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -1,38 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,GENERIC -; RUN: llc < %s -O0 -fast-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,FAST -; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* \ -; RUN: -mtriple=arm64-eabi -aarch64-neon-syntax=apple \ -; RUN: | FileCheck %s --check-prefixes=GISEL,FALLBACK +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -O0 -fast-isel | FileCheck %s --check-prefixes=CHECK,CHECK-FI +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for test_vcvt_bf16_f64 -; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_f64_f32) -; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_f64_f32) define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp { ; CHECK-LABEL: test_vcvt_f64_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl v0.2d, v0.2s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_f64_f32: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl v0.2d, v0.2s -; GISEL-NEXT: ret %vcvt1.i = fpext <2 x float> %x to <2 x double> ret <2 x double> %vcvt1.i } -; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_high_f64_f32) -; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_high_f64_f32) define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %x) nounwind readnone ssp { ; CHECK-LABEL: test_vcvt_high_f64_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_f64_f32: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %cvt_in = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 2, i32 3> %vcvt1.i = fpext <2 x float> %cvt_in to <2 x double> ret <2 x double> %vcvt1.i @@ -43,11 +29,6 @@ define <2 x double> @test_vcvt_high_v1f64_f32_bitcast(<4 x float> %x) nounwind r ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v1f64_f32_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %bc1 = bitcast <4 x float> %x to <2 x double> %ext = shufflevector <2 x double> %bc1, <2 x double> undef, <1 x i32> <i32 1> %bc2 = bitcast <1 x double> %ext to <2 x float> @@ -60,11 +41,6 @@ define <2 x double> @test_vcvt_high_v1i64_f32_bitcast(<2 x i64> %x) nounwind rea ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v1i64_f32_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %ext = shufflevector <2 x i64> %x, <2 x i64> undef, <1 x i32> <i32 1> %bc2 = bitcast <1 x i64> %ext to <2 x float> %r = fpext <2 x float> %bc2 to <2 x double> @@ -76,11 +52,6 @@ define <2 x double> @test_vcvt_high_v2i32_f32_bitcast(<4 x i32> %x) nounwind rea ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v2i32_f32_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %ext = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %bc2 = bitcast <2 x i32> %ext to <2 x float> %r = fpext <2 x float> %bc2 to <2 x double> @@ -92,11 +63,6 @@ define <2 x double> @test_vcvt_high_v4i16_f32_bitcast(<8 x i16> %x) nounwind rea ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v4i16_f32_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %ext = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %bc2 = bitcast <4 x i16> %ext to <2 x float> %r = fpext <2 x float> %bc2 to <2 x double> @@ -108,11 +74,6 @@ define <2 x double> @test_vcvt_high_v8i8_f32_bitcast(<16 x i8> %x) nounwind read ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v8i8_f32_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %ext = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %bc2 = bitcast <8 x i8> %ext to <2 x float> %r = fpext <2 x float> %bc2 to <2 x double> @@ -124,11 +85,6 @@ define <4 x float> @test_vcvt_high_v1i64_f16_bitcast(<2 x i64> %x) nounwind read ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v1i64_f16_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.4s, v0.8h -; GISEL-NEXT: ret %ext = shufflevector <2 x i64> %x, <2 x i64> undef, <1 x i32> <i32 1> %bc2 = bitcast <1 x i64> %ext to <4 x half> %r = fpext <4 x half> %bc2 to <4 x float> @@ -140,11 +96,6 @@ define <4 x float> @test_vcvt_high_v2i32_f16_bitcast(<4 x i32> %x) nounwind read ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v2i32_f16_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.4s, v0.8h -; GISEL-NEXT: ret %ext = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %bc2 = bitcast <2 x i32> %ext to <4 x half> %r = fpext <4 x half> %bc2 to <4 x float> @@ -156,11 +107,6 @@ define <4 x float> @test_vcvt_high_v4i16_f16_bitcast(<8 x i16> %x) nounwind read ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v4i16_f16_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.4s, v0.8h -; GISEL-NEXT: ret %ext = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %bc2 = bitcast <4 x i16> %ext to <4 x half> %r = fpext <4 x half> %bc2 to <4 x float> @@ -172,134 +118,118 @@ define <4 x float> @test_vcvt_high_v8i8_f16_bitcast(<16 x i8> %x) nounwind readn ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v8i8_f16_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.4s, v0.8h -; GISEL-NEXT: ret %ext = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %bc2 = bitcast <8 x i8> %ext to <4 x half> %r = fpext <4 x half> %bc2 to <4 x float> ret <4 x float> %r } -; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_f32_f64) -; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_f32_f64) define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp { ; CHECK-LABEL: test_vcvt_f32_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_f32_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtn v0.2s, v0.2d -; GISEL-NEXT: ret %vcvt1.i = fptrunc <2 x double> %v to <2 x float> ret <2 x float> %vcvt1.i } -; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_bf16_f64) -; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_bf16_f64) define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp { -; GENERIC-LABEL: test_vcvt_bf16_f64: -; GENERIC: // %bb.0: -; GENERIC-NEXT: fcvtxn v0.2s, v0.2d -; GENERIC-NEXT: movi.4s v1, #1 -; GENERIC-NEXT: movi.4s v2, #127, msl #8 -; GENERIC-NEXT: ushr.4s v3, v0, #16 -; GENERIC-NEXT: add.4s v2, v0, v2 -; GENERIC-NEXT: and.16b v1, v3, v1 -; GENERIC-NEXT: fcmeq.4s v3, v0, v0 -; GENERIC-NEXT: orr.4s v0, #64, lsl #16 -; GENERIC-NEXT: add.4s v1, v1, v2 -; GENERIC-NEXT: bit.16b v0, v1, v3 -; GENERIC-NEXT: shrn.4h v0, v0, #16 -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: test_vcvt_bf16_f64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-SD-NEXT: movi.4s v1, #1 +; CHECK-SD-NEXT: movi.4s v2, #127, msl #8 +; CHECK-SD-NEXT: ushr.4s v3, v0, #16 +; CHECK-SD-NEXT: add.4s v2, v0, v2 +; CHECK-SD-NEXT: and.16b v1, v3, v1 +; CHECK-SD-NEXT: fcmeq.4s v3, v0, v0 +; CHECK-SD-NEXT: orr.4s v0, #64, lsl #16 +; CHECK-SD-NEXT: add.4s v1, v1, v2 +; CHECK-SD-NEXT: bit.16b v0, v1, v3 +; CHECK-SD-NEXT: shrn.4h v0, v0, #16 +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: test_vcvt_bf16_f64: -; FAST: // %bb.0: -; FAST-NEXT: fcvtxn v1.2s, v0.2d -; FAST-NEXT: // implicit-def: $q0 -; FAST-NEXT: fmov d0, d1 -; FAST-NEXT: ushr.4s v1, v0, #16 -; FAST-NEXT: movi.4s v2, #1 -; FAST-NEXT: and.16b v1, v1, v2 -; FAST-NEXT: add.4s v1, v1, v0 -; FAST-NEXT: movi.4s v2, #127, msl #8 -; FAST-NEXT: add.4s v1, v1, v2 -; FAST-NEXT: mov.16b v2, v0 -; FAST-NEXT: orr.4s v2, #64, lsl #16 -; FAST-NEXT: fcmeq.4s v0, v0, v0 -; FAST-NEXT: bsl.16b v0, v1, v2 -; FAST-NEXT: shrn.4h v0, v0, #16 -; FAST-NEXT: ret +; CHECK-FI-LABEL: test_vcvt_bf16_f64: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fcvtxn v1.2s, v0.2d +; CHECK-FI-NEXT: // implicit-def: $q0 +; CHECK-FI-NEXT: fmov d0, d1 +; CHECK-FI-NEXT: ushr.4s v1, v0, #16 +; CHECK-FI-NEXT: movi.4s v2, #1 +; CHECK-FI-NEXT: and.16b v1, v1, v2 +; CHECK-FI-NEXT: add.4s v1, v1, v0 +; CHECK-FI-NEXT: movi.4s v2, #127, msl #8 +; CHECK-FI-NEXT: add.4s v1, v1, v2 +; CHECK-FI-NEXT: mov.16b v2, v0 +; CHECK-FI-NEXT: orr.4s v2, #64, lsl #16 +; CHECK-FI-NEXT: fcmeq.4s v0, v0, v0 +; CHECK-FI-NEXT: bsl.16b v0, v1, v2 +; CHECK-FI-NEXT: shrn.4h v0, v0, #16 +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: test_vcvt_bf16_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtxn v0.2s, v0.2d -; GISEL-NEXT: movi.4s v1, #1 -; GISEL-NEXT: movi.4s v2, #127, msl #8 -; GISEL-NEXT: ushr.4s v3, v0, #16 -; GISEL-NEXT: add.4s v2, v0, v2 -; GISEL-NEXT: and.16b v1, v3, v1 -; GISEL-NEXT: fcmeq.4s v3, v0, v0 -; GISEL-NEXT: orr.4s v0, #64, lsl #16 -; GISEL-NEXT: add.4s v1, v1, v2 -; GISEL-NEXT: bit.16b v0, v1, v3 -; GISEL-NEXT: shrn.4h v0, v0, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvt_bf16_f64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-GI-NEXT: movi.4s v1, #1 +; CHECK-GI-NEXT: movi.4s v2, #127, msl #8 +; CHECK-GI-NEXT: ushr.4s v3, v0, #16 +; CHECK-GI-NEXT: add.4s v2, v0, v2 +; CHECK-GI-NEXT: and.16b v1, v3, v1 +; CHECK-GI-NEXT: fcmeq.4s v3, v0, v0 +; CHECK-GI-NEXT: orr.4s v0, #64, lsl #16 +; CHECK-GI-NEXT: add.4s v1, v1, v2 +; CHECK-GI-NEXT: bit.16b v0, v1, v3 +; CHECK-GI-NEXT: shrn.4h v0, v0, #16 +; CHECK-GI-NEXT: ret %vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat> ret <2 x bfloat> %vcvt1.i } define half @test_vcvt_f16_f32(<1 x float> %x) { -; GENERIC-LABEL: test_vcvt_f16_f32: -; GENERIC: // %bb.0: -; GENERIC-NEXT: // kill: def $d0 killed $d0 def $q0 -; GENERIC-NEXT: fcvt h0, s0 -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: test_vcvt_f16_f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: test_vcvt_f16_f32: -; FAST: // %bb.0: -; FAST-NEXT: fmov d1, d0 -; FAST-NEXT: // implicit-def: $q0 -; FAST-NEXT: fmov d0, d1 -; FAST-NEXT: // kill: def $s0 killed $s0 killed $q0 -; FAST-NEXT: fcvt h0, s0 -; FAST-NEXT: ret +; CHECK-FI-LABEL: test_vcvt_f16_f32: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fmov d1, d0 +; CHECK-FI-NEXT: // implicit-def: $q0 +; CHECK-FI-NEXT: fmov d0, d1 +; CHECK-FI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-FI-NEXT: fcvt h0, s0 +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: test_vcvt_f16_f32: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvt_f16_f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ret %tmp = fptrunc <1 x float> %x to <1 x half> %elt = extractelement <1 x half> %tmp, i32 0 ret half %elt } -; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_high_f32_f64) -; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_high_f32_f64) define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp { -; GENERIC-LABEL: test_vcvt_high_f32_f64: -; GENERIC: // %bb.0: -; GENERIC-NEXT: // kill: def $d0 killed $d0 def $q0 -; GENERIC-NEXT: fcvtn2 v0.4s, v1.2d -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: test_vcvt_high_f32_f64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: test_vcvt_high_f32_f64: -; FAST: // %bb.0: -; FAST-NEXT: fmov d2, d0 -; FAST-NEXT: // implicit-def: $q0 -; FAST-NEXT: fmov d0, d2 -; FAST-NEXT: fcvtn2 v0.4s, v1.2d -; FAST-NEXT: ret +; CHECK-FI-LABEL: test_vcvt_high_f32_f64: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fmov d2, d0 +; CHECK-FI-NEXT: // implicit-def: $q0 +; CHECK-FI-NEXT: fmov d0, d2 +; CHECK-FI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: test_vcvt_high_f32_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: fcvtn2 v0.4s, v1.2d -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvt_high_f32_f64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-GI-NEXT: ret %cvt = fptrunc <2 x double> %v to <2 x float> %vcvt2.i = shufflevector <2 x float> %x, <2 x float> %cvt, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ret <4 x float> %vcvt2.i @@ -310,99 +240,80 @@ define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp { ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtxn v0.2s, v0.2d ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvtx_f32_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtxn v0.2s, v0.2d -; GISEL-NEXT: ret %vcvtx1.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind ret <2 x float> %vcvtx1.i } define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp { -; GENERIC-LABEL: test_vcvtx_high_f32_f64: -; GENERIC: // %bb.0: -; GENERIC-NEXT: // kill: def $d0 killed $d0 def $q0 -; GENERIC-NEXT: fcvtxn2 v0.4s, v1.2d -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: test_vcvtx_high_f32_f64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: test_vcvtx_high_f32_f64: -; FAST: // %bb.0: -; FAST-NEXT: fmov d2, d0 -; FAST-NEXT: // implicit-def: $q0 -; FAST-NEXT: fmov d0, d2 -; FAST-NEXT: fcvtxn2 v0.4s, v1.2d -; FAST-NEXT: ret +; CHECK-FI-LABEL: test_vcvtx_high_f32_f64: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fmov d2, d0 +; CHECK-FI-NEXT: // implicit-def: $q0 +; CHECK-FI-NEXT: fmov d0, d2 +; CHECK-FI-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: test_vcvtx_high_f32_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: fcvtxn2 v0.4s, v1.2d -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvtx_high_f32_f64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-GI-NEXT: ret %vcvtx2.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind %res = shufflevector <2 x float> %x, <2 x float> %vcvtx2.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ret <4 x float> %res } - -declare <2 x double> @llvm.aarch64.neon.vcvthighfp2df(<4 x float>) nounwind readnone -declare <2 x double> @llvm.aarch64.neon.vcvtfp2df(<2 x float>) nounwind readnone - -declare <2 x float> @llvm.aarch64.neon.vcvtdf2fp(<2 x double>) nounwind readnone -declare <4 x float> @llvm.aarch64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone - -declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone - define i16 @to_half(float %in) { -; GENERIC-LABEL: to_half: -; GENERIC: // %bb.0: -; GENERIC-NEXT: fcvt h0, s0 -; GENERIC-NEXT: fmov w0, s0 -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: to_half: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: to_half: -; FAST: // %bb.0: -; FAST-NEXT: fcvt h1, s0 -; FAST-NEXT: // implicit-def: $w0 -; FAST-NEXT: fmov s0, w0 -; FAST-NEXT: fmov s0, s1 -; FAST-NEXT: fmov w0, s0 -; FAST-NEXT: // kill: def $w1 killed $w0 -; FAST-NEXT: ret +; CHECK-FI-LABEL: to_half: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fcvt h1, s0 +; CHECK-FI-NEXT: // implicit-def: $w0 +; CHECK-FI-NEXT: fmov s0, w0 +; CHECK-FI-NEXT: fmov s0, s1 +; CHECK-FI-NEXT: fmov w0, s0 +; CHECK-FI-NEXT: // kill: def $w1 killed $w0 +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: to_half: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: to_half: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %res = call i16 @llvm.convert.to.fp16.f32(float %in) ret i16 %res } define float @from_half(i16 %in) { -; GENERIC-LABEL: from_half: -; GENERIC: // %bb.0: -; GENERIC-NEXT: fmov s0, w0 -; GENERIC-NEXT: fcvt s0, h0 -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: from_half: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: from_half: -; FAST: // %bb.0: -; FAST-NEXT: fmov s0, w0 -; FAST-NEXT: // kill: def $h0 killed $h0 killed $s0 -; FAST-NEXT: fcvt s0, h0 -; FAST-NEXT: ret +; CHECK-FI-LABEL: from_half: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fmov s0, w0 +; CHECK-FI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-FI-NEXT: fcvt s0, h0 +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: from_half: -; GISEL: // %bb.0: -; GISEL-NEXT: fmov s0, w0 -; GISEL-NEXT: fcvt s0, h0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: from_half: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fcvt s0, h0 +; CHECK-GI-NEXT: ret %res = call float @llvm.convert.from.fp16.f32(i16 %in) ret float %res } - -declare float @llvm.convert.from.fp16.f32(i16) #1 -declare i16 @llvm.convert.to.fp16.f32(float) #1 -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; FALLBACK: {{.*}} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index 5ae0839..3dfa6df 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -1361,132 +1361,6 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 } -define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) { -; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain( -; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEON-NEXT: entry: -; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 -; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 -; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] -; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 -; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-NEON: vector.ph: -; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 -; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] -; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0 -; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-NEON: vector.body: -; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] -; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] -; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]] -; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEON-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] -; CHECK-NEON: middle.block: -; CHECK-NEON-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]]) -; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] -; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK-NEON: scalar.ph: -; -; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain( -; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-SVE-NEXT: entry: -; CHECK-SVE-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 -; CHECK-SVE-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 -; CHECK-SVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] -; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]] -; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-SVE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] -; CHECK-SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[OFFSET]], i64 0 -; CHECK-SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer -; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-SVE: vector.body: -; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-SVE-NEXT: [[TMP7:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]] -; CHECK-SVE-NEXT: [[TMP9]] = add <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-SVE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-SVE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] -; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]]) -; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] -; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK-SVE: scalar.ph: -; -; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain( -; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-SVE-MAXBW-NEXT: entry: -; CHECK-SVE-MAXBW-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 -; CHECK-SVE-MAXBW-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 -; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] -; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3 -; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-SVE-MAXBW: vector.ph: -; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 -; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]] -; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] -; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[OFFSET]], i64 0 -; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer -; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-SVE-MAXBW: vector.body: -; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] -; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]] -; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]] -; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] -; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP8]]) -; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK-SVE-MAXBW: scalar.ph: -; -entry: - br label %loop - -loop: - %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] - %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] - %l = load i8, ptr %ptr.iv, align 1 - %l.ext = zext i8 %l to i32 - %add = add i32 %red, %l.ext - %red.next = add i32 %add, %offset - %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 - %ec = icmp eq ptr %ptr.iv, %end - br i1 %ec, label %exit, label %loop - -exit: - ret i32 %red.next -} - attributes #0 = { vscale_range(1,16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll new file mode 100644 index 0000000..d80178fd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S %s | FileCheck %s --check-prefixes=CHECK-NEON + +target triple = "arm64-apple-macosx" + +define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) { +; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain( +; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEON-NEXT: [[ENTRY:.*]]: +; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEON: [[VECTOR_PH]]: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0 +; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEON-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NEON: [[VECTOR_BODY]]: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-NEON-NEXT: [[TMP5]] = add <16 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEON: [[MIDDLE_BLOCK]]: +; CHECK-NEON-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEON: [[SCALAR_PH]]: +; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEON-NEXT: br label %[[LOOP:.*]] +; CHECK-NEON: [[LOOP]]: +; CHECK-NEON-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEON-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEON-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 +; CHECK-NEON-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 +; CHECK-NEON-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[L_EXT]] +; CHECK-NEON-NEXT: [[RED_NEXT]] = add i32 [[ADD]], [[OFFSET]] +; CHECK-NEON-NEXT: [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; CHECK-NEON-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEON-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEON: [[EXIT]]: +; CHECK-NEON-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEON-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %add = add i32 %red, %l.ext + %red.next = add i32 %add, %offset + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} |