33 files changed, 500 insertions, 648 deletions
diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
index 3dd0a50..ca85168 100644
--- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
@@ -309,10 +309,9 @@ static void lengthExprHandle(const Expr *LengthExpr,
   // Try to obtain an 'IntegerLiteral' and adjust it.
   if (!IsMacroDefinition) {
     if (const auto *LengthIL = dyn_cast<IntegerLiteral>(LengthExpr)) {
-      size_t NewLength = LengthIL->getValue().getZExtValue() +
-                         (LengthHandle == LengthHandleKind::Increase
-                              ? (isInjectUL(Result) ? 1UL : 1)
-                              : -1);
+      uint64_t NewLength =
+          LengthIL->getValue().getZExtValue() +
+          (LengthHandle == LengthHandleKind::Increase ? 1 : -1);
 
       const auto NewLengthFix = FixItHint::CreateReplacement(
           LengthIL->getSourceRange(),
diff --git a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp
index 0003429..77262eb 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp
@@ -152,6 +152,8 @@ void UseIntegerSignComparisonCheck::check(
   if (const auto *RHSCast = llvm::dyn_cast<ExplicitCastExpr>(RHS)) {
     SubExprRHS = RHSCast->getSubExpr();
     R2.setEnd(SubExprRHS->getBeginLoc().getLocWithOffset(-1));
+    R3.setBegin(Lexer::getLocForEndOfToken(
+        SubExprRHS->getEndLoc(), 0, *Result.SourceManager, getLangOpts()));
   }
   DiagnosticBuilder Diag =
       diag(BinaryOp->getBeginLoc(),
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index a94dd97..43e4b61 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -276,6 +276,7 @@ Changes in existing checks
 
 - Improved :doc:`bugprone-not-null-terminated-result
   <clang-tidy/checks/bugprone/not-null-terminated-result>` check by fixing
+  bogus fix-its for ``strncmp`` and ``wcsncmp`` on Windows and
   a crash caused by certain value-dependent expressions.
 
 - Improved :doc:`bugprone-reserved-identifier
@@ -341,7 +342,8 @@ Changes in existing checks
 - Improved :doc:`misc-const-correctness
   <clang-tidy/checks/misc/const-correctness>` check to avoid false
   positives when pointers is tranferred to non-const references 
-  and avoid false positives of function pointer.
+  and avoid false positives of function pointers, and fix false
+  positives when returning a non-const pointer.
 
 - Improved :doc:`misc-header-include-cycle
   <clang-tidy/checks/misc/header-include-cycle>` check performance.
@@ -363,6 +365,11 @@ Changes in existing checks
   <clang-tidy/checks/modernize/use-designated-initializers>` check to
   suggest using designated initializers for aliased aggregate types.
 
+- Improved :doc:`modernize-use-integer-sign-comparison
+  <clang-tidy/checks/modernize/use-integer-sign-comparison>` by providing
+  correct fix-its when the right-hand side of a comparison contains a
+  non-C-style cast.
+
 - Improved :doc:`modernize-use-nullptr
   <clang-tidy/checks/modernize/use-nullptr>` check by fixing a crash
   on Windows when the check was enabled with a 32-bit :program:`clang-tidy`
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c
index dccf4ed..ca86986 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c
@@ -1,11 +1,6 @@
 // RUN: %check_clang_tidy %s bugprone-not-null-terminated-result %t -- \
 // RUN: -- -I %S/Inputs/not-null-terminated-result
 
-// FIXME: Something wrong with the APInt un/signed conversion on Windows:
-// in 'strncmp(str6, "string", 7);' it tries to inject '4294967302' as length.
-
-// UNSUPPORTED: system-windows
-
 #include "not-null-terminated-result-c.h"
 
 #define __STDC_LIB_EXT1__ 1
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp
index 8047db3..688e414 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp
@@ -1,11 +1,6 @@
 // RUN: %check_clang_tidy -std=c++11-or-later %s bugprone-not-null-terminated-result %t -- \
 // RUN: -- -I %S/Inputs/not-null-terminated-result
 
-// FIXME: Something wrong with the APInt un/signed conversion on Windows:
-// in 'wcsncmp(wcs6, L"string", 7);' it tries to inject '4294967302' as length.
-
-// UNSUPPORTED: system-windows
-
 #include "not-null-terminated-result-cxx.h"
 
 #define __STDC_LIB_EXT1__ 1
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp
index e20680c..4c847b5 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp
@@ -48,6 +48,11 @@ void ignore_const_alias() {
   p_local0 = &a[1];
 }
 
+void *return_non_const() {
+  void *const a = nullptr;
+  return a;
+}
+
 void function_pointer_basic() {
   void (*const fp)() = nullptr;
   fp();
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp
index 1f26ff3..31a3677 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp
@@ -92,8 +92,7 @@ int AllComparisons() {
     if (static_cast<unsigned int>(uArray[2]) < static_cast<int>(sArray[2]))
         return 0;
 // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (q20::cmp_less(uArray[2],sArray[2])))
-// FIXME: There should only be 2 closing braces. The fix-it inserts an unbalanced one.
+// CHECK-FIXES: if (q20::cmp_less(uArray[2],sArray[2]))
 
     if ((unsigned int)uArray[3] < (int)sArray[3])
         return 0;
@@ -116,6 +115,11 @@ int AllComparisons() {
 // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
 // CHECK-FIXES: if (q20::cmp_greater(uArray[6] , VALUE))
 
+    if (unsigned(uArray[7]) >= int(sArray[7]))
+        return 0;
+// CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
+// CHECK-FIXES: if (q20::cmp_greater_equal(uArray[7],sArray[7]))
+
 
     FuncParameters(uVar);
     TemplateFuncParameter(sVar);
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp
index 628cee0..e7981a6 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp
@@ -91,8 +91,7 @@ int AllComparisons() {
     if (static_cast<unsigned int>(uArray[2]) < static_cast<int>(sArray[2]))
         return 0;
 // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_less(uArray[2],sArray[2])))
-// FIXME: There should only be 2 closing braces. The fix-it inserts an unbalanced one.
+// CHECK-FIXES: if (std::cmp_less(uArray[2],sArray[2]))
 
     if ((unsigned int)uArray[3] < (int)sArray[3])
         return 0;
@@ -115,6 +114,11 @@ int AllComparisons() {
 // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
 // CHECK-FIXES: if (std::cmp_greater(uArray[6] , VALUE))
 
+    if (unsigned(uArray[7]) >= int(sArray[7]))
+        return 0;
+// CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
+// CHECK-FIXES: if (std::cmp_greater_equal(uArray[7],sArray[7]))
+
 
     FuncParameters(uVar);
     TemplateFuncParameter(sVar);
diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
index 1e376da..75b17c54 100644
--- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp
+++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
@@ -140,7 +140,8 @@ class ExprPointeeResolve {
       // explicit cast will be checked in `findPointeeToNonConst`
       const CastKind kind = ICE->getCastKind();
       if (kind == CK_LValueToRValue || kind == CK_DerivedToBase ||
-          kind == CK_UncheckedDerivedToBase)
+          kind == CK_UncheckedDerivedToBase ||
+          (kind == CK_NoOp && (ICE->getType() == ICE->getSubExpr()->getType())))
         return resolveExpr(ICE->getSubExpr());
       return false;
     }
@@ -788,13 +789,16 @@ ExprMutationAnalyzer::Analyzer::findPointeeToNonConst(const Expr *Exp) {
   // FIXME: false positive if the pointee does not change in lambda
   const auto CaptureNoConst = lambdaExpr(hasCaptureInit(Exp));
 
-  const auto Matches =
-      match(stmt(anyOf(forEachDescendant(
-                           stmt(anyOf(AssignToNonConst, PassAsNonConstArg,
-                                      CastToNonConst, CaptureNoConst))
-                               .bind("stmt")),
-                       forEachDescendant(InitToNonConst))),
-            Stm, Context);
+  const auto ReturnNoConst =
+      returnStmt(hasReturnValue(canResolveToExprPointee(Exp)));
+
+  const auto Matches = match(
+      stmt(anyOf(forEachDescendant(
+                     stmt(anyOf(AssignToNonConst, PassAsNonConstArg,
+                                CastToNonConst, CaptureNoConst, ReturnNoConst))
+                         .bind("stmt")),
+                 forEachDescendant(InitToNonConst))),
+      Stm, Context);
   return selectFirst<Stmt>("stmt", Matches);
 }
 
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index 9261294..aae2f3e 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -432,7 +432,11 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
 // right-justified. It is used to align compound assignments like `+=` and `=`.
 // When RightJustify and ACS.PadOperators are true, operators in each block to
 // be aligned will be padded on the left to the same length before aligning.
-template <typename F>
+//
+// The simple check will not look at the indentation and nesting level to
+// recurse into the line for alignment. It will also not count the commas. This
+// is e.g. for aligning macro definitions.
+template <typename F, bool SimpleCheck = false>
 static unsigned AlignTokens(const FormatStyle &Style, F &&Matches,
                             SmallVector<WhitespaceManager::Change, 16> &Changes,
                             unsigned StartAt,
@@ -465,9 +469,9 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches,
 
   // Measure the scope level (i.e. depth of (), [], {}) of the first token, and
   // abort when we hit any token in a higher scope than the starting one.
-  auto IndentAndNestingLevel = StartAt < Changes.size()
-                                   ? Changes[StartAt].indentAndNestingLevel()
-                                   : std::tuple<unsigned, unsigned, unsigned>();
+  const auto IndentAndNestingLevel =
+      StartAt < Changes.size() ? Changes[StartAt].indentAndNestingLevel()
+                               : std::tuple<unsigned, unsigned, unsigned>();
 
   // Keep track of the number of commas before the matching tokens, we will only
   // align a sequence of matching tokens if they are preceded by the same number
@@ -536,14 +540,17 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches,
     if (CurrentChange.Tok->isNot(tok::comment))
       LineIsComment = false;
 
-    if (CurrentChange.Tok->is(tok::comma)) {
-      ++CommasBeforeMatch;
-    } else if (CurrentChange.indentAndNestingLevel() > IndentAndNestingLevel) {
-      // Call AlignTokens recursively, skipping over this scope block.
-      unsigned StoppedAt =
-          AlignTokens(Style, Matches, Changes, i, ACS, RightJustify);
-      i = StoppedAt - 1;
-      continue;
+    if (!SimpleCheck) {
+      if (CurrentChange.Tok->is(tok::comma)) {
+        ++CommasBeforeMatch;
+      } else if (CurrentChange.indentAndNestingLevel() >
+                 IndentAndNestingLevel) {
+        // Call AlignTokens recursively, skipping over this scope block.
+        const auto StoppedAt =
+            AlignTokens(Style, Matches, Changes, i, ACS, RightJustify);
+        i = StoppedAt - 1;
+        continue;
+      }
     }
 
     if (!Matches(CurrentChange))
@@ -683,61 +690,8 @@ void WhitespaceManager::alignConsecutiveMacros() {
     return Current->Next->SpacesRequiredBefore == SpacesRequiredBefore;
   };
 
-  unsigned MinColumn = 0;
-
-  // Start and end of the token sequence we're processing.
-  unsigned StartOfSequence = 0;
-  unsigned EndOfSequence = 0;
-
-  // Whether a matching token has been found on the current line.
-  bool FoundMatchOnLine = false;
-
-  // Whether the current line consists only of comments
-  bool LineIsComment = true;
-
-  unsigned I = 0;
-  for (unsigned E = Changes.size(); I != E; ++I) {
-    if (Changes[I].NewlinesBefore != 0) {
-      EndOfSequence = I;
-
-      // Whether to break the alignment sequence because of an empty line.
-      bool EmptyLineBreak = (Changes[I].NewlinesBefore > 1) &&
-                            !Style.AlignConsecutiveMacros.AcrossEmptyLines;
-
-      // Whether to break the alignment sequence because of a line without a
-      // match.
-      bool NoMatchBreak =
-          !FoundMatchOnLine &&
-          !(LineIsComment && Style.AlignConsecutiveMacros.AcrossComments);
-
-      if (EmptyLineBreak || NoMatchBreak) {
-        AlignMatchingTokenSequence(StartOfSequence, EndOfSequence, MinColumn,
-                                   AlignMacrosMatches, Changes);
-      }
-
-      // A new line starts, re-initialize line status tracking bools.
-      FoundMatchOnLine = false;
-      LineIsComment = true;
-    }
-
-    if (Changes[I].Tok->isNot(tok::comment))
-      LineIsComment = false;
-
-    if (!AlignMacrosMatches(Changes[I]))
-      continue;
-
-    FoundMatchOnLine = true;
-
-    if (StartOfSequence == 0)
-      StartOfSequence = I;
-
-    unsigned ChangeMinColumn = Changes[I].StartOfTokenColumn;
-    MinColumn = std::max(MinColumn, ChangeMinColumn);
-  }
-
-  EndOfSequence = I;
-  AlignMatchingTokenSequence(StartOfSequence, EndOfSequence, MinColumn,
-                             AlignMacrosMatches, Changes);
+  AlignTokens<decltype(AlignMacrosMatches) &, /*SimpleCheck=*/true>(
+      Style, AlignMacrosMatches, Changes, 0, Style.AlignConsecutiveMacros);
 }
 
 void WhitespaceManager::alignConsecutiveAssignments() {
diff --git a/clang/test/Frontend/rewrite-includes-bom.c b/clang/test/Frontend/rewrite-includes-bom.c
index caa431a..27bf470 100644
--- a/clang/test/Frontend/rewrite-includes-bom.c
+++ b/clang/test/Frontend/rewrite-includes-bom.c
@@ -1,8 +1,8 @@
-// RUN: grep -q $'^\xEF\xBB\xBF' %S/Inputs/rewrite-includes-bom.h
+// RUN: cat %S/Inputs/rewrite-includes-bom.h | od -t x1 | grep -q 'ef\s*bb\s*bf'
 // RUN: %clang_cc1 -E -frewrite-includes -I %S/Inputs %s -o %t.c
-// RUN: ! grep -q $'\xEF\xBB\xBF' %t.c
+// RUN: cat %t.c | od -t x1 | not grep -q 'ef\s*bb\s*bf'
 // RUN: %clang_cc1 -fsyntax-only -verify %t.c
 // expected-no-diagnostics
-// REQUIRES: shell
+// UNSUPPORTED: system-windows
 
 #include "rewrite-includes-bom.h"
diff --git a/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c b/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c
index 46aba91..6f574ac 100644
--- a/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c
+++ b/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c
@@ -1,5 +1,5 @@
 // Test UTF8 BOM at start of file
-// RUN: printf '\xef\xbb\xbf' > %t.c
+// RUN: printf '\357\273\277' > %t.c
 // RUN: echo '#ifdef TEST\n' >> %t.c
 // RUN: echo '#include <string>' >> %t.c
 // RUN: echo '#endif' >> %t.c
diff --git a/clang/test/Modules/crash-vfs-relative-incdir.m b/clang/test/Modules/crash-vfs-relative-incdir.m
index c0407f7..46c3413c 100644
--- a/clang/test/Modules/crash-vfs-relative-incdir.m
+++ b/clang/test/Modules/crash-vfs-relative-incdir.m
@@ -53,4 +53,4 @@
 
 // RUN: cd %t
 // RUN: chmod 755 crash-vfs-*.sh
-// RUN: ./crash-vfs-*.sh
+// RUN: bash ./crash-vfs-*.sh
diff --git a/clang/test/Modules/crash-vfs-run-reproducer.m b/clang/test/Modules/crash-vfs-run-reproducer.m
index fd861fe..fa06fd9 100644
--- a/clang/test/Modules/crash-vfs-run-reproducer.m
+++ b/clang/test/Modules/crash-vfs-run-reproducer.m
@@ -53,4 +53,4 @@
 
 // RUN: cd %t
 // RUN: chmod 755 crash-vfs-*.sh
-// RUN: ./crash-vfs-*.sh
+// RUN: bash ./crash-vfs-*.sh
diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
index 95f8ae2..ef22960 100644
--- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
+++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
@@ -2038,4 +2038,42 @@ TEST(ExprMutationAnalyzerTest, PointeeMutatedByConditionOperator) {
   EXPECT_TRUE(isPointeeMutated(Results, AST.get()));
 }
 
+TEST(ExprMutationAnalyzerTest, PointeeMutatedByReturn) {
+  {
+    const std::string Code = R"(
+    int * f() {
+      int *const x = nullptr;
+      return x;
+    })";
+    auto AST = buildASTFromCodeWithArgs(Code, {"-Wno-everything"});
+    auto Results =
+        match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
+    EXPECT_TRUE(isPointeeMutated(Results, AST.get()));
+  }
+  {
+    const std::string Code = R"(
+    int * f() {
+      int *const x = nullptr;
+      return x;
+    })";
+    // In C++23, the AST will have a NoOp cast.
+    auto AST =
+        buildASTFromCodeWithArgs(Code, {"-Wno-everything", "-std=c++23"});
+    auto Results =
+        match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
+    EXPECT_TRUE(isPointeeMutated(Results, AST.get()));
+  }
+  {
+    const std::string Code = R"(
+    int const* f() {
+      int *const x = nullptr;
+      return x;
+    })";
+    auto AST = buildASTFromCodeWithArgs(Code, {"-Wno-everything"});
+    auto Results =
+        match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
+    EXPECT_FALSE(isPointeeMutated(Results, AST.get()));
+  }
+}
+
 } // namespace clang
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 0fb8139..a3ad978 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -18559,6 +18559,11 @@ TEST_F(FormatTest, AlignConsecutiveMacros) {
                "#define bbbb 4\n"
                "#define ccc       (5)",
                Style);
+
+  Style.ColumnLimit = 30;
+  verifyFormat("#define MY_FUNC(x) callMe(X)\n"
+               "#define MY_LONG_CONSTANT 17",
+               Style);
 }
 
 TEST_F(FormatTest, AlignConsecutiveAssignmentsAcrossEmptyLines) {
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 088edc0..9a81f26 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -77,6 +77,7 @@ if( LIBCLC_STANDALONE_BUILD OR CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DI
 
   # Setup the paths where libclc runtimes should be stored.
   set( LIBCLC_OUTPUT_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR} )
+  set( LIBCLC_INSTALL_DIR ${CMAKE_INSTALL_DATADIR}/clc )
 else()
   # In-tree configuration
   set( LIBCLC_STANDALONE_BUILD FALSE )
@@ -100,10 +101,12 @@ else()
   # Setup the paths where libclc runtimes should be stored. By default, in an
   # in-tree build we place the libraries in clang's resource driectory.
   include(GetClangResourceDir)
-  get_clang_resource_dir( LIBCLC_OUTPUT_DIR PREFIX ${LLVM_LIBRARY_OUTPUT_INTDIR}/.. )
+  get_clang_resource_dir( LIBCLC_INSTALL_DIR )
+  cmake_path( APPEND LIBCLC_INSTALL_DIR "lib" "libclc" )
 
   # Note we do not adhere to LLVM_ENABLE_PER_TARGET_RUNTIME_DIR.
-  set( LIBCLC_OUTPUT_LIBRARY_DIR ${LIBCLC_OUTPUT_DIR}/lib/libclc )
+  cmake_path( GET LLVM_LIBRARY_OUTPUT_INTDIR PARENT_PATH LIBCLC_OUTPUT_LIBRARY_DIR )
+  cmake_path( APPEND LIBCLC_OUTPUT_LIBRARY_DIR ${LIBCLC_INSTALL_DIR} )
   file( MAKE_DIRECTORY ${LIBCLC_OUTPUT_LIBRARY_DIR} )
 endif()
 
diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers.inc b/libclc/clc/include/clc/math/clc_sincos_helpers.inc
index 4daff92..0a3b816 100644
--- a/libclc/clc/include/clc/math/clc_sincos_helpers.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers.inc
@@ -10,6 +10,11 @@ _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_sinf_piby4(__CLC_FLOATN x,
                                                       __CLC_FLOATN y);
 _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x,
                                                       __CLC_FLOATN y);
+
+_CLC_DECL _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_FLOATN x,
+                                                private __CLC_FLOATN *sinval,
+                                                private __CLC_FLOATN *cosval);
+
 _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x,
                                                       __CLC_INTN regn);
 
diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc
index 09c6e1c..15934ca 100644
--- a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc
@@ -6,6 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+_CLC_DECL _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
+                                                __CLC_DOUBLEN xx,
+                                                private __CLC_DOUBLEN *sinval,
+                                                private __CLC_DOUBLEN *cosval);
+
+_CLC_DECL _CLC_OVERLOAD void __clc_tan_piby4(__CLC_DOUBLEN x, __CLC_DOUBLEN xx,
+                                             private __CLC_DOUBLEN *leadval,
+                                             private __CLC_DOUBLEN *tailval);
+
 _CLC_DECL _CLC_OVERLOAD void
 __clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
                              private __CLC_DOUBLEN *rr,
diff --git a/libclc/clc/include/clc/math/clc_sincos_piby4.h b/libclc/clc/include/clc/math/clc_sincos_piby4.h
deleted file mode 100644
index 50608ae..0000000
--- a/libclc/clc/include/clc/math/clc_sincos_piby4.h
+++ /dev/null
@@ -1,14 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/math.h>
-
-#define __CLC_BODY <clc/math/clc_sincos_piby4.inc>
-#include <clc/math/gentype.inc>
diff --git a/libclc/clc/include/clc/math/clc_sincos_piby4.inc b/libclc/clc/include/clc/math/clc_sincos_piby4.inc
deleted file mode 100644
index 91ec518..0000000
--- a/libclc/clc/include/clc/math/clc_sincos_piby4.inc
+++ /dev/null
@@ -1,174 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-// Evaluate single precisions in and cos of value in interval [-pi/4, pi/4]
-_CLC_INLINE _CLC_OVERLOAD void
-__clc_sincos_piby4(__CLC_GENTYPE x, private __CLC_GENTYPE *sinval,
-                   private __CLC_GENTYPE *cosval) {
-  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
-  // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
-  // = x * f(w)
-  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
-  // We use a minimax approximation of (f(w) - 1) / w
-  // because this produces an expansion in even powers of x.
-
-  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
-  // = f(w)
-  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
-  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
-  // because this produces an expansion in even powers of x.
-
-  const __CLC_GENTYPE sc1 = -0.166666666638608441788607926e0F;
-  const __CLC_GENTYPE sc2 = 0.833333187633086262120839299e-2F;
-  const __CLC_GENTYPE sc3 = -0.198400874359527693921333720e-3F;
-  const __CLC_GENTYPE sc4 = 0.272500015145584081596826911e-5F;
-
-  const __CLC_GENTYPE cc1 = 0.41666666664325175238031e-1F;
-  const __CLC_GENTYPE cc2 = -0.13888887673175665567647e-2F;
-  const __CLC_GENTYPE cc3 = 0.24800600878112441958053e-4F;
-  const __CLC_GENTYPE cc4 = -0.27301013343179832472841e-6F;
-
-  __CLC_GENTYPE x2 = x * x;
-
-  *sinval = __clc_mad(
-      x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1),
-      x);
-  *cosval = __clc_mad(
-      x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1),
-      __clc_mad(x2, -0.5f, 1.0f));
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_INLINE _CLC_OVERLOAD void
-__clc_sincos_piby4(__CLC_GENTYPE x, __CLC_GENTYPE xx,
-                   private __CLC_GENTYPE *sinval,
-                   private __CLC_GENTYPE *cosval) {
-  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
-  //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
-  //                      = x * f(w)
-  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
-  // We use a minimax approximation of (f(w) - 1) / w
-  // because this produces an expansion in even powers of x.
-  // If xx (the tail of x) is non-zero, we add a correction
-  // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
-  // is an approximation to cos(x)*sin(xx) valid because
-  // xx is tiny relative to x.
-
-  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
-  //                      = f(w)
-  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
-  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
-  // because this produces an expansion in even powers of x.
-  // If xx (the tail of x) is non-zero, we subtract a correction
-  // term g(x,xx) = x*xx to the result, where g(x,xx)
-  // is an approximation to sin(x)*sin(xx) valid because
-  // xx is tiny relative to x.
-
-  const __CLC_GENTYPE sc1 = -0.166666666666666646259241729;
-  const __CLC_GENTYPE sc2 = 0.833333333333095043065222816e-2;
-  const __CLC_GENTYPE sc3 = -0.19841269836761125688538679e-3;
-  const __CLC_GENTYPE sc4 = 0.275573161037288022676895908448e-5;
-  const __CLC_GENTYPE sc5 = -0.25051132068021699772257377197e-7;
-  const __CLC_GENTYPE sc6 = 0.159181443044859136852668200e-9;
-
-  const __CLC_GENTYPE cc1 = 0.41666666666666665390037e-1;
-  const __CLC_GENTYPE cc2 = -0.13888888888887398280412e-2;
-  const __CLC_GENTYPE cc3 = 0.248015872987670414957399e-4;
-  const __CLC_GENTYPE cc4 = -0.275573172723441909470836e-6;
-  const __CLC_GENTYPE cc5 = 0.208761463822329611076335e-8;
-  const __CLC_GENTYPE cc6 = -0.113826398067944859590880e-10;
-
-  __CLC_GENTYPE x2 = x * x;
-  __CLC_GENTYPE x3 = x2 * x;
-  __CLC_GENTYPE r = (__CLC_GENTYPE)0.5 * x2;
-  __CLC_GENTYPE t = (__CLC_GENTYPE)1.0 - r;
-
-  __CLC_GENTYPE sp = __clc_fma(
-      __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
-
-  __CLC_GENTYPE cp =
-      t +
-      __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5),
-                                                        x2, cc4),
-                                              x2, cc3),
-                                    x2, cc2),
-                          x2, cc1),
-                x2 * x2, __clc_fma(x, xx, (1.0 - t) - r));
-
-  *sinval =
-      x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx));
-  *cosval = cp;
-}
-
-_CLC_INLINE _CLC_OVERLOAD void __clc_tan_piby4(__CLC_GENTYPE x,
-                                               __CLC_GENTYPE xx,
-                                               private __CLC_GENTYPE *leadval,
-                                               private __CLC_GENTYPE *tailval) {
-  // 0x3fe921fb54442d18
-  const __CLC_GENTYPE piby4_lead = 7.85398163397448278999e-01;
-  // 0x3c81a62633145c06
-  const __CLC_GENTYPE piby4_tail = 3.06161699786838240164e-17;
-
-  // In order to maintain relative precision transform using the identity:
-  // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4.
-  // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4.
-
-  __CLC_LONGN ca = x > 0.68;
-  __CLC_LONGN cb = x < -0.68;
-  __CLC_GENTYPE transform = ca ? 1.0 : 0.0;
-  transform = cb ? -1.0 : transform;
-
-  __CLC_GENTYPE tx = __clc_fma(-transform, x, piby4_lead) +
-                     __clc_fma(-transform, xx, piby4_tail);
-  __CLC_LONGN c = ca | cb;
-  x = c ? tx : x;
-  xx = c ? 0.0 : xx;
-
-  // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
-  __CLC_GENTYPE t1 = x;
-  __CLC_GENTYPE r = __clc_fma(2.0, x * xx, x * x);
-
-  __CLC_GENTYPE a = __clc_fma(r,
-                              __clc_fma(r, 0.224044448537022097264602535574e-3,
-                                        -0.229345080057565662883358588111e-1),
-                              0.372379159759792203640806338901e0);
-
-  __CLC_GENTYPE b =
-      __clc_fma(r,
-                __clc_fma(r,
-                          __clc_fma(r, -0.232371494088563558304549252913e-3,
-                                    0.260656620398645407524064091208e-1),
-                          -0.515658515729031149329237816945e0),
-                0.111713747927937668539901657944e1);
-
-  __CLC_GENTYPE t2 = __clc_fma(MATH_DIVIDE(a, b), x * r, xx);
-
-  __CLC_GENTYPE tp = t1 + t2;
-
-  // Compute -1.0/(t1 + t2) accurately
-  __CLC_GENTYPE z1 =
-      __CLC_AS_GENTYPE(__CLC_AS_ULONGN(tp) & 0xffffffff00000000L);
-  __CLC_GENTYPE z2 = t2 - (z1 - t1);
-  __CLC_GENTYPE trec = -MATH_RECIP(tp);
-  __CLC_GENTYPE trec_top =
-      __CLC_AS_GENTYPE(__CLC_AS_ULONGN(trec) & 0xffffffff00000000L);
-
-  __CLC_GENTYPE tpr = __clc_fma(
-      __clc_fma(trec_top, z2, __clc_fma(trec_top, z1, 1.0)), trec, trec_top);
-
-  __CLC_GENTYPE tpt = transform * (1.0 - MATH_DIVIDE(2.0 * tp, 1.0 + tp));
-  __CLC_GENTYPE tptr = transform * (MATH_DIVIDE(2.0 * tp, tp - 1.0) - 1.0);
-
-  *leadval = c ? tpt : tp;
-  *tailval = c ? tptr : tpr;
-}
-
-#endif
diff --git a/libclc/clc/lib/generic/math/clc_cos.cl b/libclc/clc/lib/generic/math/clc_cos.cl
index e7e4d6a..5529ec4 100644
--- a/libclc/clc/lib/generic/math/clc_cos.cl
+++ b/libclc/clc/lib/generic/math/clc_cos.cl
@@ -10,7 +10,6 @@
 #include <clc/float/definitions.h>
 #include <clc/math/clc_fabs.h>
 #include <clc/math/clc_sincos_helpers.h>
-#include <clc/math/clc_sincos_piby4.h>
 #include <clc/math/math.h>
 #include <clc/relational/clc_isinf.h>
 #include <clc/relational/clc_isnan.h>
diff --git a/libclc/clc/lib/generic/math/clc_cospi.cl b/libclc/clc/lib/generic/math/clc_cospi.cl
index 07e1b49..6a10171 100644
--- a/libclc/clc/lib/generic/math/clc_cospi.cl
+++ b/libclc/clc/lib/generic/math/clc_cospi.cl
@@ -11,7 +11,6 @@
 #include <clc/internal/clc.h>
 #include <clc/math/clc_fabs.h>
 #include <clc/math/clc_sincos_helpers.h>
-#include <clc/math/clc_sincos_piby4.h>
 #include <clc/math/math.h>
 
 #define __CLC_BODY <clc_cospi.inc>
diff --git a/libclc/clc/lib/generic/math/clc_sin.cl b/libclc/clc/lib/generic/math/clc_sin.cl
index 741383f..99338c9 100644
--- a/libclc/clc/lib/generic/math/clc_sin.cl
+++ b/libclc/clc/lib/generic/math/clc_sin.cl
@@ -11,7 +11,6 @@
 #include <clc/internal/clc.h>
 #include <clc/math/clc_fabs.h>
 #include <clc/math/clc_sincos_helpers.h>
-#include <clc/math/clc_sincos_piby4.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
 #include <clc/math/tables.h>
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
index 9a46170..2a71b56 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
@@ -74,6 +74,43 @@ _CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x,
   return ret;
 }
 
+// Evaluate single-precision sin and cos of x in the interval [-pi/4, pi/4]
+_CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_FLOATN x,
+                                               private __CLC_FLOATN *sinval,
+                                               private __CLC_FLOATN *cosval) {
+  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+  // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...)
+  // = x * f(w)
+  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...)
+  // We use a minimax approximation of (f(w) - 1) / w
+  // because this produces an expansion in even powers of x.
+
+  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+  // = f(w)
+  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...)
+  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+  // because this produces an expansion in even powers of x.
+
+  const __CLC_FLOATN sc1 = -0.166666666638608441788607926e0F;
+  const __CLC_FLOATN sc2 = 0.833333187633086262120839299e-2F;
+  const __CLC_FLOATN sc3 = -0.198400874359527693921333720e-3F;
+  const __CLC_FLOATN sc4 = 0.272500015145584081596826911e-5F;
+
+  const __CLC_FLOATN cc1 = 0.41666666664325175238031e-1F;
+  const __CLC_FLOATN cc2 = -0.13888887673175665567647e-2F;
+  const __CLC_FLOATN cc3 = 0.24800600878112441958053e-4F;
+  const __CLC_FLOATN cc4 = -0.27301013343179832472841e-6F;
+
+  __CLC_FLOATN x2 = x * x; // w in the derivation above
+
+  *sinval = __clc_mad(
+      x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1),
+      x);
+  *cosval = __clc_mad(
+      x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1),
+      __clc_mad(x2, -0.5f, 1.0f));
+}
+
 _CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x,
                                                      __CLC_INTN regn) {
   // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
index 8fae90c..e029c6d 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
@@ -6,6 +6,129 @@
 //
 //===----------------------------------------------------------------------===//
 
+_CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
+                                               __CLC_DOUBLEN xx,
+                                               private __CLC_DOUBLEN *sinval,
+                                               private __CLC_DOUBLEN *cosval) {
+  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+  //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...)
+  //                      = x * f(w)
+  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...)
+  // We use a minimax approximation of (f(w) - 1) / w
+  // because this produces an expansion in even powers of x.
+  // If xx (the tail of x) is non-zero, we add a correction
+  // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+  // is an approximation to cos(x)*sin(xx) valid because
+  // xx is tiny relative to x.
+
+  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+  //                      = f(w)
+  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...)
+  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+  // because this produces an expansion in even powers of x.
+  // If xx (the tail of x) is non-zero, we subtract a correction term
+  // g(x,xx) = x*xx, an approximation to sin(x)*sin(xx) valid because xx is
+  // tiny relative to x. NOTE(review): cp below is built with fma(x, xx, ...),
+  // i.e. +x*xx; confirm the sign convention of xx against the callers.
+
+  const __CLC_DOUBLEN sc1 = -0.166666666666666646259241729;
+  const __CLC_DOUBLEN sc2 = 0.833333333333095043065222816e-2;
+  const __CLC_DOUBLEN sc3 = -0.19841269836761125688538679e-3;
+  const __CLC_DOUBLEN sc4 = 0.275573161037288022676895908448e-5;
+  const __CLC_DOUBLEN sc5 = -0.25051132068021699772257377197e-7;
+  const __CLC_DOUBLEN sc6 = 0.159181443044859136852668200e-9;
+
+  const __CLC_DOUBLEN cc1 = 0.41666666666666665390037e-1;
+  const __CLC_DOUBLEN cc2 = -0.13888888888887398280412e-2;
+  const __CLC_DOUBLEN cc3 = 0.248015872987670414957399e-4;
+  const __CLC_DOUBLEN cc4 = -0.275573172723441909470836e-6;
+  const __CLC_DOUBLEN cc5 = 0.208761463822329611076335e-8;
+  const __CLC_DOUBLEN cc6 = -0.113826398067944859590880e-10;
+
+  __CLC_DOUBLEN x2 = x * x; // w in the derivation above
+  __CLC_DOUBLEN x3 = x2 * x;
+  __CLC_DOUBLEN r = (__CLC_DOUBLEN)0.5 * x2;
+  __CLC_DOUBLEN t = (__CLC_DOUBLEN)1.0 - r;
+
+  __CLC_DOUBLEN sp = __clc_fma(
+      __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+  __CLC_DOUBLEN cp =
+      t +
+      __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5),
+                                                        x2, cc4),
+                                              x2, cc3),
+                                    x2, cc2),
+                          x2, cc1),
+                x2 * x2, __clc_fma(x, xx, (1.0 - t) - r));
+
+  *sinval =
+      x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx));
+  *cosval = cp;
+}
+
+_CLC_DEF _CLC_OVERLOAD void __clc_tan_piby4(__CLC_DOUBLEN x, __CLC_DOUBLEN xx,
+                                            private __CLC_DOUBLEN *leadval,
+                                            private __CLC_DOUBLEN *tailval) {
+  // Leading part of pi/4 (0x3fe921fb54442d18)
+  const __CLC_DOUBLEN piby4_lead = 7.85398163397448278999e-01;
+  // Tail (low-order remainder) of pi/4 (0x3c81a62633145c06)
+  const __CLC_DOUBLEN piby4_tail = 3.06161699786838240164e-17;
+
+  // In order to maintain relative precision transform using the identity:
+  // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4.
+  // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4.
+
+  __CLC_LONGN ca = x > 0.68;
+  __CLC_LONGN cb = x < -0.68;
+  __CLC_DOUBLEN transform = ca ? 1.0 : 0.0;
+  transform = cb ? -1.0 : transform; // +1 if x > 0.68, -1 if x < -0.68, else 0
+
+  __CLC_DOUBLEN tx = __clc_fma(-transform, x, piby4_lead) +
+                     __clc_fma(-transform, xx, piby4_tail);
+  __CLC_LONGN c = ca | cb;
+  x = c ? tx : x;
+  xx = c ? 0.0 : xx; // tx already includes the tail
+
+  // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
+  __CLC_DOUBLEN t1 = x;
+  __CLC_DOUBLEN r = __clc_fma(2.0, x * xx, x * x);
+
+  __CLC_DOUBLEN a = __clc_fma(r,
+                              __clc_fma(r, 0.224044448537022097264602535574e-3,
+                                        -0.229345080057565662883358588111e-1),
+                              0.372379159759792203640806338901e0);
+
+  __CLC_DOUBLEN b =
+      __clc_fma(r,
+                __clc_fma(r,
+                          __clc_fma(r, -0.232371494088563558304549252913e-3,
+                                    0.260656620398645407524064091208e-1),
+                          -0.515658515729031149329237816945e0),
+                0.111713747927937668539901657944e1);
+
+  __CLC_DOUBLEN t2 = __clc_fma(MATH_DIVIDE(a, b), x * r, xx);
+
+  __CLC_DOUBLEN tp = t1 + t2;
+
+  // Compute -1.0/(t1 + t2) accurately; z1/trec_top keep only the high 32 bits
+  __CLC_DOUBLEN z1 =
+      __CLC_AS_GENTYPE(__CLC_AS_ULONGN(tp) & 0xffffffff00000000L);
+  __CLC_DOUBLEN z2 = t2 - (z1 - t1);
+  __CLC_DOUBLEN trec = -MATH_RECIP(tp);
+  __CLC_DOUBLEN trec_top =
+      __CLC_AS_GENTYPE(__CLC_AS_ULONGN(trec) & 0xffffffff00000000L);
+
+  __CLC_DOUBLEN tpr = __clc_fma(
+      __clc_fma(trec_top, z2, __clc_fma(trec_top, z1, 1.0)), trec, trec_top);
+
+  __CLC_DOUBLEN tpt = transform * (1.0 - MATH_DIVIDE(2.0 * tp, 1.0 + tp));
+  __CLC_DOUBLEN tptr = transform * (MATH_DIVIDE(2.0 * tp, tp - 1.0) - 1.0);
+
+  *leadval = c ? tpt : tp; // tangent of the input
+  *tailval = c ? tptr : tpr; // -1 / tangent of the input
+}
+
 // Reduction for medium sized arguments
 _CLC_DEF _CLC_OVERLOAD void
 __clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r,
diff --git a/libclc/clc/lib/generic/math/clc_sinpi.cl b/libclc/clc/lib/generic/math/clc_sinpi.cl
index 6cff247..bb5de09f0 100644
--- a/libclc/clc/lib/generic/math/clc_sinpi.cl
+++ b/libclc/clc/lib/generic/math/clc_sinpi.cl
@@ -11,7 +11,6 @@
 #include <clc/internal/clc.h>
 #include <clc/math/clc_fabs.h>
 #include <clc/math/clc_sincos_helpers.h>
-#include <clc/math/clc_sincos_piby4.h>
 #include <clc/math/math.h>
 
 #define __CLC_BODY <clc_sinpi.inc>
diff --git a/libclc/clc/lib/generic/math/clc_tan.cl b/libclc/clc/lib/generic/math/clc_tan.cl
index adf42c4..7e68216 100644
--- a/libclc/clc/lib/generic/math/clc_tan.cl
+++ b/libclc/clc/lib/generic/math/clc_tan.cl
@@ -11,7 +11,6 @@
 #include <clc/internal/clc.h>
 #include <clc/math/clc_fabs.h>
 #include <clc/math/clc_sincos_helpers.h>
-#include <clc/math/clc_sincos_piby4.h>
 #include <clc/math/math.h>
 #include <clc/math/tables.h>
 #include <clc/relational/clc_isinf.h>
diff --git a/libclc/clc/lib/generic/math/clc_tanpi.cl b/libclc/clc/lib/generic/math/clc_tanpi.cl
index f126589..099457c1 100644
--- a/libclc/clc/lib/generic/math/clc_tanpi.cl
+++ b/libclc/clc/lib/generic/math/clc_tanpi.cl
@@ -12,7 +12,6 @@
 #include <clc/math/clc_fabs.h>
 #include <clc/math/clc_native_recip.h>
 #include <clc/math/clc_sincos_helpers.h>
-#include <clc/math/clc_sincos_piby4.h>
 #include <clc/math/math.h>
 
 #define __CLC_BODY <clc_tanpi.inc>
diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake
index 614f9e3..d8c2219 100644
--- a/libclc/cmake/modules/AddLibclc.cmake
+++ b/libclc/cmake/modules/AddLibclc.cmake
@@ -261,7 +261,7 @@ function(libclc_install)
 
   install(
     FILES ${files}
-    DESTINATION "${CMAKE_INSTALL_DATADIR}/clc"
+    DESTINATION ${LIBCLC_INSTALL_DIR}
   )
 endfunction()
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
index d4cc154..52ca22b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -1,38 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,GENERIC
-; RUN: llc < %s -O0 -fast-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,FAST
-; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* \
-; RUN:          -mtriple=arm64-eabi -aarch64-neon-syntax=apple \
-; RUN:          | FileCheck %s --check-prefixes=GISEL,FALLBACK
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -O0 -fast-isel | FileCheck %s --check-prefixes=CHECK,CHECK-FI
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple  -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:       warning: Instruction selection used fallback path for test_vcvt_bf16_f64
 
-; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_f64_f32)
-; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_f64_f32)
 define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp {
 ; CHECK-LABEL: test_vcvt_f64_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl v0.2d, v0.2s
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_f64_f32:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl v0.2d, v0.2s
-; GISEL-NEXT:    ret
   %vcvt1.i = fpext <2 x float> %x to <2 x double>
   ret <2 x double> %vcvt1.i
 }
 
-; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_high_f64_f32)
-; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_high_f64_f32)
 define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %x) nounwind readnone ssp {
 ; CHECK-LABEL: test_vcvt_high_f64_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_high_f64_f32:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT:    ret
   %cvt_in = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 2, i32 3>
   %vcvt1.i = fpext <2 x float> %cvt_in to <2 x double>
   ret <2 x double> %vcvt1.i
@@ -43,11 +29,6 @@ define <2 x double> @test_vcvt_high_v1f64_f32_bitcast(<4 x float> %x) nounwind r
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_high_v1f64_f32_bitcast:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT:    ret
   %bc1 = bitcast <4 x float> %x to <2 x double>
   %ext = shufflevector <2 x double> %bc1, <2 x double> undef, <1 x i32> <i32 1>
   %bc2 = bitcast <1 x double> %ext to <2 x float>
@@ -60,11 +41,6 @@ define <2 x double> @test_vcvt_high_v1i64_f32_bitcast(<2 x i64> %x) nounwind rea
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_high_v1i64_f32_bitcast:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT:    ret
   %ext = shufflevector <2 x i64> %x, <2 x i64> undef, <1 x i32> <i32 1>
   %bc2 = bitcast <1 x i64> %ext to <2 x float>
   %r = fpext <2 x float> %bc2 to <2 x double>
@@ -76,11 +52,6 @@ define <2 x double> @test_vcvt_high_v2i32_f32_bitcast(<4 x i32> %x) nounwind rea
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_high_v2i32_f32_bitcast:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT:    ret
   %ext = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %bc2 = bitcast <2 x i32> %ext to <2 x float>
   %r = fpext <2 x float> %bc2 to <2 x double>
@@ -92,11 +63,6 @@ define <2 x double> @test_vcvt_high_v4i16_f32_bitcast(<8 x i16> %x) nounwind rea
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_high_v4i16_f32_bitcast:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT:    ret
   %ext = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %bc2 = bitcast <4 x i16> %ext to <2 x float>
   %r = fpext <2 x float> %bc2 to <2 x double>
@@ -108,11 +74,6 @@ define <2 x double> @test_vcvt_high_v8i8_f32_bitcast(<16 x i8> %x) nounwind read
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_high_v8i8_f32_bitcast:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT:    ret
   %ext = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %bc2 = bitcast <8 x i8> %ext to <2 x float>
   %r = fpext <2 x float> %bc2 to <2 x double>
@@ -124,11 +85,6 @@ define <4 x float> @test_vcvt_high_v1i64_f16_bitcast(<2 x i64> %x) nounwind read
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl2 v0.4s, v0.8h
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_high_v1i64_f16_bitcast:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl2 v0.4s, v0.8h
-; GISEL-NEXT:    ret
   %ext = shufflevector <2 x i64> %x, <2 x i64> undef, <1 x i32> <i32 1>
   %bc2 = bitcast <1 x i64> %ext to <4 x half>
   %r = fpext <4 x half> %bc2 to <4 x float>
@@ -140,11 +96,6 @@ define <4 x float> @test_vcvt_high_v2i32_f16_bitcast(<4 x i32> %x) nounwind read
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl2 v0.4s, v0.8h
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_high_v2i32_f16_bitcast:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl2 v0.4s, v0.8h
-; GISEL-NEXT:    ret
   %ext = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %bc2 = bitcast <2 x i32> %ext to <4 x half>
   %r = fpext <4 x half> %bc2 to <4 x float>
@@ -156,11 +107,6 @@ define <4 x float> @test_vcvt_high_v4i16_f16_bitcast(<8 x i16> %x) nounwind read
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl2 v0.4s, v0.8h
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_high_v4i16_f16_bitcast:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl2 v0.4s, v0.8h
-; GISEL-NEXT:    ret
   %ext = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %bc2 = bitcast <4 x i16> %ext to <4 x half>
   %r = fpext <4 x half> %bc2 to <4 x float>
@@ -172,134 +118,118 @@ define <4 x float> @test_vcvt_high_v8i8_f16_bitcast(<16 x i8> %x) nounwind readn
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtl2 v0.4s, v0.8h
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_high_v8i8_f16_bitcast:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtl2 v0.4s, v0.8h
-; GISEL-NEXT:    ret
   %ext = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %bc2 = bitcast <8 x i8> %ext to <4 x half>
   %r = fpext <4 x half> %bc2 to <4 x float>
   ret <4 x float> %r
 }
 
-; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_f32_f64)
-; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_f32_f64)
 define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp {
 ; CHECK-LABEL: test_vcvt_f32_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvt_f32_f64:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtn v0.2s, v0.2d
-; GISEL-NEXT:    ret
   %vcvt1.i = fptrunc <2 x double> %v to <2 x float>
   ret <2 x float> %vcvt1.i
 }
 
-; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_bf16_f64)
-; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_bf16_f64)
 define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_bf16_f64:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    fcvtxn v0.2s, v0.2d
-; GENERIC-NEXT:    movi.4s v1, #1
-; GENERIC-NEXT:    movi.4s v2, #127, msl #8
-; GENERIC-NEXT:    ushr.4s v3, v0, #16
-; GENERIC-NEXT:    add.4s v2, v0, v2
-; GENERIC-NEXT:    and.16b v1, v3, v1
-; GENERIC-NEXT:    fcmeq.4s v3, v0, v0
-; GENERIC-NEXT:    orr.4s v0, #64, lsl #16
-; GENERIC-NEXT:    add.4s v1, v1, v2
-; GENERIC-NEXT:    bit.16b v0, v1, v3
-; GENERIC-NEXT:    shrn.4h v0, v0, #16
-; GENERIC-NEXT:    ret
+; CHECK-SD-LABEL: test_vcvt_bf16_f64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fcvtxn v0.2s, v0.2d
+; CHECK-SD-NEXT:    movi.4s v1, #1
+; CHECK-SD-NEXT:    movi.4s v2, #127, msl #8
+; CHECK-SD-NEXT:    ushr.4s v3, v0, #16
+; CHECK-SD-NEXT:    add.4s v2, v0, v2
+; CHECK-SD-NEXT:    and.16b v1, v3, v1
+; CHECK-SD-NEXT:    fcmeq.4s v3, v0, v0
+; CHECK-SD-NEXT:    orr.4s v0, #64, lsl #16
+; CHECK-SD-NEXT:    add.4s v1, v1, v2
+; CHECK-SD-NEXT:    bit.16b v0, v1, v3
+; CHECK-SD-NEXT:    shrn.4h v0, v0, #16
+; CHECK-SD-NEXT:    ret
 ;
-; FAST-LABEL: test_vcvt_bf16_f64:
-; FAST:       // %bb.0:
-; FAST-NEXT:    fcvtxn v1.2s, v0.2d
-; FAST-NEXT:    // implicit-def: $q0
-; FAST-NEXT:    fmov d0, d1
-; FAST-NEXT:    ushr.4s v1, v0, #16
-; FAST-NEXT:    movi.4s v2, #1
-; FAST-NEXT:    and.16b v1, v1, v2
-; FAST-NEXT:    add.4s v1, v1, v0
-; FAST-NEXT:    movi.4s v2, #127, msl #8
-; FAST-NEXT:    add.4s v1, v1, v2
-; FAST-NEXT:    mov.16b v2, v0
-; FAST-NEXT:    orr.4s v2, #64, lsl #16
-; FAST-NEXT:    fcmeq.4s v0, v0, v0
-; FAST-NEXT:    bsl.16b v0, v1, v2
-; FAST-NEXT:    shrn.4h v0, v0, #16
-; FAST-NEXT:    ret
+; CHECK-FI-LABEL: test_vcvt_bf16_f64:
+; CHECK-FI:       // %bb.0:
+; CHECK-FI-NEXT:    fcvtxn v1.2s, v0.2d
+; CHECK-FI-NEXT:    // implicit-def: $q0
+; CHECK-FI-NEXT:    fmov d0, d1
+; CHECK-FI-NEXT:    ushr.4s v1, v0, #16
+; CHECK-FI-NEXT:    movi.4s v2, #1
+; CHECK-FI-NEXT:    and.16b v1, v1, v2
+; CHECK-FI-NEXT:    add.4s v1, v1, v0
+; CHECK-FI-NEXT:    movi.4s v2, #127, msl #8
+; CHECK-FI-NEXT:    add.4s v1, v1, v2
+; CHECK-FI-NEXT:    mov.16b v2, v0
+; CHECK-FI-NEXT:    orr.4s v2, #64, lsl #16
+; CHECK-FI-NEXT:    fcmeq.4s v0, v0, v0
+; CHECK-FI-NEXT:    bsl.16b v0, v1, v2
+; CHECK-FI-NEXT:    shrn.4h v0, v0, #16
+; CHECK-FI-NEXT:    ret
 ;
-; GISEL-LABEL: test_vcvt_bf16_f64:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtxn v0.2s, v0.2d
-; GISEL-NEXT:    movi.4s v1, #1
-; GISEL-NEXT:    movi.4s v2, #127, msl #8
-; GISEL-NEXT:    ushr.4s v3, v0, #16
-; GISEL-NEXT:    add.4s v2, v0, v2
-; GISEL-NEXT:    and.16b v1, v3, v1
-; GISEL-NEXT:    fcmeq.4s v3, v0, v0
-; GISEL-NEXT:    orr.4s v0, #64, lsl #16
-; GISEL-NEXT:    add.4s v1, v1, v2
-; GISEL-NEXT:    bit.16b v0, v1, v3
-; GISEL-NEXT:    shrn.4h v0, v0, #16
-; GISEL-NEXT:    ret
+; CHECK-GI-LABEL: test_vcvt_bf16_f64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fcvtxn v0.2s, v0.2d
+; CHECK-GI-NEXT:    movi.4s v1, #1
+; CHECK-GI-NEXT:    movi.4s v2, #127, msl #8
+; CHECK-GI-NEXT:    ushr.4s v3, v0, #16
+; CHECK-GI-NEXT:    add.4s v2, v0, v2
+; CHECK-GI-NEXT:    and.16b v1, v3, v1
+; CHECK-GI-NEXT:    fcmeq.4s v3, v0, v0
+; CHECK-GI-NEXT:    orr.4s v0, #64, lsl #16
+; CHECK-GI-NEXT:    add.4s v1, v1, v2
+; CHECK-GI-NEXT:    bit.16b v0, v1, v3
+; CHECK-GI-NEXT:    shrn.4h v0, v0, #16
+; CHECK-GI-NEXT:    ret
   %vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat>
   ret <2 x bfloat> %vcvt1.i
 }
 
 define half @test_vcvt_f16_f32(<1 x float> %x) {
-; GENERIC-LABEL: test_vcvt_f16_f32:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GENERIC-NEXT:    fcvt h0, s0
-; GENERIC-NEXT:    ret
+; CHECK-SD-LABEL: test_vcvt_f16_f32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fcvt h0, s0
+; CHECK-SD-NEXT:    ret
 ;
-; FAST-LABEL: test_vcvt_f16_f32:
-; FAST:       // %bb.0:
-; FAST-NEXT:    fmov d1, d0
-; FAST-NEXT:    // implicit-def: $q0
-; FAST-NEXT:    fmov d0, d1
-; FAST-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; FAST-NEXT:    fcvt h0, s0
-; FAST-NEXT:    ret
+; CHECK-FI-LABEL: test_vcvt_f16_f32:
+; CHECK-FI:       // %bb.0:
+; CHECK-FI-NEXT:    fmov d1, d0
+; CHECK-FI-NEXT:    // implicit-def: $q0
+; CHECK-FI-NEXT:    fmov d0, d1
+; CHECK-FI-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FI-NEXT:    fcvt h0, s0
+; CHECK-FI-NEXT:    ret
 ;
-; GISEL-LABEL: test_vcvt_f16_f32:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvt h0, s0
-; GISEL-NEXT:    ret
+; CHECK-GI-LABEL: test_vcvt_f16_f32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ret
   %tmp = fptrunc <1 x float> %x to <1 x half>
   %elt = extractelement <1 x half> %tmp, i32 0
   ret half %elt
 }
 
-; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_high_f32_f64)
-; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_high_f32_f64)
 define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_f32_f64:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GENERIC-NEXT:    fcvtn2 v0.4s, v1.2d
-; GENERIC-NEXT:    ret
+; CHECK-SD-LABEL: test_vcvt_high_f32_f64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-SD-NEXT:    ret
 ;
-; FAST-LABEL: test_vcvt_high_f32_f64:
-; FAST:       // %bb.0:
-; FAST-NEXT:    fmov d2, d0
-; FAST-NEXT:    // implicit-def: $q0
-; FAST-NEXT:    fmov d0, d2
-; FAST-NEXT:    fcvtn2 v0.4s, v1.2d
-; FAST-NEXT:    ret
+; CHECK-FI-LABEL: test_vcvt_high_f32_f64:
+; CHECK-FI:       // %bb.0:
+; CHECK-FI-NEXT:    fmov d2, d0
+; CHECK-FI-NEXT:    // implicit-def: $q0
+; CHECK-FI-NEXT:    fmov d0, d2
+; CHECK-FI-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-FI-NEXT:    ret
 ;
-; GISEL-LABEL: test_vcvt_high_f32_f64:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT:    fcvtn2 v0.4s, v1.2d
-; GISEL-NEXT:    ret
+; CHECK-GI-LABEL: test_vcvt_high_f32_f64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-GI-NEXT:    ret
   %cvt = fptrunc <2 x double> %v to <2 x float>
   %vcvt2.i = shufflevector <2 x float> %x, <2 x float> %cvt, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %vcvt2.i
@@ -310,99 +240,80 @@ define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtxn v0.2s, v0.2d
 ; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: test_vcvtx_f32_f64:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvtxn v0.2s, v0.2d
-; GISEL-NEXT:    ret
   %vcvtx1.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
   ret <2 x float> %vcvtx1.i
 }
 
 define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvtx_high_f32_f64:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GENERIC-NEXT:    fcvtxn2 v0.4s, v1.2d
-; GENERIC-NEXT:    ret
+; CHECK-SD-LABEL: test_vcvtx_high_f32_f64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fcvtxn2 v0.4s, v1.2d
+; CHECK-SD-NEXT:    ret
 ;
-; FAST-LABEL: test_vcvtx_high_f32_f64:
-; FAST:       // %bb.0:
-; FAST-NEXT:    fmov d2, d0
-; FAST-NEXT:    // implicit-def: $q0
-; FAST-NEXT:    fmov d0, d2
-; FAST-NEXT:    fcvtxn2 v0.4s, v1.2d
-; FAST-NEXT:    ret
+; CHECK-FI-LABEL: test_vcvtx_high_f32_f64:
+; CHECK-FI:       // %bb.0:
+; CHECK-FI-NEXT:    fmov d2, d0
+; CHECK-FI-NEXT:    // implicit-def: $q0
+; CHECK-FI-NEXT:    fmov d0, d2
+; CHECK-FI-NEXT:    fcvtxn2 v0.4s, v1.2d
+; CHECK-FI-NEXT:    ret
 ;
-; GISEL-LABEL: test_vcvtx_high_f32_f64:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT:    fcvtxn2 v0.4s, v1.2d
-; GISEL-NEXT:    ret
+; CHECK-GI-LABEL: test_vcvtx_high_f32_f64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fcvtxn2 v0.4s, v1.2d
+; CHECK-GI-NEXT:    ret
   %vcvtx2.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
   %res = shufflevector <2 x float> %x, <2 x float> %vcvtx2.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %res
 }
 
-
-declare <2 x double> @llvm.aarch64.neon.vcvthighfp2df(<4 x float>) nounwind readnone
-declare <2 x double> @llvm.aarch64.neon.vcvtfp2df(<2 x float>) nounwind readnone
-
-declare <2 x float> @llvm.aarch64.neon.vcvtdf2fp(<2 x double>) nounwind readnone
-declare <4 x float> @llvm.aarch64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone
-
-declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
-
 define i16 @to_half(float %in) {
-; GENERIC-LABEL: to_half:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    fcvt h0, s0
-; GENERIC-NEXT:    fmov w0, s0
-; GENERIC-NEXT:    ret
+; CHECK-SD-LABEL: to_half:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fcvt h0, s0
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
 ;
-; FAST-LABEL: to_half:
-; FAST:       // %bb.0:
-; FAST-NEXT:    fcvt h1, s0
-; FAST-NEXT:    // implicit-def: $w0
-; FAST-NEXT:    fmov s0, w0
-; FAST-NEXT:    fmov s0, s1
-; FAST-NEXT:    fmov w0, s0
-; FAST-NEXT:    // kill: def $w1 killed $w0
-; FAST-NEXT:    ret
+; CHECK-FI-LABEL: to_half:
+; CHECK-FI:       // %bb.0:
+; CHECK-FI-NEXT:    fcvt h1, s0
+; CHECK-FI-NEXT:    // implicit-def: $w0
+; CHECK-FI-NEXT:    fmov s0, w0
+; CHECK-FI-NEXT:    fmov s0, s1
+; CHECK-FI-NEXT:    fmov w0, s0
+; CHECK-FI-NEXT:    // kill: def $w1 killed $w0
+; CHECK-FI-NEXT:    ret
 ;
-; GISEL-LABEL: to_half:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fcvt h0, s0
-; GISEL-NEXT:    fmov w0, s0
-; GISEL-NEXT:    ret
+; CHECK-GI-LABEL: to_half:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %res = call i16 @llvm.convert.to.fp16.f32(float %in)
   ret i16 %res
 }
 
 define float @from_half(i16 %in) {
-; GENERIC-LABEL: from_half:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    fmov s0, w0
-; GENERIC-NEXT:    fcvt s0, h0
-; GENERIC-NEXT:    ret
+; CHECK-SD-LABEL: from_half:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    fcvt s0, h0
+; CHECK-SD-NEXT:    ret
 ;
-; FAST-LABEL: from_half:
-; FAST:       // %bb.0:
-; FAST-NEXT:    fmov s0, w0
-; FAST-NEXT:    // kill: def $h0 killed $h0 killed $s0
-; FAST-NEXT:    fcvt s0, h0
-; FAST-NEXT:    ret
+; CHECK-FI-LABEL: from_half:
+; CHECK-FI:       // %bb.0:
+; CHECK-FI-NEXT:    fmov s0, w0
+; CHECK-FI-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-FI-NEXT:    fcvt s0, h0
+; CHECK-FI-NEXT:    ret
 ;
-; GISEL-LABEL: from_half:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fmov s0, w0
-; GISEL-NEXT:    fcvt s0, h0
-; GISEL-NEXT:    ret
+; CHECK-GI-LABEL: from_half:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fcvt s0, h0
+; CHECK-GI-NEXT:    ret
   %res = call float @llvm.convert.from.fp16.f32(i16 %in)
   ret float %res
 }
-
-declare float @llvm.convert.from.fp16.f32(i16) #1
-declare i16 @llvm.convert.to.fp16.f32(float) #1
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; FALLBACK: {{.*}}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 5ae0839..3dfa6df 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1361,132 +1361,6 @@ for.body:                                         ; preds = %for.body.preheader,
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
 }
 
-define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
-; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEON-NEXT:  entry:
-; CHECK-NEON-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-NEON-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-NEON-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-NEON-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-NEON-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEON-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-NEON:       vector.ph:
-; CHECK-NEON-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
-; CHECK-NEON-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEON-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-NEON-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-NEON-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-NEON:       vector.body:
-; CHECK-NEON-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEON-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEON-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-NEON-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-NEON-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT:    [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-NEON-NEXT:    [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
-; CHECK-NEON-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEON-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEON-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-NEON:       middle.block:
-; CHECK-NEON-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
-; CHECK-NEON-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-NEON-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-NEON:       scalar.ph:
-;
-; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-SVE-NEXT:  entry:
-; CHECK-SVE-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-SVE-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-SVE-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-SVE:       vector.ph:
-; CHECK-SVE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
-; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-SVE-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-SVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-SVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-SVE:       vector.body:
-; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-SVE-NEXT:    [[TMP7:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]]
-; CHECK-SVE-NEXT:    [[TMP9]] = add <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-SVE-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-SVE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-SVE:       middle.block:
-; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
-; CHECK-SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE:       scalar.ph:
-;
-; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-SVE-MAXBW-NEXT:  entry:
-; CHECK-SVE-MAXBW-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-SVE-MAXBW-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-SVE-MAXBW-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-SVE-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
-; CHECK-SVE-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-SVE-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-SVE-MAXBW:       vector.ph:
-; CHECK-SVE-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
-; CHECK-SVE-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
-; CHECK-SVE-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-SVE-MAXBW-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-SVE-MAXBW-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-SVE-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-SVE-MAXBW:       vector.body:
-; CHECK-SVE-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-SVE-MAXBW-NEXT:    [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT:    [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
-; CHECK-SVE-MAXBW-NEXT:    [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-SVE-MAXBW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-SVE-MAXBW:       middle.block:
-; CHECK-SVE-MAXBW-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP8]])
-; CHECK-SVE-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE-MAXBW:       scalar.ph:
-;
-entry:
-  br label %loop
-
-loop:
-  %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
-  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
-  %l = load i8, ptr %ptr.iv, align 1
-  %l.ext = zext i8 %l to i32
-  %add = add i32 %red, %l.ext
-  %red.next = add i32 %add, %offset
-  %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
-  %ec = icmp eq ptr %ptr.iv, %end
-  br i1 %ec, label %exit, label %loop
-
-exit:
-  ret i32 %red.next
-}
-
 attributes #0 = { vscale_range(1,16) }
 
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
new file mode 100644
index 0000000..d80178fd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S %s | FileCheck %s --check-prefixes=CHECK-NEON
+
+target triple = "arm64-apple-macosx"
+
+define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
+; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
+; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEON-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEON-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEON-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEON-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], 1
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; CHECK-NEON-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
+; CHECK-NEON-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEON:       [[VECTOR_PH]]:
+; CHECK-NEON-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
+; CHECK-NEON-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
+; CHECK-NEON-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
+; CHECK-NEON-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NEON:       [[VECTOR_BODY]]:
+; CHECK-NEON-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
+; CHECK-NEON-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-NEON-NEXT:    [[TMP5]] = add <16 x i32> [[TMP4]], [[BROADCAST_SPLAT]]
+; CHECK-NEON-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEON:       [[MIDDLE_BLOCK]]:
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
+; CHECK-NEON-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEON-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEON:       [[SCALAR_PH]]:
+; CHECK-NEON-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; CHECK-NEON-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEON-NEXT:    br label %[[LOOP:.*]]
+; CHECK-NEON:       [[LOOP]]:
+; CHECK-NEON-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEON-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEON-NEXT:    [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1
+; CHECK-NEON-NEXT:    [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; CHECK-NEON-NEXT:    [[ADD:%.*]] = add i32 [[RED]], [[L_EXT]]
+; CHECK-NEON-NEXT:    [[RED_NEXT]] = add i32 [[ADD]], [[OFFSET]]
+; CHECK-NEON-NEXT:    [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1
+; CHECK-NEON-NEXT:    [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]]
+; CHECK-NEON-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEON:       [[EXIT]]:
+; CHECK-NEON-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEON-NEXT:    ret i32 [[RED_NEXT_LCSSA]]
+;
+entry:                                            ; scalar input: sums zero-extended bytes plus %offset per iteration; the expected vector body above uses ordinary <16 x i32> adds
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] ; pointer induction variable, begins at %start
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] ; reduction accumulator, begins at 0
+  %l = load i8, ptr %ptr.iv, align 1              ; one byte per iteration
+  %l.ext = zext i8 %l to i32                      ; widen the byte to the accumulator type
+  %add = add i32 %red, %l.ext                     ; first add of the chain: accumulator + extended byte
+  %red.next = add i32 %add, %offset               ; second add of the chain: operand is the plain i32 argument, not an extended value (presumably what makes the chain "incomplete"; confirm against the vectorizer)
+  %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 ; advance by one byte
+  %ec = icmp eq ptr %ptr.iv, %end                 ; compares the pre-increment pointer, so the byte at %end is included
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %red.next                               ; final value of the reduction
+}