author     Luke Lau <luke@igalia.com>  2024-07-05 16:47:46 +0800
committer  Wang Pengcheng <wangpengcheng.pp@bytedance.com>  2024-07-05 16:47:46 +0800
commit     ca94bf9f2bd15fe34936f1759bffd26d7e93b182 (patch)
tree       c5441e7125b948e51d862f973afac0fc3d5a1652
parent     52d9cac11192a50c8a3ac1526f295c248d7534ab (diff)
parent     db782b44b3471c0ab41950c3f79d0ea7b916c135 (diff)
[𝘀𝗽𝗿] changes introduced through rebase (branch: users/wangpc-pp/spr/main.riscv-support-select-optimization)
Created using spr 1.3.6-beta.1 [skip ci]
-rw-r--r--  .github/workflows/libcxx-build-and-test.yaml | 8
-rw-r--r--  bolt/include/bolt/Core/DebugNames.h | 4
-rw-r--r--  bolt/lib/Core/DebugNames.cpp | 15
-rw-r--r--  bolt/test/X86/dwarf5-df-types-debug-names.test | 11
-rw-r--r--  clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp | 3
-rw-r--r--  clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt | 1
-rw-r--r--  clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.cpp | 81
-rw-r--r--  clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h | 41
-rw-r--r--  clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp | 5
-rw-r--r--  clang-tools-extra/docs/ReleaseNotes.rst | 10
-rw-r--r--  clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst | 68
-rw-r--r--  clang-tools-extra/docs/clang-tidy/checks/cert/ctr56-cpp.rst | 10
-rw-r--r--  clang-tools-extra/docs/clang-tidy/checks/list.rst | 1
-rw-r--r--  clang-tools-extra/test/clang-tidy/checkers/bugprone/pointer-arithmetic-on-polymorphic-object-all.cpp | 140
-rw-r--r--  clang-tools-extra/test/clang-tidy/checkers/bugprone/pointer-arithmetic-on-polymorphic-object-decl-only.cpp | 141
-rw-r--r--  clang/docs/ReleaseNotes.rst | 7
-rw-r--r--  clang/include/clang/Sema/Sema.h | 3
-rw-r--r--  clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h | 5
-rw-r--r--  clang/lib/AST/MicrosoftMangle.cpp | 73
-rw-r--r--  clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp | 6
-rw-r--r--  clang/lib/Sema/SemaDecl.cpp | 74
-rw-r--r--  clang/lib/Sema/SemaDeclAttr.cpp | 16
-rw-r--r--  clang/lib/Sema/SemaTemplate.cpp | 36
-rw-r--r--  clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp | 211
-rw-r--r--  clang/lib/StaticAnalyzer/Core/MemRegion.cpp | 11
-rw-r--r--  clang/test/AST/ast-dump-ctad-alias.cpp | 27
-rw-r--r--  clang/test/Analysis/bstring_UninitRead.c | 119
-rw-r--r--  clang/test/CodeGen/attr-target-version.c | 88
-rw-r--r--  clang/test/CodeGenCXX/fmv-namespace.cpp | 60
-rw-r--r--  clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp | 71
-rw-r--r--  clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp | 24
-rw-r--r--  clang/test/Sema/attr-target-version.c | 7
-rw-r--r--  clang/test/SemaCXX/cxx20-ctad-type-alias.cpp | 27
-rw-r--r--  clang/unittests/Frontend/CompilerInstanceTest.cpp | 2
-rw-r--r--  clang/unittests/Tooling/LexicallyOrderedRecursiveASTVisitorTest.cpp | 4
-rw-r--r--  clang/utils/TableGen/ClangAttrEmitter.cpp | 8
-rw-r--r--  clang/www/analyzer/alpha_checks.html | 939
-rw-r--r--  clang/www/analyzer/available_checks.html | 1743
-rw-r--r--  compiler-rt/include/profile/MemProfData.inc | 2
-rw-r--r--  compiler-rt/test/fuzzer/fuzzer-leak.test | 2
-rw-r--r--  flang/lib/Lower/OpenMP/DataSharingProcessor.cpp | 5
-rw-r--r--  flang/lib/Lower/OpenMP/OpenMP.cpp | 14
-rw-r--r--  libc/src/__support/FPUtil/generic/FMod.h | 3
-rw-r--r--  libc/src/__support/OSUtil/linux/fcntl.cpp | 19
-rw-r--r--  libc/src/__support/big_int.h | 4
-rw-r--r--  libc/src/sched/linux/sched_rr_get_interval.cpp | 2
-rw-r--r--  libc/test/src/__support/CMakeLists.txt | 2
-rw-r--r--  libc/test/src/__support/time/linux/timeout_test.cpp | 2
-rw-r--r--  libc/test/src/math/smoke/CopySignTest.h | 2
-rw-r--r--  libc/test/src/sys/mman/linux/CMakeLists.txt | 1
-rw-r--r--  libc/test/src/sys/mman/linux/shm_test.cpp | 5
-rw-r--r--  libcxx/docs/Status/Cxx20Issues.csv | 2
-rw-r--r--  libcxx/src/tzdb.cpp | 64
-rw-r--r--  libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp | 5
-rw-r--r--  libcxx/test/libcxx/time/time.zone/time.zone.db/leap_seconds.pass.cpp | 23
-rw-r--r--  libcxx/test/std/time/time.zone/time.zone.db/leap_seconds.pass.cpp | 55
-rw-r--r--  libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp | 2
-rw-r--r--  libcxx/trigger | 0
-rw-r--r--  lld/MinGW/Driver.cpp | 2
-rw-r--r--  lld/test/MinGW/driver.test | 4
-rw-r--r--  lldb/include/lldb/Target/Language.h | 4
-rw-r--r--  lldb/source/Commands/CommandObjectBreakpoint.cpp | 9
-rw-r--r--  lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp | 188
-rw-r--r--  lldb/source/Plugins/Language/CPlusPlus/LibCxx.h | 29
-rw-r--r--  lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp | 187
-rw-r--r--  lldb/source/Plugins/Language/ObjC/ObjCLanguage.h | 2
-rw-r--r--  lldb/test/API/python_api/thread/TestThreadAPI.py | 3
-rw-r--r--  lldb/unittests/ValueObject/DumpValueObjectOptionsTests.cpp | 80
-rw-r--r--  llvm/cmake/config.guess | 6
-rw-r--r--  llvm/docs/LangRef.rst | 31
-rw-r--r--  llvm/docs/ReleaseNotes.rst | 4
-rw-r--r--  llvm/include/llvm/Analysis/LazyValueInfo.h | 17
-rw-r--r--  llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 9
-rw-r--r--  llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 3
-rw-r--r--  llvm/include/llvm/CodeGen/IndirectThunks.h | 133
-rw-r--r--  llvm/include/llvm/CodeGen/SelectionDAG.h | 6
-rw-r--r--  llvm/include/llvm/CodeGen/TargetLowering.h | 12
-rw-r--r--  llvm/include/llvm/IR/Intrinsics.td | 6
-rw-r--r--  llvm/lib/Analysis/LazyValueInfo.cpp | 130
-rw-r--r--  llvm/lib/Analysis/LoopAccessAnalysis.cpp | 25
-rw-r--r--  llvm/lib/Analysis/ValueTracking.cpp | 19
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 107
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 13
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 32
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 35
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp | 7
-rw-r--r--  llvm/lib/Demangle/MicrosoftDemangle.cpp | 7
-rw-r--r--  llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 4
-rw-r--r--  llvm/lib/IR/Value.cpp | 5
-rw-r--r--  llvm/lib/IR/Verifier.cpp | 14
-rw-r--r--  llvm/lib/MC/MCELFStreamer.cpp | 33
-rw-r--r--  llvm/lib/MC/MCObjectStreamer.cpp | 6
-rw-r--r--  llvm/lib/SandboxIR/SandboxIR.cpp | 1
-rw-r--r--  llvm/lib/Support/APFloat.cpp | 375
-rw-r--r--  llvm/lib/Support/CommandLine.cpp | 22
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SLSHardening.cpp | 48
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 33
-rw-r--r--  llvm/lib/Target/ARM/ARMSLSHardening.cpp | 66
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 11
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 26
-rw-r--r--  llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 1
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 8
-rw-r--r--  llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 23
-rw-r--r--  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 17
-rw-r--r--  llvm/lib/Target/X86/X86IndirectThunks.cpp | 54
-rw-r--r--  llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 50
-rw-r--r--  llvm/lib/Transforms/Scalar/JumpThreading.cpp | 45
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp | 5
-rw-r--r--  llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 10
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 11
-rw-r--r--  llvm/test/Analysis/BasicAA/ptr-vector.ll | 12
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll | 118
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll | 857
-rw-r--r--  llvm/test/CodeGen/AArch64/fcmp.ll | 400
-rw-r--r--  llvm/test/CodeGen/AArch64/itofp-bf16.ll | 1832
-rw-r--r--  llvm/test/CodeGen/AArch64/itofp.ll | 3611
-rw-r--r--  llvm/test/CodeGen/AArch64/partial-reduction-add.ll | 83
-rw-r--r--  llvm/test/CodeGen/RISCV/convert-highly-predictable-select-to-branch.ll | 50
-rw-r--r--  llvm/test/CodeGen/RISCV/fold-binop-into-select-return-constant.ll | 14
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll | 138
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll | 140
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll | 36
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir | 44
-rw-r--r--  llvm/test/CodeGen/X86/O0-pipeline.ll | 1
-rw-r--r--  llvm/test/CodeGen/X86/addr-mode-matcher-3.ll | 28
-rw-r--r--  llvm/test/CodeGen/X86/opt-pipeline.ll | 1
-rw-r--r--  llvm/test/CodeGen/X86/shift-combine.ll | 10
-rw-r--r--  llvm/test/CodeGen/X86/sttni.ll | 20
-rw-r--r--  llvm/test/CodeGen/X86/var-permute-128.ll | 412
-rw-r--r--  llvm/test/CodeGen/X86/var-permute-256.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/var-permute-512.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/vselect-avx.ll | 44
-rw-r--r--  llvm/test/Demangle/ms-auto-templates.test | 42
-rw-r--r--  llvm/test/MC/X86/AMX/amx-error.s | 25
-rw-r--r--  llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll | 21
-rw-r--r--  llvm/test/Transforms/InstCombine/X86/x86-pmulh.ll | 37
-rw-r--r--  llvm/test/Transforms/InstCombine/X86/x86-pmulhu.ll | 37
-rw-r--r--  llvm/test/Transforms/LoopFlatten/preserving_debugloc_br.ll | 74
-rw-r--r--  llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll | 1
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll | 73
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll | 76
-rw-r--r--  llvm/test/Transforms/LowerConstantIntrinsics/preserving-debugloc-br.ll | 48
-rw-r--r--  llvm/test/Transforms/Mem2Reg/single-store.ll | 66
-rw-r--r--  llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll | 425
-rw-r--r--  llvm/test/tools/llvm-config/paths.test | 8
-rw-r--r--  llvm/tools/llvm-config/llvm-config.cpp | 70
-rw-r--r--  llvm/unittests/TargetParser/TargetParserTest.cpp | 15
-rw-r--r--  llvm/utils/TableGen/ARMTargetDefEmitter.cpp | 7
-rw-r--r--  llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn | 1
-rw-r--r--  mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml | 2
-rw-r--r--  mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h | 7
-rw-r--r--  mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td | 28
-rw-r--r--  mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 17
-rw-r--r--  mlir/include/mlir/Dialect/SCF/IR/SCFOps.td | 3
-rw-r--r--  mlir/include/mlir/Dialect/SCF/Utils/Utils.h | 20
-rw-r--r--  mlir/include/mlir/Dialect/Vector/IR/VectorOps.td | 25
-rw-r--r--  mlir/include/mlir/Interfaces/LoopLikeInterface.h | 20
-rw-r--r--  mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp | 17
-rw-r--r--  mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 9
-rw-r--r--  mlir/lib/Dialect/SCF/IR/SCF.cpp | 38
-rw-r--r--  mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp | 140
-rw-r--r--  mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp | 80
-rw-r--r--  mlir/lib/Dialect/SCF/Utils/Utils.cpp | 279
-rw-r--r--  mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 14
-rw-r--r--  mlir/lib/Interfaces/LoopLikeInterface.cpp | 59
-rw-r--r--  mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py | 4
-rw-r--r--  mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir | 11
-rw-r--r--  mlir/test/Dialect/OpenMP/invalid.mlir | 18
-rw-r--r--  mlir/test/Dialect/OpenMP/ops.mlir | 14
-rw-r--r--  mlir/test/Dialect/SCF/transform-loop-fuse-sibling.mlir | 290
-rw-r--r--  mlir/test/Dialect/Vector/canonicalize.mlir | 10
-rw-r--r--  mlir/test/Dialect/Vector/invalid.mlir | 16
-rw-r--r--  mlir/test/Dialect/Vector/ops.mlir | 11
-rw-r--r--  mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py | 4
179 files changed, 9304 insertions, 7043 deletions
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index d7c2139..1456f24 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -63,8 +63,8 @@ jobs:
cxx: [ 'clang++-19' ]
include:
- config: 'generic-gcc'
- cc: 'gcc-13'
- cxx: 'g++-13'
+ cc: 'gcc-14'
+ cxx: 'g++-14'
steps:
- uses: actions/checkout@v4
- name: ${{ matrix.config }}.${{ matrix.cxx }}
@@ -101,8 +101,8 @@ jobs:
cxx: [ 'clang++-19' ]
include:
- config: 'generic-gcc-cxx11'
- cc: 'gcc-13'
- cxx: 'g++-13'
+ cc: 'gcc-14'
+ cxx: 'g++-14'
- config: 'generic-cxx23'
cc: 'clang-17'
cxx: 'clang++-17'
diff --git a/bolt/include/bolt/Core/DebugNames.h b/bolt/include/bolt/Core/DebugNames.h
index a14a305..0e61a0e 100644
--- a/bolt/include/bolt/Core/DebugNames.h
+++ b/bolt/include/bolt/Core/DebugNames.h
@@ -91,6 +91,10 @@ private:
uint64_t CurrentUnitOffset = 0;
const DWARFUnit *CurrentUnit = nullptr;
std::unordered_map<uint32_t, uint32_t> AbbrevTagToIndexMap;
+ /// Contains a map of TU hashes to Foreign TU indices.
+ /// This is used to reduce the size of Foreign TU list since there could be
+ /// multiple TUs with the same hash.
+ DenseMap<uint64_t, uint32_t> TUHashToIndexMap;
/// Represents a group of entries with identical name (and hence, hash value).
struct HashData {
diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp
index ebe895e..640b29e 100644
--- a/bolt/lib/Core/DebugNames.cpp
+++ b/bolt/lib/Core/DebugNames.cpp
@@ -90,7 +90,11 @@ void DWARF5AcceleratorTable::addUnit(DWARFUnit &Unit,
auto Iter = CUOffsetsToPatch.insert({*DWOID, CUList.size()});
if (Iter.second)
CUList.push_back(BADCUOFFSET);
- ForeignTUList.push_back(cast<DWARFTypeUnit>(&Unit)->getTypeHash());
+ const uint64_t TUHash = cast<DWARFTypeUnit>(&Unit)->getTypeHash();
+ if (!TUHashToIndexMap.count(TUHash)) {
+ TUHashToIndexMap.insert({TUHash, ForeignTUList.size()});
+ ForeignTUList.push_back(TUHash);
+ }
} else {
LocalTUList.push_back(CurrentUnitOffset);
}
@@ -231,8 +235,13 @@ DWARF5AcceleratorTable::addAccelTableEntry(
IsTU = Unit.isTypeUnit();
DieTag = Die.getTag();
if (IsTU) {
- if (DWOID)
- return ForeignTUList.size() - 1;
+ if (DWOID) {
+ const uint64_t TUHash = cast<DWARFTypeUnit>(&Unit)->getTypeHash();
+ auto Iter = TUHashToIndexMap.find(TUHash);
+ assert(Iter != TUHashToIndexMap.end() &&
+ "Could not find TU hash in map");
+ return Iter->second;
+ }
return LocalTUList.size() - 1;
}
return CUList.size() - 1;
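
The two hunks above keep ForeignTUList free of duplicates by remembering, per TU hash, the index at which the hash was first appended. A minimal standalone sketch of that de-duplication pattern (using std::unordered_map and std::vector for illustration rather than BOLT's DenseMap):

    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    // Returns the index of Hash in List, appending it only the first time it is seen.
    static uint32_t getOrInsertTUIndex(uint64_t Hash, std::vector<uint64_t> &List,
                                       std::unordered_map<uint64_t, uint32_t> &Index) {
      auto [It, Inserted] = Index.try_emplace(Hash, static_cast<uint32_t>(List.size()));
      if (Inserted)
        List.push_back(Hash); // first occurrence: extend the list
      return It->second;      // duplicates reuse the recorded slot
    }

The lookup in addAccelTableEntry then resolves a hash back to its slot instead of assuming it was the last pushed element, which is what the assert in the second hunk guards.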
diff --git a/bolt/test/X86/dwarf5-df-types-debug-names.test b/bolt/test/X86/dwarf5-df-types-debug-names.test
index f5a2c9c..7c1c8e4 100644
--- a/bolt/test/X86/dwarf5-df-types-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-types-debug-names.test
@@ -18,19 +18,19 @@
; BOLT: type_signature = [[TYPE1:0x[0-9a-f]*]]
; BOLT: Compile Unit
; BOLT: type_signature = [[TYPE2:0x[0-9a-f]*]]
-; BOLT: type_signature = [[TYPE3:0x[0-9a-f]*]]
+; BOLT: type_signature = [[TYPE1]]
; BOLT: Compile Unit
; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
; BOLT: Name Index @ 0x0 {
; BOLT-NEXT: Header {
-; BOLT-NEXT: Length: 0x17E
+; BOLT-NEXT: Length: 0x176
; BOLT-NEXT: Format: DWARF32
; BOLT-NEXT: Version: 5
; BOLT-NEXT: CU count: 2
; BOLT-NEXT: Local TU count: 0
-; BOLT-NEXT: Foreign TU count: 4
+; BOLT-NEXT: Foreign TU count: 3
; BOLT-NEXT: Bucket count: 9
; BOLT-NEXT: Name count: 9
; BOLT-NEXT: Abbreviations table size: 0x37
@@ -44,7 +44,6 @@
; BOLT-NEXT: ForeignTU[0]: [[TYPE]]
; BOLT-NEXT: ForeignTU[1]: [[TYPE1]]
; BOLT-NEXT: ForeignTU[2]: [[TYPE2]]
-; BOLT-NEXT: ForeignTU[3]: [[TYPE3]]
; BOLT-NEXT: ]
; BOLT-NEXT: Abbreviations [
; BOLT-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] {
@@ -173,7 +172,7 @@
; BOLT-NEXT: Entry @ {{.+}} {
; BOLT-NEXT: Abbrev: [[ABBREV]]
; BOLT-NEXT: Tag: DW_TAG_structure_type
-; BOLT-NEXT: DW_IDX_type_unit: 0x03
+; BOLT-NEXT: DW_IDX_type_unit: 0x01
; BOLT-NEXT: DW_IDX_compile_unit: 0x01
; BOLT-NEXT: DW_IDX_die_offset: 0x00000021
; BOLT-NEXT: DW_IDX_parent: <parent not indexed>
@@ -237,7 +236,7 @@
; BOLT-NEXT: Entry @ {{.+}} {
; BOLT-NEXT: Abbrev: 0x5
; BOLT-NEXT: Tag: DW_TAG_base_type
-; BOLT-NEXT: DW_IDX_type_unit: 0x03
+; BOLT-NEXT: DW_IDX_type_unit: 0x01
; BOLT-NEXT: DW_IDX_compile_unit: 0x01
; BOLT-NEXT: DW_IDX_die_offset: 0x00000048
; BOLT-NEXT: DW_IDX_parent: <parent not indexed>
diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index 1b92d2e..689eb92 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -51,6 +51,7 @@
#include "NotNullTerminatedResultCheck.h"
#include "OptionalValueConversionCheck.h"
#include "ParentVirtualCallCheck.h"
+#include "PointerArithmeticOnPolymorphicObjectCheck.h"
#include "PosixReturnCheck.h"
#include "RedundantBranchConditionCheck.h"
#include "ReservedIdentifierCheck.h"
@@ -171,6 +172,8 @@ public:
"bugprone-multiple-statement-macro");
CheckFactories.registerCheck<OptionalValueConversionCheck>(
"bugprone-optional-value-conversion");
+ CheckFactories.registerCheck<PointerArithmeticOnPolymorphicObjectCheck>(
+ "bugprone-pointer-arithmetic-on-polymorphic-object");
CheckFactories.registerCheck<RedundantBranchConditionCheck>(
"bugprone-redundant-branch-condition");
CheckFactories.registerCheck<cppcoreguidelines::NarrowingConversionsCheck>(
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index 2d30319..cb0d8ae 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -48,6 +48,7 @@ add_clang_library(clangTidyBugproneModule
NotNullTerminatedResultCheck.cpp
OptionalValueConversionCheck.cpp
ParentVirtualCallCheck.cpp
+ PointerArithmeticOnPolymorphicObjectCheck.cpp
PosixReturnCheck.cpp
RedundantBranchConditionCheck.cpp
ReservedIdentifierCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.cpp
new file mode 100644
index 0000000..6e6ad10
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.cpp
@@ -0,0 +1,81 @@
+//===--- PointerArithmeticOnPolymorphicObjectCheck.cpp - clang-tidy--------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PointerArithmeticOnPolymorphicObjectCheck.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+
+using namespace clang::ast_matchers;
+
+namespace clang::tidy::bugprone {
+
+namespace {
+AST_MATCHER(CXXRecordDecl, isAbstract) { return Node.isAbstract(); }
+AST_MATCHER(CXXRecordDecl, isPolymorphic) { return Node.isPolymorphic(); }
+} // namespace
+
+PointerArithmeticOnPolymorphicObjectCheck::
+ PointerArithmeticOnPolymorphicObjectCheck(StringRef Name,
+ ClangTidyContext *Context)
+ : ClangTidyCheck(Name, Context),
+ IgnoreInheritedVirtualFunctions(
+ Options.get("IgnoreInheritedVirtualFunctions", false)) {}
+
+void PointerArithmeticOnPolymorphicObjectCheck::storeOptions(
+ ClangTidyOptions::OptionMap &Opts) {
+ Options.store(Opts, "IgnoreInheritedVirtualFunctions",
+ IgnoreInheritedVirtualFunctions);
+}
+
+void PointerArithmeticOnPolymorphicObjectCheck::registerMatchers(
+ MatchFinder *Finder) {
+ const auto PolymorphicPointerExpr =
+ expr(hasType(hasCanonicalType(pointerType(pointee(hasCanonicalType(
+ hasDeclaration(cxxRecordDecl(unless(isFinal()), isPolymorphic())
+ .bind("pointee"))))))))
+ .bind("pointer");
+
+ const auto PointerExprWithVirtualMethod =
+ expr(hasType(hasCanonicalType(
+ pointerType(pointee(hasCanonicalType(hasDeclaration(
+ cxxRecordDecl(
+ unless(isFinal()),
+ anyOf(hasMethod(isVirtualAsWritten()), isAbstract()))
+ .bind("pointee"))))))))
+ .bind("pointer");
+
+ const auto SelectedPointerExpr = IgnoreInheritedVirtualFunctions
+ ? PointerExprWithVirtualMethod
+ : PolymorphicPointerExpr;
+
+ const auto ArraySubscript = arraySubscriptExpr(hasBase(SelectedPointerExpr));
+
+ const auto BinaryOperators =
+ binaryOperator(hasAnyOperatorName("+", "-", "+=", "-="),
+ hasEitherOperand(SelectedPointerExpr));
+
+ const auto UnaryOperators = unaryOperator(
+ hasAnyOperatorName("++", "--"), hasUnaryOperand(SelectedPointerExpr));
+
+ Finder->addMatcher(ArraySubscript, this);
+ Finder->addMatcher(BinaryOperators, this);
+ Finder->addMatcher(UnaryOperators, this);
+}
+
+void PointerArithmeticOnPolymorphicObjectCheck::check(
+ const MatchFinder::MatchResult &Result) {
+ const auto *PointerExpr = Result.Nodes.getNodeAs<Expr>("pointer");
+ const auto *PointeeDecl = Result.Nodes.getNodeAs<CXXRecordDecl>("pointee");
+
+ diag(PointerExpr->getBeginLoc(),
+ "pointer arithmetic on polymorphic object of type %0 can result in "
+ "undefined behavior if the dynamic type differs from the pointer type")
+ << PointeeDecl << PointerExpr->getSourceRange();
+}
+
+} // namespace clang::tidy::bugprone
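
As context (not part of the patch), the new check would typically be enabled from a `.clang-tidy` file; a hypothetical configuration mirroring the option exercised by the decl-only test below might look like:

    Checks: 'bugprone-pointer-arithmetic-on-polymorphic-object'
    CheckOptions:
      bugprone-pointer-arithmetic-on-polymorphic-object.IgnoreInheritedVirtualFunctions: true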
diff --git a/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h b/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h
new file mode 100644
index 0000000..84f2d8e
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h
@@ -0,0 +1,41 @@
+//===--- PointerArithmeticOnPolymorphicObjectCheck.h ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POINTERARITHMETICONPOLYMORPHICOBJECTCHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POINTERARITHMETICONPOLYMORPHICOBJECTCHECK_H
+
+#include "../ClangTidyCheck.h"
+
+namespace clang::tidy::bugprone {
+
+/// Finds pointer arithmetic performed on classes that contain a
+/// virtual function.
+///
+/// For the user-facing documentation see:
+/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.html
+class PointerArithmeticOnPolymorphicObjectCheck : public ClangTidyCheck {
+public:
+ PointerArithmeticOnPolymorphicObjectCheck(StringRef Name,
+ ClangTidyContext *Context);
+ void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+ void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+ void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+ bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
+ return LangOpts.CPlusPlus;
+ }
+ std::optional<TraversalKind> getCheckTraversalKind() const override {
+ return TK_IgnoreUnlessSpelledInSource;
+ }
+
+private:
+ const bool IgnoreInheritedVirtualFunctions;
+};
+
+} // namespace clang::tidy::bugprone
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POINTERARITHMETICONPOLYMORPHICOBJECTCHECK_H
diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
index 00370ee..ffb62b4 100644
--- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
@@ -10,6 +10,7 @@
#include "../ClangTidyModule.h"
#include "../ClangTidyModuleRegistry.h"
#include "../bugprone/BadSignalToKillThreadCheck.h"
+#include "../bugprone/PointerArithmeticOnPolymorphicObjectCheck.h"
#include "../bugprone/ReservedIdentifierCheck.h"
#include "../bugprone/SignalHandlerCheck.h"
#include "../bugprone/SignedCharMisuseCheck.h"
@@ -238,6 +239,10 @@ public:
// CON
CheckFactories.registerCheck<bugprone::SpuriouslyWakeUpFunctionsCheck>(
"cert-con54-cpp");
+ // CTR
+ CheckFactories
+ .registerCheck<bugprone::PointerArithmeticOnPolymorphicObjectCheck>(
+ "cert-ctr56-cpp");
// DCL
CheckFactories.registerCheck<VariadicFunctionDefCheck>("cert-dcl50-cpp");
CheckFactories.registerCheck<bugprone::ReservedIdentifierCheck>(
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index fe81973..e570c81 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -137,6 +137,11 @@ New checks
Detects error-prone Curiously Recurring Template Pattern usage, when the CRTP
can be constructed outside itself and the derived class.
+- New :doc:`bugprone-pointer-arithmetic-on-polymorphic-object
+ <clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object>` check.
+
+ Finds pointer arithmetic performed on classes that contain a virtual function.
+
- New :doc:`bugprone-return-const-ref-from-parameter
<clang-tidy/checks/bugprone/return-const-ref-from-parameter>` check.
@@ -199,6 +204,11 @@ New checks
New check aliases
^^^^^^^^^^^^^^^^^
+- New alias :doc:`cert-ctr56-cpp <clang-tidy/checks/cert/ctr56-cpp>` to
+ :doc:`bugprone-pointer-arithmetic-on-polymorphic-object
+ <clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object>`
+ was added.
+
- New alias :doc:`cert-int09-c <clang-tidy/checks/cert/int09-c>` to
:doc:`readability-enum-initial-value <clang-tidy/checks/readability/enum-initial-value>`
was added.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst
new file mode 100644
index 0000000..1884acd
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst
@@ -0,0 +1,68 @@
+.. title:: clang-tidy - bugprone-pointer-arithmetic-on-polymorphic-object
+
+bugprone-pointer-arithmetic-on-polymorphic-object
+=================================================
+
+Finds pointer arithmetic performed on classes that contain a virtual function.
+
+Pointer arithmetic on polymorphic objects where the pointer's static type is
+different from its dynamic type is undefined behavior, as the two types could
+have different sizes, and thus the vtable pointer could point to an
+invalid address.
+
+Finding pointers where the static type contains a virtual member function is a
+good heuristic, as the pointer is likely to point to a different,
+derived object.
+
+Example:
+
+.. code-block:: c++
+
+ struct Base {
+ virtual ~Base();
+ };
+
+ struct Derived : public Base {};
+
+ void foo() {
+ Base *b = new Derived[10];
+
+ b += 1;
+ // warning: pointer arithmetic on class that declares a virtual function can
+ // result in undefined behavior if the dynamic type differs from the
+ // pointer type
+
+ delete[] static_cast<Derived*>(b);
+ }
+
+Options
+-------
+
+.. option:: IgnoreInheritedVirtualFunctions
+
+ When `true`, objects that only inherit a virtual function are not checked.
+ Classes that do not declare a new virtual function are excluded
+ by default, as they make up the majority of false positives.
+ Default: `false`.
+
+ .. code-block:: c++
+
+ void bar() {
+ Base *b = new Base[10];
+ b += 1; // warning, as Base declares a virtual destructor
+
+ delete[] b;
+
+ Derived *d = new Derived[10]; // Derived overrides the destructor, and
+ // declares no other virtual functions
+ d += 1; // warning only if IgnoreInheritedVirtualFunctions is set to false
+
+ delete[] d;
+ }
+
+References
+----------
+
+This check corresponds to the SEI Cert rule
+`CTR56-CPP. Do not use pointer arithmetic on polymorphic objects
+<https://wiki.sei.cmu.edu/confluence/display/cplusplus/CTR56-CPP.+Do+not+use+pointer+arithmetic+on+polymorphic+objects>`_.
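
Beyond the documented example (this is illustrative and not taken from the patch), one way to satisfy the check is to avoid striding over arrays of polymorphic objects altogether and hold each element behind its own owning pointer:

    #include <memory>
    #include <vector>

    struct Base { virtual ~Base() = default; };
    struct Derived : Base {};

    void no_pointer_arithmetic() {
      // The container does the indexing; no pointer arithmetic is ever
      // performed on a Base* or Derived*, so there is nothing to flag.
      std::vector<std::unique_ptr<Base>> objs;
      for (int i = 0; i < 10; ++i)
        objs.push_back(std::make_unique<Derived>());

      for (const std::unique_ptr<Base> &obj : objs)
        (void)*obj; // use *obj polymorphically here
    }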
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/ctr56-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/ctr56-cpp.rst
new file mode 100644
index 0000000..e42acbe
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/cert/ctr56-cpp.rst
@@ -0,0 +1,10 @@
+.. title:: clang-tidy - cert-ctr56-cpp
+.. meta::
+ :http-equiv=refresh: 5;URL=../bugprone/pointer-arithmetic-on-polymorphic-object.html
+
+cert-ctr56-cpp
+==============
+
+The `cert-ctr56-cpp` check is an alias, please see
+:doc:`bugprone-pointer-arithmetic-on-polymorphic-object
+<../bugprone/pointer-arithmetic-on-polymorphic-object>` for more information.
\ No newline at end of file
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index a698cec..9671f38 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -157,6 +157,7 @@ Clang-Tidy Checks
:doc:`bugprone-unused-raii <bugprone/unused-raii>`, "Yes"
:doc:`bugprone-unused-return-value <bugprone/unused-return-value>`,
:doc:`bugprone-use-after-move <bugprone/use-after-move>`,
+ :doc:`bugprone-pointer-arithmetic-on-polymorphic-object <bugprone/pointer-arithmetic-on-polymorphic-object>`,
:doc:`bugprone-virtual-near-miss <bugprone/virtual-near-miss>`, "Yes"
:doc:`cert-dcl50-cpp <cert/dcl50-cpp>`,
:doc:`cert-dcl58-cpp <cert/dcl58-cpp>`,
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/pointer-arithmetic-on-polymorphic-object-all.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/pointer-arithmetic-on-polymorphic-object-all.cpp
new file mode 100644
index 0000000..caf24f7
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/pointer-arithmetic-on-polymorphic-object-all.cpp
@@ -0,0 +1,140 @@
+// RUN: %check_clang_tidy %s bugprone-pointer-arithmetic-on-polymorphic-object %t --
+
+class Base {
+public:
+ virtual ~Base() {}
+};
+
+class Derived : public Base {};
+
+class FinalDerived final : public Base {};
+
+class AbstractBase {
+public:
+ virtual void f() = 0;
+ virtual ~AbstractBase() {}
+};
+
+class AbstractInherited : public AbstractBase {};
+
+class AbstractOverride : public AbstractInherited {
+public:
+ void f() override {}
+};
+
+void operators() {
+ Base *b = new Derived[10];
+
+ b += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base' can result in undefined behavior if the dynamic type differs from the pointer type [bugprone-pointer-arithmetic-on-polymorphic-object]
+
+ b = b + 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: pointer arithmetic on polymorphic object of type 'Base' can result in undefined behavior if the dynamic type differs from the pointer type [bugprone-pointer-arithmetic-on-polymorphic-object]
+
+ b++;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base' can result in undefined behavior if the dynamic type differs from the pointer type [bugprone-pointer-arithmetic-on-polymorphic-object]
+
+ --b;
+ // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: pointer arithmetic on polymorphic object of type 'Base' can result in undefined behavior if the dynamic type differs from the pointer type [bugprone-pointer-arithmetic-on-polymorphic-object]
+
+ b[1];
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base' can result in undefined behavior if the dynamic type differs from the pointer type [bugprone-pointer-arithmetic-on-polymorphic-object]
+
+ delete[] static_cast<Derived*>(b);
+}
+
+void subclassWarnings() {
+ Base *b = new Base[10];
+
+ // False positive that's impossible to distinguish without
+ // path-sensitive analysis, but the code is bug-prone regardless.
+ b += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base'
+
+ delete[] b;
+
+ // Common false positive is a class that overrides all parent functions.
+ // Is a warning because of the check configuration.
+ Derived *d = new Derived[10];
+
+ d += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Derived'
+
+ delete[] d;
+
+ // Final classes cannot have a dynamic type.
+ FinalDerived *fd = new FinalDerived[10];
+
+ fd += 1;
+ // no-warning
+
+ delete[] fd;
+}
+
+void abstractWarnings() {
+ // Classes with an abstract member function are always matched.
+ AbstractBase *ab = new AbstractOverride[10];
+
+ ab += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'AbstractBase'
+
+ delete[] static_cast<AbstractOverride*>(ab);
+
+ AbstractInherited *ai = new AbstractOverride[10];
+
+ ai += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'AbstractInherited'
+
+ delete[] static_cast<AbstractOverride*>(ai);
+
+ // Is a warning because of the check configuration.
+ AbstractOverride *ao = new AbstractOverride[10];
+
+ ao += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'AbstractOverride'
+
+ delete[] ao;
+}
+
+template <typename T>
+void templateWarning(T *t) {
+ // FIXME: Tidy doesn't support template instantiation locations properly.
+ t += 1;
+ // no-warning
+}
+
+void functionArgument(Base *b) {
+ b += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base'
+
+ templateWarning(b);
+}
+
+using BaseAlias = Base;
+using DerivedAlias = Derived;
+using FinalDerivedAlias = FinalDerived;
+
+using BasePtr = Base*;
+using DerivedPtr = Derived*;
+using FinalDerivedPtr = FinalDerived*;
+
+void typeAliases(BaseAlias *b, DerivedAlias *d, FinalDerivedAlias *fd,
+ BasePtr bp, DerivedPtr dp, FinalDerivedPtr fdp) {
+ b += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base'
+
+ d += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Derived'
+
+ fd += 1;
+ // no-warning
+
+ bp += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base'
+
+ dp += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Derived'
+
+ fdp += 1;
+ // no-warning
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/pointer-arithmetic-on-polymorphic-object-decl-only.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/pointer-arithmetic-on-polymorphic-object-decl-only.cpp
new file mode 100644
index 0000000..48757bb
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/pointer-arithmetic-on-polymorphic-object-decl-only.cpp
@@ -0,0 +1,141 @@
+// RUN: %check_clang_tidy %s bugprone-pointer-arithmetic-on-polymorphic-object %t -- \
+// RUN: -config="{CheckOptions: \
+// RUN: {bugprone-pointer-arithmetic-on-polymorphic-object.IgnoreInheritedVirtualFunctions: true}}"
+
+class Base {
+public:
+ virtual ~Base() {}
+};
+
+class Derived : public Base {};
+
+class FinalDerived final : public Base {};
+
+class AbstractBase {
+public:
+ virtual void f() = 0;
+ virtual ~AbstractBase() {}
+};
+
+class AbstractInherited : public AbstractBase {};
+
+class AbstractOverride : public AbstractInherited {
+public:
+ void f() override {}
+};
+
+void operators() {
+ Base *b = new Derived[10];
+
+ b += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base' can result in undefined behavior if the dynamic type differs from the pointer type [bugprone-pointer-arithmetic-on-polymorphic-object]
+
+ b = b + 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: pointer arithmetic on polymorphic object of type 'Base' can result in undefined behavior if the dynamic type differs from the pointer type [bugprone-pointer-arithmetic-on-polymorphic-object]
+
+ b++;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base' can result in undefined behavior if the dynamic type differs from the pointer type [bugprone-pointer-arithmetic-on-polymorphic-object]
+
+ --b;
+ // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: pointer arithmetic on polymorphic object of type 'Base' can result in undefined behavior if the dynamic type differs from the pointer type [bugprone-pointer-arithmetic-on-polymorphic-object]
+
+ b[1];
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base' can result in undefined behavior if the dynamic type differs from the pointer type [bugprone-pointer-arithmetic-on-polymorphic-object]
+
+ delete[] static_cast<Derived*>(b);
+}
+
+void subclassWarnings() {
+ Base *b = new Base[10];
+
+ // False positive that's impossible to distinguish without
+ // path-sensitive analysis, but the code is bug-prone regardless.
+ b += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base'
+
+ delete[] b;
+
+ // Common false positive is a class that overrides all parent functions.
+ Derived *d = new Derived[10];
+
+ d += 1;
+ // no-warning
+
+ delete[] d;
+
+ // Final classes cannot have a dynamic type.
+ FinalDerived *fd = new FinalDerived[10];
+
+ fd += 1;
+ // no-warning
+
+ delete[] fd;
+}
+
+void abstractWarnings() {
+ // Classes with an abstract member function are always matched.
+ AbstractBase *ab = new AbstractOverride[10];
+
+ ab += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'AbstractBase'
+
+ delete[] static_cast<AbstractOverride*>(ab);
+
+ AbstractInherited *ai = new AbstractOverride[10];
+
+ ai += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'AbstractInherited'
+
+ delete[] static_cast<AbstractOverride*>(ai);
+
+ // If all abstract member functions are overridden, the class is not matched.
+ AbstractOverride *ao = new AbstractOverride[10];
+
+ ao += 1;
+ // no-warning
+
+ delete[] ao;
+}
+
+template <typename T>
+void templateWarning(T *t) {
+ // FIXME: Tidy doesn't support template instantiation locations properly.
+ t += 1;
+ // no-warning
+}
+
+void functionArgument(Base *b) {
+ b += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base'
+
+ templateWarning(b);
+}
+
+using BaseAlias = Base;
+using DerivedAlias = Derived;
+using FinalDerivedAlias = FinalDerived;
+
+using BasePtr = Base*;
+using DerivedPtr = Derived*;
+using FinalDerivedPtr = FinalDerived*;
+
+void typeAliases(BaseAlias *b, DerivedAlias *d, FinalDerivedAlias *fd,
+ BasePtr bp, DerivedPtr dp, FinalDerivedPtr fdp) {
+ b += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base'
+
+ d += 1;
+ // no-warning
+
+ fd += 1;
+ // no-warning
+
+ bp += 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: pointer arithmetic on polymorphic object of type 'Base'
+
+ dp += 1;
+ // no-warning
+
+ fdp += 1;
+ // no-warning
+}
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f149684..36cf615a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -106,6 +106,13 @@ ABI Changes in This Version
earlier versions of Clang unless such code is built with the compiler option
`-fms-compatibility-version=19.14` to imitate the MSVC 1914 mangling behavior.
+- Fixed Microsoft name mangling for auto non-type template arguments of pointer
+ to member type for MSVC 1920+. This change resolves incompatibilities with code
+ compiled by MSVC 1920+ but will introduce incompatibilities with code compiled by
+ earlier versions of Clang unless such code is built with the compiler option
+ `-fms-compatibility-version=19.14` to imitate the MSVC 1914 mangling behavior.
+ (GH#70899).
+
AST Dumping Potentially Breaking Changes
----------------------------------------
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 0dbc819..fb3a5d2 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -4533,8 +4533,7 @@ public:
bool checkTargetAttr(SourceLocation LiteralLoc, StringRef Str);
/// Check Target Version attrs
- bool checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
- StringRef &Str, bool &isDefault);
+ bool checkTargetVersionAttr(SourceLocation Loc, Decl *D, StringRef Str);
bool checkTargetClonesAttrString(
SourceLocation LiteralLoc, StringRef Str, const StringLiteral *Literal,
Decl *D, bool &HasDefault, bool &HasCommas, bool &HasNotDefault,
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
index c0d3fbd..0d95662 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
@@ -34,6 +34,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstdint>
#include <limits>
@@ -99,6 +100,8 @@ public:
#define REGION(Id, Parent) Id ## Kind,
#define REGION_RANGE(Id, First, Last) BEGIN_##Id = First, END_##Id = Last,
#include "clang/StaticAnalyzer/Core/PathSensitive/Regions.def"
+#undef REGION
+#undef REGION_RANGE
};
private:
@@ -171,6 +174,8 @@ public:
Kind getKind() const { return kind; }
+ StringRef getKindStr() const;
+
template<typename RegionTy> const RegionTy* getAs() const;
template <typename RegionTy>
LLVM_ATTRIBUTE_RETURNS_NONNULL const RegionTy *castAs() const;
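
The added #undef lines and the new getKindStr() declaration fit the usual .def X-macro pattern: the same REGION list expands once into enumerators and can be expanded again into strings. A self-contained sketch of the technique (the real code includes Regions.def and implements getKindStr() in MemRegion.cpp; the list macro and names below are hypothetical):

    #define REGION_LIST(X)                                                       \
      X(Stack, MemSpace)                                                         \
      X(Heap, MemSpace)

    enum Kind {
    #define REGION(Id, Parent) Id##Kind,
      REGION_LIST(REGION)
    #undef REGION // undef so the macro name can be reused by later expansions
    };

    inline const char *getKindStr(Kind K) {
      switch (K) {
    #define REGION(Id, Parent)                                                   \
      case Id##Kind:                                                             \
        return #Id;
        REGION_LIST(REGION)
    #undef REGION
      }
      return "<unknown>";
    }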
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 7f1e9ab..fac14ce 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -368,11 +368,15 @@ public:
void mangleFunctionEncoding(GlobalDecl GD, bool ShouldMangle);
void mangleVariableEncoding(const VarDecl *VD);
void mangleMemberDataPointer(const CXXRecordDecl *RD, const ValueDecl *VD,
+ const NonTypeTemplateParmDecl *PD,
+ QualType TemplateArgType,
StringRef Prefix = "$");
void mangleMemberDataPointerInClassNTTP(const CXXRecordDecl *,
const ValueDecl *);
void mangleMemberFunctionPointer(const CXXRecordDecl *RD,
const CXXMethodDecl *MD,
+ const NonTypeTemplateParmDecl *PD,
+ QualType TemplateArgType,
StringRef Prefix = "$");
void mangleFunctionPointer(const FunctionDecl *FD,
const NonTypeTemplateParmDecl *PD,
@@ -673,12 +677,17 @@ void MicrosoftCXXNameMangler::mangleVariableEncoding(const VarDecl *VD) {
}
}
-void MicrosoftCXXNameMangler::mangleMemberDataPointer(const CXXRecordDecl *RD,
- const ValueDecl *VD,
- StringRef Prefix) {
+void MicrosoftCXXNameMangler::mangleMemberDataPointer(
+ const CXXRecordDecl *RD, const ValueDecl *VD,
+ const NonTypeTemplateParmDecl *PD, QualType TemplateArgType,
+ StringRef Prefix) {
// <member-data-pointer> ::= <integer-literal>
// ::= $F <number> <number>
// ::= $G <number> <number> <number>
+ //
+ // <auto-nttp> ::= $ M <type> <integer-literal>
+ // <auto-nttp> ::= $ M <type> F <name> <number>
+ // <auto-nttp> ::= $ M <type> G <name> <number> <number>
int64_t FieldOffset;
int64_t VBTableOffset;
@@ -707,7 +716,18 @@ void MicrosoftCXXNameMangler::mangleMemberDataPointer(const CXXRecordDecl *RD,
case MSInheritanceModel::Unspecified: Code = 'G'; break;
}
- Out << Prefix << Code;
+ Out << Prefix;
+
+ if (VD &&
+ getASTContext().getLangOpts().isCompatibleWithMSVC(
+ LangOptions::MSVC2019) &&
+ PD && PD->getType()->getTypeClass() == Type::Auto &&
+ !TemplateArgType.isNull()) {
+ Out << "M";
+ mangleType(TemplateArgType, SourceRange(), QMM_Drop);
+ }
+
+ Out << Code;
mangleNumber(FieldOffset);
@@ -728,7 +748,7 @@ void MicrosoftCXXNameMangler::mangleMemberDataPointerInClassNTTP(
// ::= 8 <postfix> @ <unqualified-name> @
if (IM != MSInheritanceModel::Single && IM != MSInheritanceModel::Multiple)
- return mangleMemberDataPointer(RD, VD, "");
+ return mangleMemberDataPointer(RD, VD, nullptr, QualType(), "");
if (!VD) {
Out << 'N';
@@ -742,14 +762,19 @@ void MicrosoftCXXNameMangler::mangleMemberDataPointerInClassNTTP(
Out << '@';
}
-void
-MicrosoftCXXNameMangler::mangleMemberFunctionPointer(const CXXRecordDecl *RD,
- const CXXMethodDecl *MD,
- StringRef Prefix) {
+void MicrosoftCXXNameMangler::mangleMemberFunctionPointer(
+ const CXXRecordDecl *RD, const CXXMethodDecl *MD,
+ const NonTypeTemplateParmDecl *PD, QualType TemplateArgType,
+ StringRef Prefix) {
// <member-function-pointer> ::= $1? <name>
// ::= $H? <name> <number>
// ::= $I? <name> <number> <number>
// ::= $J? <name> <number> <number> <number>
+ //
+ // <auto-nttp> ::= $ M <type> 1? <name>
+ // <auto-nttp> ::= $ M <type> H? <name> <number>
+ // <auto-nttp> ::= $ M <type> I? <name> <number> <number>
+ // <auto-nttp> ::= $ M <type> J? <name> <number> <number> <number>
MSInheritanceModel IM = RD->getMSInheritanceModel();
@@ -767,7 +792,17 @@ MicrosoftCXXNameMangler::mangleMemberFunctionPointer(const CXXRecordDecl *RD,
uint64_t VBTableOffset = 0;
uint64_t VBPtrOffset = 0;
if (MD) {
- Out << Prefix << Code << '?';
+ Out << Prefix;
+
+ if (getASTContext().getLangOpts().isCompatibleWithMSVC(
+ LangOptions::MSVC2019) &&
+ PD && PD->getType()->getTypeClass() == Type::Auto &&
+ !TemplateArgType.isNull()) {
+ Out << "M";
+ mangleType(TemplateArgType, SourceRange(), QMM_Drop);
+ }
+
+ Out << Code << '?';
if (MD->isVirtual()) {
MicrosoftVTableContext *VTContext =
cast<MicrosoftVTableContext>(getASTContext().getVTableContext());
@@ -859,7 +894,7 @@ void MicrosoftCXXNameMangler::mangleMemberFunctionPointerInClassNTTP(
if (!MD) {
if (RD->getMSInheritanceModel() != MSInheritanceModel::Single)
- return mangleMemberFunctionPointer(RD, MD, "");
+ return mangleMemberFunctionPointer(RD, MD, nullptr, QualType(), "");
Out << 'N';
return;
@@ -1732,12 +1767,15 @@ void MicrosoftCXXNameMangler::mangleTemplateArg(const TemplateDecl *TD,
if (isa<FieldDecl>(ND) || isa<IndirectFieldDecl>(ND)) {
mangleMemberDataPointer(cast<CXXRecordDecl>(ND->getDeclContext())
->getMostRecentNonInjectedDecl(),
- cast<ValueDecl>(ND));
+ cast<ValueDecl>(ND),
+ cast<NonTypeTemplateParmDecl>(Parm),
+ TA.getParamTypeForDecl());
} else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND)) {
const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD);
if (MD && MD->isInstance()) {
mangleMemberFunctionPointer(
- MD->getParent()->getMostRecentNonInjectedDecl(), MD);
+ MD->getParent()->getMostRecentNonInjectedDecl(), MD,
+ cast<NonTypeTemplateParmDecl>(Parm), TA.getParamTypeForDecl());
} else {
mangleFunctionPointer(FD, cast<NonTypeTemplateParmDecl>(Parm),
TA.getParamTypeForDecl());
@@ -1767,12 +1805,12 @@ void MicrosoftCXXNameMangler::mangleTemplateArg(const TemplateDecl *TD,
const CXXRecordDecl *RD = MPT->getMostRecentCXXRecordDecl();
if (MPT->isMemberFunctionPointerType() &&
!isa<FunctionTemplateDecl>(TD)) {
- mangleMemberFunctionPointer(RD, nullptr);
+ mangleMemberFunctionPointer(RD, nullptr, nullptr, QualType());
return;
}
if (MPT->isMemberDataPointer()) {
if (!isa<FunctionTemplateDecl>(TD)) {
- mangleMemberDataPointer(RD, nullptr);
+ mangleMemberDataPointer(RD, nullptr, nullptr, QualType());
return;
}
// nullptr data pointers are always represented with a single field
@@ -1979,9 +2017,10 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T,
cast_or_null<CXXMethodDecl>(D));
} else {
if (T->isMemberDataPointerType())
- mangleMemberDataPointer(RD, D, "");
+ mangleMemberDataPointer(RD, D, nullptr, QualType(), "");
else
- mangleMemberFunctionPointer(RD, cast_or_null<CXXMethodDecl>(D), "");
+ mangleMemberFunctionPointer(RD, cast_or_null<CXXMethodDecl>(D), nullptr,
+ QualType(), "");
}
return;
}
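
For context (illustrative, not taken from the patch): the new `M <type>` prefix is emitted when a template's non-type parameter is declared `auto` and the argument is a pointer to member, as exercised by the mangle-ms-auto-templates-memptrs.cpp test. A minimal shape of such code:

    struct S {
      int Field;
      void Method();
    };

    template <auto V> // 'auto' NTTP: the deduced member-pointer type is now
    struct Holder {}; // encoded into the MSVC 19.20+ compatible mangling

    Holder<&S::Field> HF;  // V deduced as int S::*
    Holder<&S::Method> HM; // V deduced as void (S::*)()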
diff --git a/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp b/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp
index cc252d5..c8d07f2 100644
--- a/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp
+++ b/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp
@@ -48,7 +48,7 @@ const DiagnosticBuilder &operator<<(const DiagnosticBuilder &DB,
Stream << PV.second.getAsString();
});
Stream << " ]";
- DB.AddString(Stream.str());
+ DB.AddString(PlatformAsString);
return DB;
}
@@ -91,7 +91,7 @@ const DiagnosticBuilder &operator<<(const DiagnosticBuilder &DB,
std::string VersionString;
raw_string_ostream OS(VersionString);
OS << Version;
- DB.AddString(OS.str());
+ DB.AddString(VersionString);
return DB;
}
@@ -102,7 +102,7 @@ operator<<(const clang::DiagnosticBuilder &DB,
raw_string_ostream OS(IFAsString);
OS << LibAttr.getKey() << " [ " << LibAttr.getValue() << " ]";
- DB.AddString(OS.str());
+ DB.AddString(IFAsString);
return DB;
}
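
All three hunks in this file are the same cleanup: llvm::raw_string_ostream writes straight through to its backing std::string (it is unbuffered in current LLVM), so the string can be handed to AddString() directly and the OS.str() call is redundant. A minimal sketch of the idiom, where consume() stands in for whatever takes the text:

    #include "llvm/Support/raw_ostream.h"
    #include <string>

    void emitVersion(void (*consume)(const std::string &)) {
      std::string Text;
      llvm::raw_string_ostream OS(Text);
      OS << "version " << 19 << '.' << 1;
      consume(Text); // Text already holds the streamed output
    }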
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 262a254..b3bfdac 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11201,6 +11201,10 @@ static bool CheckMultiVersionFirstFunction(Sema &S, FunctionDecl *FD) {
// otherwise it is treated as a normal function.
if (TA && !TA->isDefaultVersion())
return false;
+ // The target_version attribute only causes Multiversioning if this
+ // declaration is NOT the default version.
+ if (TVA && TVA->isDefaultVersion())
+ return false;
if ((TA || TVA) && CheckMultiVersionValue(S, FD)) {
FD->setInvalidDecl();
@@ -11234,18 +11238,16 @@ static void patchDefaultTargetVersion(FunctionDecl *From, FunctionDecl *To) {
if (MVKindTo == MultiVersionKind::None &&
(MVKindFrom == MultiVersionKind::TargetVersion ||
- MVKindFrom == MultiVersionKind::TargetClones)) {
- To->setIsMultiVersion();
+ MVKindFrom == MultiVersionKind::TargetClones))
To->addAttr(TargetVersionAttr::CreateImplicit(
To->getASTContext(), "default", To->getSourceRange()));
- }
}
-static bool CheckTargetCausesMultiVersioning(Sema &S, FunctionDecl *OldFD,
- FunctionDecl *NewFD,
- bool &Redeclaration,
- NamedDecl *&OldDecl,
- LookupResult &Previous) {
+static bool CheckDeclarationCausesMultiVersioning(Sema &S, FunctionDecl *OldFD,
+ FunctionDecl *NewFD,
+ bool &Redeclaration,
+ NamedDecl *&OldDecl,
+ LookupResult &Previous) {
assert(!OldFD->isMultiVersion() && "Unexpected MultiVersion");
// The definitions should be allowed in any order. If we have discovered
@@ -11256,13 +11258,16 @@ static bool CheckTargetCausesMultiVersioning(Sema &S, FunctionDecl *OldFD,
const auto *NewTA = NewFD->getAttr<TargetAttr>();
const auto *NewTVA = NewFD->getAttr<TargetVersionAttr>();
const auto *OldTA = OldFD->getAttr<TargetAttr>();
- const auto *OldTVA = OldFD->getAttr<TargetVersionAttr>();
+
// If the old decl is NOT MultiVersioned yet, and we don't cause that
// to change, this is a simple redeclaration.
- if ((NewTA && !NewTA->isDefaultVersion() &&
- (!OldTA || OldTA->getFeaturesStr() == NewTA->getFeaturesStr())) ||
- (NewTVA && !NewTVA->isDefaultVersion() &&
- (!OldTVA || OldTVA->getName() == NewTVA->getName())))
+ if (NewTA && !NewTA->isDefaultVersion() &&
+ (!OldTA || OldTA->getFeaturesStr() == NewTA->getFeaturesStr()))
+ return false;
+
+ // The target_version attribute only causes Multiversioning if this
+ // declaration is NOT the default version.
+ if (NewTVA && NewTVA->isDefaultVersion())
return false;
// Otherwise, this decl causes MultiVersioning.
@@ -11279,8 +11284,7 @@ static bool CheckTargetCausesMultiVersioning(Sema &S, FunctionDecl *OldFD,
}
// If this is 'default', permit the forward declaration.
- if ((NewTA && NewTA->isDefaultVersion() && !OldTA) ||
- (NewTVA && NewTVA->isDefaultVersion() && !OldTVA)) {
+ if (NewTA && NewTA->isDefaultVersion() && !OldTA) {
Redeclaration = true;
OldDecl = OldFD;
OldFD->setIsMultiVersion();
@@ -11312,22 +11316,6 @@ static bool CheckTargetCausesMultiVersioning(Sema &S, FunctionDecl *OldFD,
}
}
- if (NewTVA) {
- llvm::SmallVector<StringRef, 8> Feats;
- OldTVA->getFeatures(Feats);
- llvm::sort(Feats);
- llvm::SmallVector<StringRef, 8> NewFeats;
- NewTVA->getFeatures(NewFeats);
- llvm::sort(NewFeats);
-
- if (Feats == NewFeats) {
- S.Diag(NewFD->getLocation(), diag::err_multiversion_duplicate);
- S.Diag(OldFD->getLocation(), diag::note_previous_declaration);
- NewFD->setInvalidDecl();
- return true;
- }
- }
-
for (const auto *FD : OldFD->redecls()) {
const auto *CurTA = FD->getAttr<TargetAttr>();
const auto *CurTVA = FD->getAttr<TargetVersionAttr>();
@@ -11683,24 +11671,8 @@ static bool CheckMultiVersionFunction(Sema &S, FunctionDecl *NewFD,
FunctionDecl *OldFD = OldDecl->getAsFunction();
- if (!OldFD->isMultiVersion() && MVKind == MultiVersionKind::None) {
- if (NewTVA || !OldFD->getAttr<TargetVersionAttr>())
- return false;
- if (!NewFD->getType()->getAs<FunctionProtoType>()) {
- // Multiversion declaration doesn't have prototype.
- S.Diag(NewFD->getLocation(), diag::err_multiversion_noproto);
- NewFD->setInvalidDecl();
- } else {
- // No "target_version" attribute is equivalent to "default" attribute.
- NewFD->addAttr(TargetVersionAttr::CreateImplicit(
- S.Context, "default", NewFD->getSourceRange()));
- NewFD->setIsMultiVersion();
- OldFD->setIsMultiVersion();
- OldDecl = OldFD;
- Redeclaration = true;
- }
- return true;
- }
+ if (!OldFD->isMultiVersion() && MVKind == MultiVersionKind::None)
+ return false;
// Multiversioned redeclarations aren't allowed to omit the attribute, except
// for target_clones and target_version.
@@ -11717,8 +11689,8 @@ static bool CheckMultiVersionFunction(Sema &S, FunctionDecl *NewFD,
switch (MVKind) {
case MultiVersionKind::Target:
case MultiVersionKind::TargetVersion:
- return CheckTargetCausesMultiVersioning(S, OldFD, NewFD, Redeclaration,
- OldDecl, Previous);
+ return CheckDeclarationCausesMultiVersioning(
+ S, OldFD, NewFD, Redeclaration, OldDecl, Previous);
case MultiVersionKind::TargetClones:
if (OldFD->isUsed(false)) {
NewFD->setInvalidDecl();
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 6edf1fa..2f16d0f 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3010,12 +3010,10 @@ bool Sema::checkTargetAttr(SourceLocation LiteralLoc, StringRef AttrStr) {
}
bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
- StringRef &AttrStr, bool &isDefault) {
+ StringRef AttrStr) {
enum FirstParam { Unsupported };
enum SecondParam { None };
enum ThirdParam { Target, TargetClones, TargetVersion };
- if (AttrStr.trim() == "default")
- isDefault = true;
llvm::SmallVector<StringRef, 8> Features;
AttrStr.split(Features, "+");
for (auto &CurFeature : Features) {
@@ -3035,16 +3033,12 @@ bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
static void handleTargetVersionAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
StringRef Str;
SourceLocation LiteralLoc;
- bool isDefault = false;
if (!S.checkStringLiteralArgumentAttr(AL, 0, Str, &LiteralLoc) ||
- S.checkTargetVersionAttr(LiteralLoc, D, Str, isDefault))
+ S.checkTargetVersionAttr(LiteralLoc, D, Str))
return;
- // Do not create default only target_version attribute
- if (!isDefault) {
- TargetVersionAttr *NewAttr =
- ::new (S.Context) TargetVersionAttr(S.Context, AL, Str);
- D->addAttr(NewAttr);
- }
+ TargetVersionAttr *NewAttr =
+ ::new (S.Context) TargetVersionAttr(S.Context, AL, Str);
+ D->addAttr(NewAttr);
}
static void handleTargetAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
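
For context (an illustrative AArch64 snippet, not from the patch): with this change a declaration carrying `target_version("default")` keeps an explicit attribute instead of having it dropped, so the default and non-default versions of a multiversioned function are modeled uniformly:

    __attribute__((target_version("default"))) int which_copy(void) { return 0; }
    __attribute__((target_version("sve2"))) int which_copy(void) { return 1; }

    int caller(void) {
      return which_copy(); // selected at run time by the FMV resolver
    }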
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 6879a9a..07b3f79 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -2743,7 +2743,7 @@ Expr *
buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F,
TypeAliasTemplateDecl *AliasTemplate,
ArrayRef<DeducedTemplateArgument> DeduceResults,
- Expr *IsDeducible) {
+ unsigned FirstUndeducedParamIdx, Expr *IsDeducible) {
Expr *RC = F->getTemplateParameters()->getRequiresClause();
if (!RC)
return IsDeducible;
@@ -2803,8 +2803,22 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F,
for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) {
const auto &D = DeduceResults[Index];
- if (D.isNull())
+ if (D.isNull()) { // non-deduced template parameters of f
+ NamedDecl *TP = F->getTemplateParameters()->getParam(Index);
+ MultiLevelTemplateArgumentList Args;
+ Args.setKind(TemplateSubstitutionKind::Rewrite);
+ Args.addOuterTemplateArguments(TemplateArgsForBuildingRC);
+ // Rebuild the template parameter with updated depth and index.
+ NamedDecl *NewParam = transformTemplateParameter(
+ SemaRef, F->getDeclContext(), TP, Args,
+ /*NewIndex=*/FirstUndeducedParamIdx,
+ getTemplateParameterDepth(TP) + AdjustDepth);
+ FirstUndeducedParamIdx += 1;
+ assert(TemplateArgsForBuildingRC[Index].isNull());
+ TemplateArgsForBuildingRC[Index] = Context.getCanonicalTemplateArgument(
+ Context.getInjectedTemplateArg(NewParam));
continue;
+ }
TemplateArgumentLoc Input =
SemaRef.getTrivialTemplateArgumentLoc(D, QualType(), SourceLocation{});
TemplateArgumentLoc Output;
@@ -2820,8 +2834,8 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F,
MultiLevelTemplateArgumentList ArgsForBuildingRC;
ArgsForBuildingRC.setKind(clang::TemplateSubstitutionKind::Rewrite);
ArgsForBuildingRC.addOuterTemplateArguments(TemplateArgsForBuildingRC);
- // For 2), if the underlying F is instantiated from a member template, we need
- // the entire template argument list, as the constraint AST in the
+ // For 2), if the underlying deduction guide F is nested in a class template,
+ // we need the entire template argument list, as the constraint AST in the
// require-clause of F remains completely uninstantiated.
//
// For example:
@@ -2845,7 +2859,13 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F,
// We add the outer template arguments which is [int] to the multi-level arg
// list to ensure that the occurrence U in `C<U>` will be replaced with int
// during the substitution.
- if (F->getInstantiatedFromMemberTemplate()) {
+ //
+ // NOTE: The underlying deduction guide F is instantiated -- either from an
+ // explicitly-written deduction guide member, or from a constructor.
+ // getInstantiatedFromMemberTemplate() can only handle the former case, so we
+ // check the DeclContext kind.
+ if (F->getLexicalDeclContext()->getDeclKind() ==
+ clang::Decl::ClassTemplateSpecialization) {
auto OuterLevelArgs = SemaRef.getTemplateInstantiationArgs(
F, F->getLexicalDeclContext(),
/*Final=*/false, /*Innermost=*/std::nullopt,
@@ -3063,6 +3083,7 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
Context.getInjectedTemplateArg(NewParam));
TransformedDeducedAliasArgs[AliasTemplateParamIdx] = NewTemplateArgument;
}
+ unsigned FirstUndeducedParamIdx = FPrimeTemplateParams.size();
// ...followed by the template parameters of f that were not deduced
// (including their default template arguments)
for (unsigned FTemplateParamIdx : NonDeducedTemplateParamsInFIndex) {
@@ -3131,8 +3152,9 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
Expr *IsDeducible = buildIsDeducibleConstraint(
SemaRef, AliasTemplate, FPrime->getReturnType(), FPrimeTemplateParams);
- Expr *RequiresClause = buildAssociatedConstraints(
- SemaRef, F, AliasTemplate, DeduceResults, IsDeducible);
+ Expr *RequiresClause =
+ buildAssociatedConstraints(SemaRef, F, AliasTemplate, DeduceResults,
+ FirstUndeducedParamIdx, IsDeducible);
auto *FPrimeTemplateParamList = TemplateParameterList::Create(
Context, AliasTemplate->getTemplateParameters()->getTemplateLoc(),
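Not part of the patch -- a compact illustration (type names are made up; the GH92596/GH92212 tests below cover the same ground, compiled with -std=c++20) of why FirstUndeducedParamIdx is threaded through: the constrained constructor parameter V is not deduced from the alias arguments, so its depth/index must be remapped when the requires-clause is rebuilt for the alias deduction guide.

template <class T0>
struct Outer {
  template <class T1, class T2>
  struct Inner {
    // V is a non-deduced parameter of the underlying deduction guide; its
    // depth/index has to be remapped into the guide synthesized for the alias.
    template <class V> requires __is_same(V, T0)
    Inner(V, T1);
  };
};

template <class T3>
using AInner = Outer<int>::Inner<T3, T3>;

AInner a{0, 1}; // CTAD through the alias exercises the rebuilt requires-clause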
diff --git a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
index 238e87a..8dd08f1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "InterCheckerAPI.h"
+#include "clang/AST/OperationKinds.h"
#include "clang/Basic/Builtins.h"
#include "clang/Basic/CharInfo.h"
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
@@ -22,10 +23,13 @@
#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h"
+#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
#include <functional>
#include <optional>
@@ -304,6 +308,10 @@ public:
// Re-usable checks
ProgramStateRef checkNonNull(CheckerContext &C, ProgramStateRef State,
AnyArgExpr Arg, SVal l) const;
+ // Check whether the origin region behind \p Element (e.g. the actual array
+ // region that \p Element is part of) is initialized.
+ ProgramStateRef checkInit(CheckerContext &C, ProgramStateRef state,
+ AnyArgExpr Buffer, SVal Element, SVal Size) const;
ProgramStateRef CheckLocation(CheckerContext &C, ProgramStateRef state,
AnyArgExpr Buffer, SVal Element,
AccessKind Access,
@@ -329,7 +337,7 @@ public:
const Stmt *S, StringRef WarningMsg) const;
void emitAdditionOverflowBug(CheckerContext &C, ProgramStateRef State) const;
void emitUninitializedReadBug(CheckerContext &C, ProgramStateRef State,
- const Expr *E) const;
+ const Expr *E, StringRef Msg) const;
ProgramStateRef checkAdditionOverflow(CheckerContext &C,
ProgramStateRef state,
NonLoc left,
@@ -351,16 +359,16 @@ REGISTER_MAP_WITH_PROGRAMSTATE(CStringLength, const MemRegion *, SVal)
// Individual checks and utility methods.
//===----------------------------------------------------------------------===//
-std::pair<ProgramStateRef , ProgramStateRef >
-CStringChecker::assumeZero(CheckerContext &C, ProgramStateRef state, SVal V,
+std::pair<ProgramStateRef, ProgramStateRef>
+CStringChecker::assumeZero(CheckerContext &C, ProgramStateRef State, SVal V,
QualType Ty) {
std::optional<DefinedSVal> val = V.getAs<DefinedSVal>();
if (!val)
- return std::pair<ProgramStateRef , ProgramStateRef >(state, state);
+ return std::pair<ProgramStateRef, ProgramStateRef>(State, State);
SValBuilder &svalBuilder = C.getSValBuilder();
DefinedOrUnknownSVal zero = svalBuilder.makeZeroVal(Ty);
- return state->assume(svalBuilder.evalEQ(state, *val, zero));
+ return State->assume(svalBuilder.evalEQ(State, *val, zero));
}
ProgramStateRef CStringChecker::checkNonNull(CheckerContext &C,
@@ -393,6 +401,149 @@ ProgramStateRef CStringChecker::checkNonNull(CheckerContext &C,
return stateNonNull;
}
+static std::optional<NonLoc> getIndex(ProgramStateRef State,
+ const ElementRegion *ER, CharKind CK) {
+ SValBuilder &SVB = State->getStateManager().getSValBuilder();
+ ASTContext &Ctx = SVB.getContext();
+
+ if (CK == CharKind::Regular) {
+ if (ER->getValueType() != Ctx.CharTy)
+ return {};
+ return ER->getIndex();
+ }
+
+ if (ER->getValueType() != Ctx.WideCharTy)
+ return {};
+
+ QualType SizeTy = Ctx.getSizeType();
+ NonLoc WideSize =
+ SVB.makeIntVal(Ctx.getTypeSizeInChars(Ctx.WideCharTy).getQuantity(),
+ SizeTy)
+ .castAs<NonLoc>();
+ SVal Offset =
+ SVB.evalBinOpNN(State, BO_Mul, ER->getIndex(), WideSize, SizeTy);
+ if (Offset.isUnknown())
+ return {};
+ return Offset.castAs<NonLoc>();
+}
+
+// Basically 1 -> 1st, 12 -> 12th, etc.
+static void printIdxWithOrdinalSuffix(llvm::raw_ostream &Os, unsigned Idx) {
+ Os << Idx << llvm::getOrdinalSuffix(Idx);
+}
+
+ProgramStateRef CStringChecker::checkInit(CheckerContext &C,
+ ProgramStateRef State,
+ AnyArgExpr Buffer, SVal Element,
+ SVal Size) const {
+
+ // If a previous check has failed, propagate the failure.
+ if (!State)
+ return nullptr;
+
+ const MemRegion *R = Element.getAsRegion();
+ const auto *ER = dyn_cast_or_null<ElementRegion>(R);
+ if (!ER)
+ return State;
+
+ const auto *SuperR = ER->getSuperRegion()->getAs<TypedValueRegion>();
+ if (!SuperR)
+ return State;
+
+ // FIXME: We ought to be able to check objects as well. Maybe
+ // UninitializedObjectChecker could help?
+ if (!SuperR->getValueType()->isArrayType())
+ return State;
+
+ SValBuilder &SVB = C.getSValBuilder();
+ ASTContext &Ctx = SVB.getContext();
+
+ const QualType ElemTy = Ctx.getBaseElementType(SuperR->getValueType());
+ const NonLoc Zero = SVB.makeZeroArrayIndex();
+
+ std::optional<Loc> FirstElementVal =
+ State->getLValue(ElemTy, Zero, loc::MemRegionVal(SuperR)).getAs<Loc>();
+ if (!FirstElementVal)
+ return State;
+
+ // Ensure that we wouldn't read an uninitialized value.
+ if (Filter.CheckCStringUninitializedRead &&
+ State->getSVal(*FirstElementVal).isUndef()) {
+ llvm::SmallString<258> Buf;
+ llvm::raw_svector_ostream OS(Buf);
+ OS << "The first element of the ";
+ printIdxWithOrdinalSuffix(OS, Buffer.ArgumentIndex + 1);
+ OS << " argument is undefined";
+ emitUninitializedReadBug(C, State, Buffer.Expression, OS.str());
+ return nullptr;
+ }
+
+ // We won't check whether the entire region is fully initialized -- let's just
+ // check that the first and the last elements are. So, on to checking the last
+ // element:
+ const QualType IdxTy = SVB.getArrayIndexType();
+
+ NonLoc ElemSize =
+ SVB.makeIntVal(Ctx.getTypeSizeInChars(ElemTy).getQuantity(), IdxTy)
+ .castAs<NonLoc>();
+
+ // FIXME: Check that the size arg to the cstring function is divisible by the
+ // size of the actual element type?
+
+ // The type of the argument to the cstring function is either char or wchar,
+ // but that's not the type of the original array (or memory region).
+ // Suppose the following:
+ //   int t[5];
+ //   memcpy(dst, t, sizeof(t));
+ // When checking whether t is fully initialized, we see it as a char array of
+ // size sizeof(int)*5. If we check the last element as a character, we read
+ // the last byte of an integer, which will be undefined. But just because
+ // that byte is undefined, it doesn't mean that the element is uninitialized!
+ // For this reason, we need to retrieve the actual last element with the
+ // correct type.
+
+ // Divide the size argument to the cstring function by the size of the actual
+ // element type. This value is the number of elements in the array, i.e. the
+ // index of the past-the-end element.
+ std::optional<NonLoc> Offset =
+ SVB.evalBinOpNN(State, clang::BO_Div, Size.castAs<NonLoc>(), ElemSize,
+ IdxTy)
+ .getAs<NonLoc>();
+
+ if (!Offset)
+ return State;
+
+ // Retrieve the index of the last element.
+ const NonLoc One = SVB.makeIntVal(1, IdxTy).castAs<NonLoc>();
+ SVal LastIdx = SVB.evalBinOpNN(State, BO_Sub, *Offset, One, IdxTy);
+
+ SVal LastElementVal =
+ State->getLValue(ElemTy, LastIdx, loc::MemRegionVal(SuperR));
+ if (!isa<Loc>(LastElementVal))
+ return State;
+
+ if (Filter.CheckCStringUninitializedRead &&
+ State->getSVal(LastElementVal.castAs<Loc>()).isUndef()) {
+ const llvm::APSInt *IdxInt = LastIdx.getAsInteger();
+ // If we can't emit a sensible last-element index, just bail out --
+ // prefer emitting nothing over emitting garbage-quality reports.
+ if (!IdxInt) {
+ C.addSink();
+ return nullptr;
+ }
+ llvm::SmallString<258> Buf;
+ llvm::raw_svector_ostream OS(Buf);
+ OS << "The last accessed element (at index ";
+ OS << IdxInt->getExtValue();
+ OS << ") in the ";
+ printIdxWithOrdinalSuffix(OS, Buffer.ArgumentIndex + 1);
+ OS << " argument is undefined";
+ emitUninitializedReadBug(C, State, Buffer.Expression, OS.str());
+ return nullptr;
+ }
+ return State;
+}
+
// FIXME: This was originally copied from ArrayBoundChecker.cpp. Refactor?
ProgramStateRef CStringChecker::CheckLocation(CheckerContext &C,
ProgramStateRef state,
@@ -413,38 +564,17 @@ ProgramStateRef CStringChecker::CheckLocation(CheckerContext &C,
if (!ER)
return state;
- SValBuilder &svalBuilder = C.getSValBuilder();
- ASTContext &Ctx = svalBuilder.getContext();
-
// Get the index of the accessed element.
- NonLoc Idx = ER->getIndex();
-
- if (CK == CharKind::Regular) {
- if (ER->getValueType() != Ctx.CharTy)
- return state;
- } else {
- if (ER->getValueType() != Ctx.WideCharTy)
- return state;
-
- QualType SizeTy = Ctx.getSizeType();
- NonLoc WideSize =
- svalBuilder
- .makeIntVal(Ctx.getTypeSizeInChars(Ctx.WideCharTy).getQuantity(),
- SizeTy)
- .castAs<NonLoc>();
- SVal Offset = svalBuilder.evalBinOpNN(state, BO_Mul, Idx, WideSize, SizeTy);
- if (Offset.isUnknown())
- return state;
- Idx = Offset.castAs<NonLoc>();
- }
+ std::optional<NonLoc> Idx = getIndex(state, ER, CK);
+ if (!Idx)
+ return state;
// Get the size of the array.
const auto *superReg = cast<SubRegion>(ER->getSuperRegion());
DefinedOrUnknownSVal Size =
getDynamicExtent(state, superReg, C.getSValBuilder());
- ProgramStateRef StInBound, StOutBound;
- std::tie(StInBound, StOutBound) = state->assumeInBoundDual(Idx, Size);
+ auto [StInBound, StOutBound] = state->assumeInBoundDual(*Idx, Size);
if (StOutBound && !StInBound) {
// These checks are either enabled by the CString out-of-bounds checker
// explicitly or implicitly by the Malloc checker.
@@ -459,15 +589,6 @@ ProgramStateRef CStringChecker::CheckLocation(CheckerContext &C,
return nullptr;
}
- // Ensure that we wouldn't read uninitialized value.
- if (Access == AccessKind::read) {
- if (Filter.CheckCStringUninitializedRead &&
- StInBound->getSVal(ER).isUndef()) {
- emitUninitializedReadBug(C, StInBound, Buffer.Expression);
- return nullptr;
- }
- }
-
// Array bound check succeeded. From this point forward the array bound
// should always succeed.
return StInBound;
@@ -502,6 +623,7 @@ CStringChecker::CheckBufferAccess(CheckerContext &C, ProgramStateRef State,
// Check if the first byte of the buffer is accessible.
State = CheckLocation(C, State, Buffer, BufStart, Access, CK);
+
if (!State)
return nullptr;
@@ -526,6 +648,8 @@ CStringChecker::CheckBufferAccess(CheckerContext &C, ProgramStateRef State,
SVal BufEnd =
svalBuilder.evalBinOpLN(State, BO_Add, *BufLoc, LastOffset, PtrTy);
State = CheckLocation(C, State, Buffer, BufEnd, Access, CK);
+ if (Access == AccessKind::read)
+ State = checkInit(C, State, Buffer, BufEnd, *Length);
// If the buffer isn't large enough, abort.
if (!State)
@@ -694,16 +818,17 @@ void CStringChecker::emitNullArgBug(CheckerContext &C, ProgramStateRef State,
void CStringChecker::emitUninitializedReadBug(CheckerContext &C,
ProgramStateRef State,
- const Expr *E) const {
+ const Expr *E,
+ StringRef Msg) const {
if (ExplodedNode *N = C.generateErrorNode(State)) {
- const char *Msg =
- "Bytes string function accesses uninitialized/garbage values";
if (!BT_UninitRead)
      BT_UninitRead.reset(new BugType(Filter.CheckNameCStringUninitializedRead,
                                      "Accessing uninitialized/garbage values"));
auto Report =
std::make_unique<PathSensitiveBugReport>(*BT_UninitRead, Msg, N);
+ Report->addNote("Other elements might also be undefined",
+ Report->getLocation());
Report->addRange(E->getSourceRange());
bugreporter::trackExpressionValue(N, E, *Report);
C.emitReport(std::move(Report));
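Not part of the patch -- a small reproducer (file and function names are illustrative) for the refined diagnostics added above. With the checkers from the RUN line of bstring_UninitRead.c below (core,alpha.unix.cstring) enabled, the analyzer now points at the concrete element that is undefined instead of the generic "Bytes string function accesses uninitialized/garbage values" message.

#include <string.h>

void copy_partially_initialized(char *dst) {
  char buf[8];
  buf[0] = 'a';                  // only the first element is written
  memcpy(dst, buf, sizeof(buf)); // expected: "The last accessed element (at index 7)
                                 //            in the 2nd argument is undefined"
}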
diff --git a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp
index 6fe929b..693791c 100644
--- a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp
+++ b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp
@@ -630,6 +630,17 @@ bool MemRegion::canPrintPrettyAsExpr() const {
return false;
}
+StringRef MemRegion::getKindStr() const {
+ switch (getKind()) {
+#define REGION(Id, Parent) \
+ case Id##Kind: \
+ return #Id;
+#include "clang/StaticAnalyzer/Core/PathSensitive/Regions.def"
+#undef REGION
+ }
+ llvm_unreachable("Unknown kind!");
+}
+
void MemRegion::printPretty(raw_ostream &os) const {
assert(canPrintPretty() && "This region cannot be printed pretty.");
os << "'";
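Not part of the patch -- a self-contained sketch of the X-macro pattern the new getKindStr() relies on (the Color names and list here are made up for illustration): every REGION(Id, Parent) entry in Regions.def expands into a case that stringifies the corresponding kind, so the switch stays in sync with the region list automatically.

#include <cstdio>

// Stand-in for Regions.def: in the real code these entries live in a separate
// .def file and are pulled in with #include after defining the macro.
#define COLOR_LIST \
  COLOR(Red)       \
  COLOR(Green)     \
  COLOR(Blue)

enum class ColorKind {
#define COLOR(Id) Id,
  COLOR_LIST
#undef COLOR
};

const char *getColorStr(ColorKind K) {
  switch (K) {
#define COLOR(Id)     \
  case ColorKind::Id: \
    return #Id;
    COLOR_LIST
#undef COLOR
  }
  return "Unknown";
}

int main() { std::printf("%s\n", getColorStr(ColorKind::Green)); } // prints "Green"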
diff --git a/clang/test/AST/ast-dump-ctad-alias.cpp b/clang/test/AST/ast-dump-ctad-alias.cpp
index a4b6f06..6f07a62 100644
--- a/clang/test/AST/ast-dump-ctad-alias.cpp
+++ b/clang/test/AST/ast-dump-ctad-alias.cpp
@@ -53,6 +53,33 @@ Out2<double>::AInner t(1.0);
// CHECK-NEXT: | | `-BuiltinType {{.*}} 'double'
// CHECK-NEXT: | `-ParmVarDecl {{.*}} 'double'
+// GH92596
+template <typename T0>
+struct Out3 {
+ template<class T1, typename T2>
+ struct Foo {
+ // Deduction guide:
+ // template <typename T1, typename T2, typename V>
+ // Foo(V, T1) -> Foo<T1, T2>;
+ template<class V> requires Concept<T0, V> // V in the requires-clause of Foo's deduction guide: depth 1, index 2
+ Foo(V, T1);
+ };
+};
+template<class T3>
+using AFoo3 = Out3<int>::Foo<T3, T3>;
+AFoo3 afoo3{0, 1};
+// Verify that the occurrence of V in the requires-clause is transformed correctly (depth: 1 => 0, index: 2 => 1).
+
+// CHECK: FunctionTemplateDecl {{.*}} implicit <deduction guide for AFoo3>
+// CHECK-NEXT: |-TemplateTypeParmDecl {{.*}} class depth 0 index 0 T3
+// CHECK-NEXT: |-TemplateTypeParmDecl {{.*}} class depth 0 index 1 V
+// CHECK-NEXT: |-BinaryOperator {{.*}} '<dependent type>' '&&'
+// CHECK-NEXT: | |-UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'Concept'
+// CHECK-NEXT: | | |-TemplateArgument type 'int'
+// CHECK-NEXT: | | | `-BuiltinType {{.*}} 'int'
+// CHECK-NEXT: | | `-TemplateArgument type 'type-parameter-0-1'
+// CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent depth 0 index 1
+
template <typename... T1>
struct Foo {
Foo(T1...);
diff --git a/clang/test/Analysis/bstring_UninitRead.c b/clang/test/Analysis/bstring_UninitRead.c
index c535e01..45e38dd 100644
--- a/clang/test/Analysis/bstring_UninitRead.c
+++ b/clang/test/Analysis/bstring_UninitRead.c
@@ -1,12 +1,11 @@
// RUN: %clang_analyze_cc1 -verify %s \
// RUN: -analyzer-checker=core,alpha.unix.cstring
-
-// This file is generally for the alpha.unix.cstring.UninitializedRead Checker, the reason for putting it into
-// the separate file because the checker is break the some existing test cases in bstring.c file , so we don't
-// wanna mess up with some existing test case so it's better to create separate file for it, this file also include
-// the broken test for the reference in future about the broken tests.
-
+//===----------------------------------------------------------------------===//
+// memcpy() using character arrays. This is the easiest case, as memcpy
+// interprets the dst and src buffers as character arrays (regardless of their
+// actual type).
+//===----------------------------------------------------------------------===//
typedef typeof(sizeof(int)) size_t;
@@ -14,46 +13,120 @@ void clang_analyzer_eval(int);
void *memcpy(void *restrict s1, const void *restrict s2, size_t n);
-void top(char *dst) {
+void memcpy_array_fully_uninit(char *dst) {
+ char buf[10];
+ memcpy(dst, buf, 10); // expected-warning{{The first element of the 2nd argument is undefined}}
+ // expected-note@-1{{Other elements might also be undefined}}
+ (void)buf;
+}
+
+void memcpy_array_partially_uninit(char *dst) {
char buf[10];
- memcpy(dst, buf, 10); // expected-warning{{Bytes string function accesses uninitialized/garbage values}}
+ buf[0] = 'i';
+ memcpy(dst, buf, 10); // expected-warning{{The last accessed element (at index 9) in the 2nd argument is undefined}}
+ // expected-note@-1{{Other elements might also be undefined}}
+ (void)buf;
+}
+
+void memcpy_array_only_init_portion(char *dst) {
+ char buf[10];
+ buf[0] = 'i';
+ memcpy(dst, buf, 1);
+ (void)buf;
+}
+
+void memcpy_array_partially_init_error(char *dst) {
+ char buf[10];
+ buf[0] = 'i';
+ memcpy(dst, buf, 2); // expected-warning{{The last accessed element (at index 1) in the 2nd argument is undefined}}
+ // expected-note@-1{{Other elements might also be undefined}}
+ (void)buf;
+}
+
+// The interesting case here is that the portion we're copying is initialized,
+// but not the whole matrix. We need to be careful to extract buf[1], and not
+// buf when trying to peel region layers off from the source argument.
+void memcpy_array_from_matrix(char *dst) {
+ char buf[2][2];
+ buf[1][0] = 'i';
+ buf[1][1] = 'j';
+ // FIXME: This is a FP -- we mistakenly retrieve the first element of buf,
+ // instead of the first element of buf[1]. getLValueElement simply peels off
+ // another ElementRegion layer, when in this case it really shouldn't.
+ memcpy(dst, buf[1], 2); // expected-warning{{The first element of the 2nd argument is undefined}}
+ // expected-note@-1{{Other elements might also be undefined}}
(void)buf;
}
-//===----------------------------------------------------------------------===
-// mempcpy()
-//===----------------------------------------------------------------------===
+//===----------------------------------------------------------------------===//
+// mempcpy() using non-character arrays.
+//===----------------------------------------------------------------------===//
void *mempcpy(void *restrict s1, const void *restrict s2, size_t n);
-void mempcpy14() {
+void memcpy_int_array_fully_init() {
int src[] = {1, 2, 3, 4};
int dst[5] = {0};
int *p;
- p = mempcpy(dst, src, 4 * sizeof(int)); // expected-warning{{Bytes string function accesses uninitialized/garbage values}}
- // FIXME: This behaviour is actually surprising and needs to be fixed,
- // mempcpy seems to consider the very last byte of the src buffer uninitialized
- // and returning undef unfortunately. It should have returned unknown or a conjured value instead.
+ p = mempcpy(dst, src, 4 * sizeof(int));
+ clang_analyzer_eval(p == &dst[4]);
+}
- clang_analyzer_eval(p == &dst[4]); // no-warning (above is fatal)
+void memcpy_int_array_fully_init2(int *dest) {
+ int t[] = {1, 2, 3};
+ memcpy(dest, t, sizeof(t));
}
+//===----------------------------------------------------------------------===//
+// mempcpy() using non-array objects.
+//===----------------------------------------------------------------------===//
+
struct st {
int i;
int j;
};
-
-void mempcpy15() {
+void mempcpy_struct_partially_uninit() {
struct st s1 = {0};
struct st s2;
struct st *p1;
struct st *p2;
p1 = (&s2) + 1;
- p2 = mempcpy(&s2, &s1, sizeof(struct st)); // expected-warning{{Bytes string function accesses uninitialized/garbage values}}
- // FIXME: It seems same as mempcpy14() case.
-
- clang_analyzer_eval(p1 == p2); // no-warning (above is fatal)
+
+ // FIXME: Maybe ask UninitializedObjectChecker whether s1 is fully
+ // initialized?
+ p2 = mempcpy(&s2, &s1, sizeof(struct st));
+
+ clang_analyzer_eval(p1 == p2);
+}
+
+void mempcpy_struct_fully_uninit() {
+ struct st s1;
+ struct st s2;
+
+ // FIXME: Maybe ask UninitializedObjectChecker whether s1 is fully
+ // initialized?
+ mempcpy(&s2, &s1, sizeof(struct st));
+}
+
+// Creduced crash. In this case, a SymbolicRegion is wrapped in an
+// ElementRegion for the src argument.
+void *ga_copy_strings_from_0;
+void *memmove();
+void alloc();
+void ga_copy_strings() {
+ int i = 0;
+ for (;; ++i)
+ memmove(alloc, ((char **)ga_copy_strings_from_0)[i], 1);
+}
+
+// Creduced crash. In this case, retrieving the Loc for the first element failed.
+char mov_mdhd_language_map[][4] = {};
+int ff_mov_lang_to_iso639_code;
+char *ff_mov_lang_to_iso639_to;
+void ff_mov_lang_to_iso639() {
+ memcpy(ff_mov_lang_to_iso639_to,
+ mov_mdhd_language_map[ff_mov_lang_to_iso639_code], 4);
}
diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c
index 024aaff..4edfc54 100644
--- a/clang/test/CodeGen/attr-target-version.c
+++ b/clang/test/CodeGen/attr-target-version.c
@@ -428,13 +428,6 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
//
//
// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@fmv_default
-// CHECK-SAME: () #[[ATTR11]] {
-// CHECK-NEXT: entry:
-// CHECK-NEXT: ret i32 111
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
// CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs
// CHECK-SAME: () #[[ATTR11]] {
// CHECK-NEXT: entry:
@@ -637,22 +630,18 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
//
//
// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@recur
+// CHECK-LABEL: define {{[^@]+}}@fmv_default
// CHECK-SAME: () #[[ATTR11]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: call void @reca()
-// CHECK-NEXT: ret void
+// CHECK-NEXT: ret i32 111
//
//
// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@main
+// CHECK-LABEL: define {{[^@]+}}@recur
// CHECK-SAME: () #[[ATTR11]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
-// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4
-// CHECK-NEXT: call void @recur()
-// CHECK-NEXT: [[CALL:%.*]] = call i32 @goo()
-// CHECK-NEXT: ret i32 [[CALL]]
+// CHECK-NEXT: call void @reca()
+// CHECK-NEXT: ret void
//
//
// CHECK: Function Attrs: noinline nounwind optnone
@@ -818,6 +807,17 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
//
//
// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@main
+// CHECK-SAME: () #[[ATTR11]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT: call void @recur()
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @goo()
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf64mmMpmullMsha1
// CHECK-SAME: () #[[ATTR22:[0-9]+]] {
// CHECK-NEXT: entry:
@@ -1020,20 +1020,6 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
//
//
// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
-// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_default
-// CHECK-NOFMV-SAME: () #[[ATTR0]] {
-// CHECK-NOFMV-NEXT: entry:
-// CHECK-NOFMV-NEXT: ret i32 111
-//
-//
-// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
-// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_c
-// CHECK-NOFMV-SAME: () #[[ATTR0]] {
-// CHECK-NOFMV-NEXT: entry:
-// CHECK-NOFMV-NEXT: ret void
-//
-//
-// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
// CHECK-NOFMV-LABEL: define {{[^@]+}}@goo
// CHECK-NOFMV-SAME: () #[[ATTR0]] {
// CHECK-NOFMV-NEXT: entry:
@@ -1053,22 +1039,25 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
//
//
// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
-// CHECK-NOFMV-LABEL: define {{[^@]+}}@recur
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_c
// CHECK-NOFMV-SAME: () #[[ATTR0]] {
// CHECK-NOFMV-NEXT: entry:
-// CHECK-NOFMV-NEXT: call void @reca()
// CHECK-NOFMV-NEXT: ret void
//
//
// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
-// CHECK-NOFMV-LABEL: define {{[^@]+}}@main
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_default
// CHECK-NOFMV-SAME: () #[[ATTR0]] {
// CHECK-NOFMV-NEXT: entry:
-// CHECK-NOFMV-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
-// CHECK-NOFMV-NEXT: store i32 0, ptr [[RETVAL]], align 4
-// CHECK-NOFMV-NEXT: call void @recur()
-// CHECK-NOFMV-NEXT: [[CALL:%.*]] = call i32 @goo()
-// CHECK-NOFMV-NEXT: ret i32 [[CALL]]
+// CHECK-NOFMV-NEXT: ret i32 111
+//
+//
+// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@recur
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
+// CHECK-NOFMV-NEXT: entry:
+// CHECK-NOFMV-NEXT: call void @reca()
+// CHECK-NOFMV-NEXT: ret void
//
//
// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
@@ -1089,31 +1078,42 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
//
//
// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
-// CHECK-NOFMV-LABEL: define {{[^@]+}}@unused_with_default_def
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@unused_with_implicit_default_def
// CHECK-NOFMV-SAME: () #[[ATTR0]] {
// CHECK-NOFMV-NEXT: entry:
// CHECK-NOFMV-NEXT: ret i32 1
//
//
// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
-// CHECK-NOFMV-LABEL: define {{[^@]+}}@unused_with_implicit_default_def
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def
// CHECK-NOFMV-SAME: () #[[ATTR0]] {
// CHECK-NOFMV-NEXT: entry:
-// CHECK-NOFMV-NEXT: ret i32 1
+// CHECK-NOFMV-NEXT: ret i32 0
//
//
// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
-// CHECK-NOFMV-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@default_def_with_version_decls
// CHECK-NOFMV-SAME: () #[[ATTR0]] {
// CHECK-NOFMV-NEXT: entry:
// CHECK-NOFMV-NEXT: ret i32 0
//
//
// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
-// CHECK-NOFMV-LABEL: define {{[^@]+}}@default_def_with_version_decls
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@main
// CHECK-NOFMV-SAME: () #[[ATTR0]] {
// CHECK-NOFMV-NEXT: entry:
-// CHECK-NOFMV-NEXT: ret i32 0
+// CHECK-NOFMV-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NOFMV-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NOFMV-NEXT: call void @recur()
+// CHECK-NOFMV-NEXT: [[CALL:%.*]] = call i32 @goo()
+// CHECK-NOFMV-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
+// CHECK-NOFMV-LABEL: define {{[^@]+}}@unused_with_default_def
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
+// CHECK-NOFMV-NEXT: entry:
+// CHECK-NOFMV-NEXT: ret i32 1
//
//.
// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp-armv8,+fp16fml,+fullfp16,+neon,+rand,-v9.5a" }
diff --git a/clang/test/CodeGenCXX/fmv-namespace.cpp b/clang/test/CodeGenCXX/fmv-namespace.cpp
index 193f01d..abfff1a 100644
--- a/clang/test/CodeGenCXX/fmv-namespace.cpp
+++ b/clang/test/CodeGenCXX/fmv-namespace.cpp
@@ -17,25 +17,26 @@ int __attribute((target_version("sve"))) foo() { return 2; }
int baz() { return OtherName::foo(); }
+namespace Foo {
+int bar();
+__attribute((target_version("default"))) int bar() { return 0; }
+__attribute((target_version("mops"))) int bar() { return 1; }
+}
+
//.
// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
// CHECK: @_ZN4Name3fooEv = weak_odr ifunc i32 (), ptr @_ZN4Name3fooEv.resolver
// CHECK: @_ZN9OtherName3fooEv = weak_odr ifunc i32 (), ptr @_ZN9OtherName3fooEv.resolver
+// CHECK: @_ZN3Foo3barEv = weak_odr ifunc i32 (), ptr @_ZN3Foo3barEv.resolver
//.
-// CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv.default(
-// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: ret i32 0
-//
-//
// CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv._Msve(
-// CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: ret i32 1
//
//
// CHECK-LABEL: define dso_local noundef i32 @_Z3barv(
-// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN4Name3fooEv()
// CHECK-NEXT: ret i32 [[CALL]]
@@ -56,13 +57,13 @@ int baz() { return OtherName::foo(); }
//
//
// CHECK-LABEL: define dso_local noundef i32 @_ZN9OtherName3fooEv._Msve(
-// CHECK-SAME: ) #[[ATTR1]] {
+// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: ret i32 2
//
//
// CHECK-LABEL: define dso_local noundef i32 @_Z3bazv(
-// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-SAME: ) #[[ATTR1]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN9OtherName3fooEv()
// CHECK-NEXT: ret i32 [[CALL]]
@@ -81,10 +82,43 @@ int baz() { return OtherName::foo(); }
// CHECK: [[RESOLVER_ELSE]]:
// CHECK-NEXT: ret ptr @_ZN9OtherName3fooEv.default
//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_ZN3Foo3barEv.default(
+// CHECK-SAME: ) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i32 0
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_ZN3Foo3barEv._Mmops(
+// CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i32 1
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv.default(
+// CHECK-SAME: ) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i32 0
+//
+//
+// CHECK-LABEL: define weak_odr ptr @_ZN3Foo3barEv.resolver() comdat {
+// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 576460752303423488
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 576460752303423488
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK: [[RESOLVER_RETURN]]:
+// CHECK-NEXT: ret ptr @_ZN3Foo3barEv._Mmops
+// CHECK: [[RESOLVER_ELSE]]:
+// CHECK-NEXT: ret ptr @_ZN3Foo3barEv.default
+//
//.
-// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
-// CHECK: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR2]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" }
+// CHECK: attributes #[[ATTR3:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
//.
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp
new file mode 100644
index 0000000..360ebde
--- /dev/null
+++ b/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp
@@ -0,0 +1,71 @@
+// RUN: %clang_cc1 -std=c++17 -fms-compatibility-version=19.20 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-windows-msvc | FileCheck --check-prefix=AFTER %s
+// RUN: %clang_cc1 -std=c++17 -fms-compatibility-version=19.14 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-windows-msvc | FileCheck --check-prefix=BEFORE %s
+
+template <auto a>
+class AutoParmTemplate {
+public:
+ AutoParmTemplate() {}
+};
+
+template <auto a>
+auto AutoFunc() {
+ return a;
+}
+
+struct A {};
+struct B {};
+
+struct S { int a; void f(); virtual void g(); };
+struct M : A, B { int a; void f(); virtual void g(); };
+struct V : virtual A { int a; void f(); virtual void g(); };
+
+void template_mangling() {
+
+ AutoParmTemplate<&S::f> auto_method_single_inheritance;
+ // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$MP8S@@EAAXXZ1?f@1@QEAAXXZ@@QEAA@XZ"
+ // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$1?f@S@@QEAAXXZ@@QEAA@XZ"
+
+ AutoParmTemplate<&M::f> auto_method_multiple_inheritance;
+ // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$MP8M@@EAAXXZH?f@1@QEAAXXZA@@@QEAA@XZ"
+ // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$H?f@M@@QEAAXXZA@@@QEAA@XZ"
+
+ AutoParmTemplate<&V::f> auto_method_virtual_inheritance;
+ // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$MP8V@@EAAXXZI?f@1@QEAAXXZA@A@@@QEAA@XZ"
+ // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$I?f@V@@QEAAXXZA@A@@@QEAA@XZ"
+
+ AutoFunc<&S::f>();
+ // AFTER: call {{.*}} @"??$AutoFunc@$MP8S@@EAAXXZ1?f@1@QEAAXXZ@@YA?A?<auto>@@XZ"
+ // BEFORE: call {{.*}} @"??$AutoFunc@$1?f@S@@QEAAXXZ@@YA?A?<auto>@@XZ"
+
+ AutoFunc<&M::f>();
+ // AFTER: call {{.*}} @"??$AutoFunc@$MP8M@@EAAXXZH?f@1@QEAAXXZA@@@YA?A?<auto>@@XZ"
+ // BEFORE: call {{.*}} @"??$AutoFunc@$H?f@M@@QEAAXXZA@@@YA?A?<auto>@@XZ"
+
+ AutoFunc<&V::f>();
+ // AFTER: call {{.*}} @"??$AutoFunc@$MP8V@@EAAXXZI?f@1@QEAAXXZA@A@@@YA?A?<auto>@@XZ"
+ // BEFORE: call {{.*}} @"??$AutoFunc@$I?f@V@@QEAAXXZA@A@@@YA?A?<auto>@@XZ"
+
+ AutoParmTemplate<&S::a> auto_data_single_inheritance;
+ // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$MPEQS@@H07@@QEAA@XZ"
+ // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$07@@QEAA@XZ"
+
+ AutoParmTemplate<&M::a> auto_data_multiple_inheritance;
+ // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$MPEQM@@H0M@@@QEAA@XZ"
+ // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$0M@@@QEAA@XZ"
+
+ AutoParmTemplate<&V::a> auto_data_virtual_inheritance;
+ // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$MPEQV@@HFBA@A@@@QEAA@XZ"
+ // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$FBA@A@@@QEAA@XZ"
+
+ AutoFunc<&S::a>();
+ // AFTER: call {{.*}} @"??$AutoFunc@$MPEQS@@H07@@YA?A?<auto>@@XZ"
+ // BEFORE: call {{.*}} @"??$AutoFunc@$07@@YA?A?<auto>@@XZ"
+
+ AutoFunc<&M::a>();
+ // AFTER: call {{.*}} @"??$AutoFunc@$MPEQM@@H0M@@@YA?A?<auto>@@XZ"
+ // BEFORE: call {{.*}} @"??$AutoFunc@$0M@@@YA?A?<auto>@@XZ"
+
+ AutoFunc<&V::a>();
+ // AFTER: call {{.*}} @"??$AutoFunc@$MPEQV@@HFBA@A@@@YA?A?<auto>@@XZ"
+ // BEFORE: call {{.*}} @"??$AutoFunc@$FBA@A@@@YA?A?<auto>@@XZ"
+}
diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp
new file mode 100644
index 0000000..8f98c1e
--- /dev/null
+++ b/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -std=c++17 -fms-compatibility-version=19.20 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-windows-msvc | FileCheck --check-prefix=AFTER %s
+// RUN: %clang_cc1 -std=c++17 -fms-compatibility-version=19.14 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-windows-msvc | FileCheck --check-prefix=BEFORE %s
+
+template <auto a>
+class AutoParmTemplate {
+public:
+ AutoParmTemplate() {}
+};
+
+template <auto a>
+auto AutoFunc() {
+ return a;
+}
+
+void template_mangling() {
+
+ AutoParmTemplate<nullptr> auto_nullptr;
+ // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$M$$T0A@@@QEAA@XZ"
+ // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$0A@@@QEAA@XZ"
+
+ AutoFunc<nullptr>();
+ // AFTER: call {{.*}} @"??$AutoFunc@$M$$T0A@@@YA?A?<auto>@@XZ"
+ // BEFORE: call {{.*}} @"??$AutoFunc@$0A@@@YA?A?<auto>@@XZ"
+}
diff --git a/clang/test/Sema/attr-target-version.c b/clang/test/Sema/attr-target-version.c
index cd5be45..88a927a 100644
--- a/clang/test/Sema/attr-target-version.c
+++ b/clang/test/Sema/attr-target-version.c
@@ -104,6 +104,11 @@ int __attribute__((aarch64_vector_pcs, target_version("sha3"))) combine(void) {
int __attribute__((target_version("fp+aes+pmull+rcpc"))) unspec_args() { return -1; }
// expected-error@-1 {{multiversioned function must have a prototype}}
-// expected-error@+1 {{multiversioned function must have a prototype}}
int __attribute__((target_version("default"))) unspec_args() { return 0; }
int cargs() { return unspec_args(); }
+
+int unspec_args_implicit_default_first();
+// expected-error@-1 {{multiversioned function must have a prototype}}
+// expected-note@+1 {{function multiversioning caused by this declaration}}
+int __attribute__((target_version("aes"))) unspec_args_implicit_default_first() { return -1; }
+int __attribute__((target_version("default"))) unspec_args_implicit_default_first() { return 0; }
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index 3dafe82..a369ce6 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -266,7 +266,7 @@ template <typename U>
using Bar = Foo<U>; // expected-note {{could not match 'Foo<type-parameter-0-0>' against 'int'}} \
// expected-note {{implicit deduction guide declared as 'template <typename U> requires __is_deducible(test18::Bar, Foo<type-parameter-0-0>) Bar(Foo<type-parameter-0-0>) -> Foo<type-parameter-0-0>'}} \
// expected-note {{candidate template ignored: constraints not satisfied}} \
- // expected-note {{implicit deduction guide declared as 'template <typename T> requires False<T> && __is_deducible(test18::Bar, Foo<int>) Bar(type-parameter-0-0) -> Foo<int>'}} \
+ // expected-note {{implicit deduction guide declared as 'template <typename T> requires False<type-parameter-0-0> && __is_deducible(test18::Bar, Foo<int>) Bar(type-parameter-0-0) -> Foo<int>'}} \
// expected-note {{candidate function template not viable}} \
// expected-note {{implicit deduction guide declared as 'template <typename U> requires __is_deducible(test18::Bar, Foo<type-parameter-0-0>) Bar() -> Foo<type-parameter-0-0>'}}
@@ -414,4 +414,29 @@ struct A1 {
template <typename U>
using AFoo = A1<int>::A2<int>::Foo<U>;
AFoo case3(1);
+
+// Case4: this used to crash the constexpr evaluator due to the mixed-up index
+// for the template parameter `V`.
+template<class T, typename T2>
+struct Case4 {
+ template<class V> requires C<V>
+ Case4(V, T);
+};
+
+template<class T2>
+using ACase4 = Case4<T2, T2>;
+ACase4 case4{0, 1};
+
} // namespace test24
+
+namespace GH92212 {
+template<typename T, typename...Us>
+struct A{
+ template<typename V> requires __is_same(V, int)
+ A(V);
+};
+
+template<typename...TS>
+using AA = A<int, TS...>;
+AA a{0};
+}
diff --git a/clang/unittests/Frontend/CompilerInstanceTest.cpp b/clang/unittests/Frontend/CompilerInstanceTest.cpp
index a28d72e..8bc705d 100644
--- a/clang/unittests/Frontend/CompilerInstanceTest.cpp
+++ b/clang/unittests/Frontend/CompilerInstanceTest.cpp
@@ -91,7 +91,7 @@ TEST(CompilerInstance, AllowDiagnosticLogWithUnownedDiagnosticConsumer) {
DiagOpts, DiagPrinter.get(), /*ShouldOwnClient=*/false);
Diags->Report(diag::err_expected) << "no crash";
- ASSERT_EQ(DiagnosticsOS.str(), "error: expected no crash\n");
+ ASSERT_EQ(DiagnosticOutput, "error: expected no crash\n");
}
} // anonymous namespace
diff --git a/clang/unittests/Tooling/LexicallyOrderedRecursiveASTVisitorTest.cpp b/clang/unittests/Tooling/LexicallyOrderedRecursiveASTVisitorTest.cpp
index 38079f7..5d16595 100644
--- a/clang/unittests/Tooling/LexicallyOrderedRecursiveASTVisitorTest.cpp
+++ b/clang/unittests/Tooling/LexicallyOrderedRecursiveASTVisitorTest.cpp
@@ -87,7 +87,7 @@ bool LexicallyOrderedDeclVisitor::VisitNamedDecl(const NamedDecl *D) {
}
if (EmitDeclIndices)
OS << "@" << Index++;
- Matcher.match(OS.str(), D);
+ Matcher.match(Path, D);
return true;
}
@@ -96,7 +96,7 @@ bool LexicallyOrderedDeclVisitor::VisitDeclRefExpr(const DeclRefExpr *D) {
llvm::raw_string_ostream OS(Name);
if (EmitStmtIndices)
OS << "@" << Index++;
- Matcher.match(OS.str(), D);
+ Matcher.match(Name, D);
return true;
}
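Not part of the patch -- a minimal sketch (names are illustrative) of why these OS.str() calls can be dropped here and in the TableGen emitter below: llvm::raw_string_ostream writes straight through to the std::string it wraps, so the backing string is always up to date and can be passed around directly.

#include "llvm/Support/raw_ostream.h"
#include <string>

std::string renderIndex(unsigned Index) {
  std::string Name;
  llvm::raw_string_ostream OS(Name);
  OS << "decl@" << Index; // written through to Name immediately
  return Name;            // no OS.str() or OS.flush() needed
}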
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 7bf6091..1f83440 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -4786,7 +4786,7 @@ void EmitClangAttrParsedAttrImpl(RecordKeeper &Records, raw_ostream &OS) {
// Write out the declaration merging check logic.
OS << "static bool DiagnoseMutualExclusions(Sema &S, const NamedDecl *D, "
<< "const Attr *A) {\n";
- OS << MergeDeclOS.str();
+ OS << DeclMergeChecks;
OS << " return true;\n";
OS << "}\n\n";
@@ -4796,7 +4796,7 @@ void EmitClangAttrParsedAttrImpl(RecordKeeper &Records, raw_ostream &OS) {
OS << "static bool DiagnoseMutualExclusions(Sema &S, "
<< "const SmallVectorImpl<const Attr *> &C) {\n";
OS << " for (const Attr *A : C) {\n";
- OS << MergeStmtOS.str();
+ OS << StmtMergeChecks;
OS << " }\n";
OS << " return true;\n";
OS << "}\n\n";
@@ -4939,7 +4939,7 @@ void EmitClangAttrTextNodeDump(RecordKeeper &Records, raw_ostream &OS) {
if (!Args.empty())
OS << " const auto *SA = cast<" << R.getName()
<< "Attr>(A); (void)SA;\n";
- OS << SS.str();
+ OS << FunctionContent;
OS << " }\n";
}
}
@@ -4968,7 +4968,7 @@ void EmitClangAttrNodeTraverse(RecordKeeper &Records, raw_ostream &OS) {
if (!Args.empty())
OS << " const auto *SA = cast<" << R.getName()
<< "Attr>(A); (void)SA;\n";
- OS << SS.str();
+ OS << FunctionContent;
OS << " }\n";
}
}
diff --git a/clang/www/analyzer/alpha_checks.html b/clang/www/analyzer/alpha_checks.html
index 501a9bc..1ee44c7 100644
--- a/clang/www/analyzer/alpha_checks.html
+++ b/clang/www/analyzer/alpha_checks.html
@@ -2,7 +2,9 @@
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
- <title>Alpha Checks</title>
+ <title>Alpha Checkers documentation has moved to clang.llvm.org</title>
+ <link rel="canonical" href="https://clang.llvm.org/docs/analyzer/checkers.html#experimental-checkers"/>
+ <meta http-equiv="refresh" content="0;url=https://clang.llvm.org/docs/analyzer/checkers.html#experimental-checkers" />
<link type="text/css" rel="stylesheet" href="menu.css">
<link type="text/css" rel="stylesheet" href="content.css">
<script type="text/javascript" src="scripts/menu.js"></script>
@@ -17,938 +19,11 @@
<!--#include virtual="menu.html.incl"-->
<div id="content">
-<h1>Alpha Checkers</h1>
-Experimental checkers in addition to the <a href = "available_checks.html">
-Default Checkers</a>. These are checkers with known issues or limitations that
-keep them from being on by default. They are likely to have false positives.
-Bug reports are welcome but will likely not be investigated for some time.
-Patches welcome!
-<ul>
-<li><a href="#clone_alpha_checkers">Clone Alpha Checkers</a></li>
-<li><a href="#core_alpha_checkers">Core Alpha Checkers</a></li>
-<li><a href="#cplusplus_alpha_checkers">C++ Alpha Checkers</a></li>
-<li><a href="#llvm_alpha_checkers">LLVM Checkers</a></li>
-<li><a href="#valist_alpha_checkers">Variable Argument Alpha Checkers</a></li>
-<li><a href="#deadcode_alpha_checkers">Dead Code Alpha Checkers</a></li>
-<li><a href="#osx_alpha_checkers">OS X Alpha Checkers</a></li>
-<li><a href="#security_alpha_checkers">Security Alpha Checkers</a></li>
-<li><a href="#unix_alpha_checkers">Unix Alpha Checkers</a></li>
-<li><a href="#nondeterminism_alpha_checkers">Non-determinism Alpha Checkers</a></li>
-</ul>
-<!-- ============================= clone alpha ============================= -->
-
-<h3 id="clone_alpha_checkers">Clone Alpha Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="alpha.clone.CloneChecker"><div class="namedescr expandable"><span class="name">
-alpha.clone.CloneChecker</span><span class="lang">
-(C, C++, ObjC)</span><div class="descr">
-Reports similar pieces of code.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void log();
-
-int max(int a, int b) { // warn
- log();
- if (a > b)
- return a;
- return b;
-}
-
-int maxClone(int x, int y) { // similar code here
- log();
- if (x > y)
- return x;
- return y;
-}
-</pre></div></div></td></tr>
-</tbody></table>
-
-<!-- ============================= core alpha ============================= -->
-<h3 id="core_alpha_checkers">Core Alpha Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="alpha.core.BoolAssignment"><div class="namedescr expandable"><span class="name">
-alpha.core.BoolAssignment</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warn about assigning non-{0,1} values to boolean variables.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- BOOL b = -1; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.core.CastSize"><div class="namedescr expandable"><span class="name">
-alpha.core.CastSize</span><span class="lang">
-(C)</span><div class="descr">
-Check when casting a malloc'ed type T, whether the size is a multiple of the
-size of T (Works only with <span class="name">unix.Malloc</span>
-or <span class="name">alpha.unix.MallocWithAnnotations</span>
-checks enabled).</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int *x = (int *)malloc(11); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.core.CastToStruct"><div class="namedescr expandable"><span class="name">
-alpha.core.CastToStruct</span><span class="lang">
-(C, C++)</span><div class="descr">
-Check for cast from non-struct pointer to struct pointer.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-// C
-struct s {};
-
-void test(int *p) {
- struct s *ps = (struct s *) p; // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// C++
-class c {};
-
-void test(int *p) {
- c *pc = (c *) p; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.core.Conversion"><div class="namedescr expandable"><span class="name">
-alpha.core.Conversion</span><span class="lang">
-(C, C++, ObjC)</span><div class="descr">
-Loss of sign or precision in implicit conversions</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(unsigned U, signed S) {
- if (S > 10) {
- if (U < S) {
- }
- }
- if (S < -10) {
- if (U < S) { // warn (loss of sign)
- }
- }
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test() {
- long long A = 1LL << 60;
- short X = A; // warn (loss of precision)
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.core.DynamicTypeChecker"><div class="namedescr expandable"><span class="name">
-alpha.core.DynamicTypeChecker</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check for cases where the dynamic and the static type of an
-object are unrelated.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-id date = [NSDate date];
-
-// Warning: Object has a dynamic type 'NSDate *' which is
-// incompatible with static type 'NSNumber *'"
-NSNumber *number = date;
-[number doubleValue];
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.core.FixedAddr"><div class="namedescr expandable"><span class="name">
-alpha.core.FixedAddr</span><span class="lang">
-(C)</span><div class="descr">
-Check for assignment of a fixed address to a pointer.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int *p;
- p = (int *) 0x10000; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.core.IdenticalExpr"><div class="namedescr expandable"><span class="name">
-alpha.core.IdenticalExpr</span><span class="lang">
-(C, C++)</span><div class="descr">
-Warn about suspicious uses of identical expressions.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-// C
-void test() {
- int a = 5;
- int b = a | 4 | a; // warn: identical expr on both sides
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// C++
-bool f(void);
-
-void test(bool b) {
- int i = 10;
- if (f()) { // warn: true and false branches are identical
- do {
- i--;
- } while (f());
- } else {
- do {
- i--;
- } while (f());
- }
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.core.PointerArithm"><div class="namedescr expandable"><span class="name">
-alpha.core.PointerArithm</span><span class="lang">
-(C)</span><div class="descr">
-Check for pointer arithmetic on locations other than array
-elements.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int x;
- int *p;
- p = &amp;x + 1; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.core.PointerSub"><div class="namedescr expandable"><span class="name">
-alpha.core.PointerSub</span><span class="lang">
-(C)</span><div class="descr">
-Check for pointer subtractions on two pointers pointing to different memory
-chunks.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int x, y;
- int d = &amp;y - &amp;x; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.core.StackAddressAsyncEscape"><div class="namedescr expandable"><span class="name">
-alpha.core.StackAddressAsyncEscape</span><span class="lang">
-(C)</span><div class="descr">
-Check that addresses to stack memory do not escape the function that involves
-<code>dispatch_after</code> or <code>dispatch_async</code>. This checker is
-a part of core.StackAddressEscape, but is
-<a href=https://reviews.llvm.org/D41042>temporarily disabled</a> until some
-false positives are fixed.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-dispatch_block_t test_block_inside_block_async_leak() {
- int x = 123;
- void (^inner)(void) = ^void(void) {
- int y = x;
- ++y;
- };
- void (^outer)(void) = ^void(void) {
- int z = x;
- ++z;
- inner();
- };
- return outer; // warn: address of stack-allocated block is captured by a
- // returned block
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.core.TestAfterDivZero"><div class="namedescr expandable"><span class="name">
-alpha.core.TestAfterDivZero</span><span class="lang">
-(C, C++, ObjC)</span><div class="descr">
-Check for division by variable that is later compared against 0.
-Either the comparison is useless or there is division by zero.
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(int x) {
- var = 77 / x;
- if (x == 0) { } // warn
-}
-</pre></div></div></td></tr>
-
-
-</tbody></table>
-
-<!-- =========================== cplusplus alpha =========================== -->
-<h3 id="cplusplus_alpha_checkers">C++ Alpha Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-<tbody>
-
-
-<tr><td><a id="alpha.cplusplus.DeleteWithNonVirtualDtor"><div class="namedescr expandable"><span class="name">
-alpha.cplusplus.DeleteWithNonVirtualDtor</span><span class="lang">
-(C++)</span><div class="descr">
-Reports destructions of polymorphic objects with a non-virtual destructor in
-their base class
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-NonVirtual *create() {
- NonVirtual *x = new NVDerived(); // note: Casting from 'NVDerived' to
- // 'NonVirtual' here
- return x;
-}
-
-void sink(NonVirtual *x) {
- delete x; // warn: destruction of a polymorphic object with no virtual
- // destructor
-}
-</pre></div></div></td></tr>
-
-<tr><td><a id="alpha.cplusplus.InvalidatedIterator"><div class="namedescr expandable"><span class="name">
-alpha.cplusplus.InvalidatedIterator</span><span class="lang">
-(C++)</span><div class="descr">
-Check for use of invalidated iterators.
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void bad_copy_assign_operator_list1(std::list<int> &L1,
- const std::list<int> &L2) {
- auto i0 = L1.cbegin();
- L1 = L2;
- *i0; // warn: invalidated iterator accessed
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.cplusplus.IteratorRange"><div class="namedescr expandable"><span class="name">
-alpha.cplusplus.IteratorRange</span><span class="lang">
-(C++)</span><div class="descr">
-Check for iterators used outside their valid ranges.
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void simple_bad_end(const std::vector<int> &v) {
- auto i = v.end();
- *i; // warn: iterator accessed outside of its range
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.cplusplus.MismatchedIterator"><div class="namedescr expandable"><span class="name">
-alpha.cplusplus.MismatchedIterator</span><span class="lang">
-(C++)</span><div class="descr">
-Check for use of iterators of different containers where iterators of the same
-container are expected.
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void bad_insert3(std::vector<int> &v1, std::vector<int> &v2) {
- v2.insert(v1.cbegin(), v2.cbegin(), v2.cend()); // warn: container accessed
- // using foreign
- // iterator argument
- v1.insert(v1.cbegin(), v1.cbegin(), v2.cend()); // warn: iterators of
- // different containers
- // used where the same
- // container is
- // expected
- v1.insert(v1.cbegin(), v2.cbegin(), v1.cend()); // warn: iterators of
- // different containers
- // used where the same
- // container is
- // expected
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.cplusplus.Move"><div class="namedescr expandable"><span class="name">
-alpha.cplusplus.Move</span><span class="lang">
-(C++)</span><div class="descr">
-Method calls on a moved-from object and copying a moved-from object will be
-reported.
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-struct A {
- void foo() {}
-};
-
-void f() {
- A a;
- A b = std::move(a); // note: 'a' became 'moved-from' here
- a.foo(); // warn: method call on a 'moved-from' object 'a'
-}
-</pre></div></div></td></tr>
-
-
-</tbody></table>
-
-
-<!-- =========================== dead code alpha =========================== -->
-<h3 id="deadcode_alpha_checkers">Dead Code Alpha Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="alpha.deadcode.UnreachableCode"><div class="namedescr expandable"><span class="name">
-alpha.deadcode.UnreachableCode</span><span class="lang">
-(C, C++, ObjC)</span><div class="descr">
-Check unreachable code.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-// C
-int test() {
- int x = 1;
- while(x);
- return x; // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// C++
-void test() {
- int a = 2;
-
- while (a > 1)
- a--;
-
- if (a > 1)
- a++; // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// Objective-C
-void test(id x) {
- return;
- [x retain]; // warn
-}
-</pre></div></div></td></tr>
-</tbody></table>
-
-<!-- =========================== llvm alpha =========================== -->
-<h3 id="llvm_alpha_checkers">LLVM Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="alpha.llvm.Conventions"><div class="namedescr expandable"><span class="name">
-alpha.llvm.Conventions</span><span class="lang">
-(C)</span><div class="descr">
-Check code for LLVM codebase conventions:
-<ul>
- <li>A <code>StringRef</code> should not be bound to a temporary std::string
- whose lifetime is shorter than the <code>StringRef</code>'s.</li>
- <li>Clang AST nodes should not have fields that can allocate memory.</li>
-</ul>
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-<!-- TODO: Add examples, as currently it's hard to get this checker working. -->
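-// A hedged, illustrative sketch (not taken from the checker's docs) of the
-// StringRef lifetime convention described above; names are hypothetical.
-std::string getName();
-
-void test() {
-  StringRef Name = getName();
-  // warn (assumed): StringRef is bound to a temporary std::string whose
-  // lifetime ends at the end of this statement, leaving Name dangling
-}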
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-
-<!-- ============================== OS X alpha ============================== -->
-<h3 id="osx_alpha_checkers">OS X Alpha Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="alpha.osx.cocoa.DirectIvarAssignment"><div class="namedescr expandable"><span class="name">
-alpha.osx.cocoa.DirectIvarAssignment</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check that Objective-C properties follow the rule that the property
-should be set with the setter, not through a direct assignment.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@interface MyClass : NSObject {}
-@property (readonly) id A;
-- (void) foo;
-@end
-
-@implementation MyClass
-- (void) foo {
- _A = 0; // warn
-}
-@end
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.osx.cocoa.DirectIvarAssignmentForAnnotatedFunctions"><div class="namedescr expandable"><span class="name">
-alpha.osx.cocoa.DirectIvarAssignmentForAnnotatedFunctions</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check for direct assignments to instance variables in the methods annotated
-with <code>objc_no_direct_instance_variable_assignment</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@interface MyClass : NSObject {}
-@property (readonly) id A;
-- (void) fAnnotated __attribute__((
- annotate("objc_no_direct_instance_variable_assignment")));
-- (void) fNotAnnotated;
-@end
-
-@implementation MyClass
-- (void) fAnnotated {
- _A = 0; // warn
-}
-- (void) fNotAnnotated {
- _A = 0; // no warn
-}
-@end
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.osx.cocoa.InstanceVariableInvalidation"><div class="namedescr expandable"><span class="name">
-alpha.osx.cocoa.InstanceVariableInvalidation</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check that the invalidatable instance variables are invalidated in the methods
-annotated with <code>objc_instance_variable_invalidator</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@protocol Invalidation &lt;NSObject&gt;
-- (void) invalidate
- __attribute__((annotate("objc_instance_variable_invalidator")));
-@end
-
-@interface InvalidationImpObj : NSObject &lt;Invalidation&gt;
-@end
-
-@interface SubclassInvalidationImpObj : InvalidationImpObj {
- InvalidationImpObj *var;
-}
-- (void)invalidate;
-@end
-
-@implementation SubclassInvalidationImpObj
-- (void) invalidate {}
-@end
-// warn: var needs to be invalidated or set to nil
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.osx.cocoa.MissingInvalidationMethod"><div class="namedescr expandable"><span class="name">
-alpha.osx.cocoa.MissingInvalidationMethod</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check that the invalidation methods are present in classes that contain
-invalidatable instance variables.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@protocol Invalidation &lt;NSObject&gt;
-- (void)invalidate
- __attribute__((annotate("objc_instance_variable_invalidator")));
-@end
-
-@interface NeedInvalidation : NSObject &lt;Invalidation&gt;
-@end
-
-@interface MissingInvalidationMethodDecl : NSObject {
- NeedInvalidation *Var; // warn
-}
-@end
-
-@implementation MissingInvalidationMethodDecl
-@end
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.osx.cocoa.localizability.PluralMisuseChecker"><div class="namedescr expandable"><span class="name">
-alpha.osx.cocoa.localizability.PluralMisuseChecker</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warns against using the one vs. many plural pattern in code
-when generating localized strings.
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-NSString *reminderText =
- NSLocalizedString(@"None", @"Indicates no reminders");
-if (reminderCount == 1) {
- // Warning: Plural cases are not supported across all languages.
- // Use a .stringsdict file instead
- reminderText =
- NSLocalizedString(@"1 Reminder", @"Indicates single reminder");
-} else if (reminderCount >= 2) {
- // Warning: Plural cases are not supported across all languages.
- // Use a .stringsdict file instead
- reminderText =
- [NSString stringWithFormat:
- NSLocalizedString(@"%@ Reminders", @"Indicates multiple reminders"),
- reminderCount];
-}
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-<!-- =========================== security alpha =========================== -->
-<h3 id="security_alpha_checkers">Security Alpha Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="alpha.security.ArrayBound"><div class="namedescr expandable"><span class="name">
-alpha.security.ArrayBound</span><span class="lang">
-(C)</span><div class="descr">
-Warn about buffer overflows (older checker).</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- char *s = "";
- char c = s[1]; // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-struct seven_words {
- int c[7];
-};
-
-void test() {
- struct seven_words a, *p;
- p = &a;
- p[0] = a;
- p[1] = a;
- p[2] = a; // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// note: requires unix.Malloc or
-// alpha.unix.MallocWithAnnotations checks enabled.
-void test() {
- int *p = malloc(12);
- p[3] = 4; // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test() {
- char a[2];
- int *b = (int*)a;
- b[1] = 3; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.security.ArrayBoundV2"><div class="namedescr expandable"><span class="name">
-alpha.security.ArrayBoundV2</span><span class="lang">
-(C)</span><div class="descr">
-Warn about buffer overflows (newer checker).</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- char *s = "";
- char c = s[1]; // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test() {
- int buf[100];
- int *p = buf;
- p = p + 99;
- p[1] = 1; // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// note: compiler has internal check for this.
-// Use -Wno-array-bounds to suppress compiler warning.
-void test() {
- int buf[100][100];
- buf[0][-1] = 1; // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// note: requires alpha.security.taint check turned on.
-void test() {
- char s[] = "abc";
- int x = getchar();
- char c = s[x]; // warn: index is tainted
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.security.MallocOverflow"><div class="namedescr expandable"><span class="name">
-alpha.security.MallocOverflow</span><span class="lang">
-(C)</span><div class="descr">
-Check for overflows in the arguments to <code>malloc()</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(int n) {
- void *p = malloc(n * sizeof(int)); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.security.MmapWriteExec"><div class="namedescr expandable"><span class="name">
-alpha.security.MmapWriteExec</span><span class="lang">
-(C)</span><div class="descr">
-Warn on <code>mmap()</code> calls that are both writable and executable.
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(int n) {
- void *c = mmap(NULL, 32, PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_PRIVATE | MAP_ANON, -1, 0);
- // warn: Both PROT_WRITE and PROT_EXEC flags are set. This can lead to
- // exploitable memory regions, which could be overwritten with malicious
- // code
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.security.ReturnPtrRange"><div class="namedescr expandable"><span class="name">
-alpha.security.ReturnPtrRange</span><span class="lang">
-(C)</span><div class="descr">
-Check for an out-of-bound pointer being returned to callers.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-static int A[10];
-
-int *test() {
- int *p = A + 10;
- return p; // warn
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-int test(void) {
- int x;
- return x; // warn: undefined or garbage returned
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.security.taint.TaintPropagation"><div class="namedescr expandable"><span class="name">
-alpha.security.taint.TaintPropagation</span><span class="lang">
-(C)</span><div class="descr">
-Generate taint information used by other checkers.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- char x = getchar(); // 'x' marked as tainted
- system(&x); // warn: untrusted data is passed to a system call
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// note: compiler internally checks if the second param to
-// sprintf is a string literal or not.
-// Use -Wno-format-security to suppress compiler warning.
-void test() {
- char s[10], buf[10];
- fscanf(stdin, "%s", s); // 's' marked as tainted
-
- sprintf(buf, s); // warn: untrusted data as a format string
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test() {
- size_t ts;
- scanf("%zd", &ts); // 'ts' marked as tainted
- int *p = (int *)malloc(ts * sizeof(int));
- // warn: untrusted data as buffer size
-}
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-<!-- ============================= unix alpha ============================= -->
-<h3 id="unix_alpha_checkers">Unix Alpha Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-<tbody>
-
-
-<tr><td><a id="alpha.unix.Chroot"><div class="namedescr expandable"><span class="name">
-alpha.unix.Chroot</span><span class="lang">
-(C)</span><div class="descr">
-Check improper use of <code>chroot</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void f();
-
-void test() {
- chroot("/usr/local");
- f(); // warn: no call of chdir("/") immediately after chroot
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.unix.PthreadLock"><div class="namedescr expandable"><span class="name">
-alpha.unix.PthreadLock</span><span class="lang">
-(C)</span><div class="descr">
-Simple lock -> unlock checker; applies to:<div class=functions>
-pthread_mutex_lock<br>
-pthread_rwlock_rdlock<br>
-pthread_rwlock_wrlock<br>
-lck_mtx_lock<br>
-lck_rw_lock_exclusive<br>
-lck_rw_lock_shared<br>
-pthread_mutex_trylock<br>
-pthread_rwlock_tryrdlock<br>
-pthread_rwlock_trywrlock<br>
-lck_mtx_try_lock<br>
-lck_rw_try_lock_exclusive<br>
-lck_rw_try_lock_shared<br>
-pthread_mutex_unlock<br>
-pthread_rwlock_unlock<br>
-lck_mtx_unlock<br>
-lck_rw_done</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-pthread_mutex_t mtx;
-
-void test() {
- pthread_mutex_lock(&mtx);
- pthread_mutex_lock(&mtx);
- // warn: this lock has already been acquired
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-lck_mtx_t lck1, lck2;
-
-void test() {
- lck_mtx_lock(&lck1);
- lck_mtx_lock(&lck2);
- lck_mtx_unlock(&lck1);
- // warn: this was not the most recently acquired lock
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-lck_mtx_t lck1, lck2;
-
-void test() {
- if (lck_mtx_try_lock(&lck1) == 0)
- return;
-
- lck_mtx_lock(&lck2);
- lck_mtx_unlock(&lck1);
- // warn: this was not the most recently acquired lock
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.unix.SimpleStream"><div class="namedescr expandable"><span class="name">
-alpha.unix.SimpleStream</span><span class="lang">
-(C)</span><div class="descr">
-Check for misuses of stream APIs:<div class=functions>
-fopen<br>
-fclose</div>(demo checker, the subject of the demo
-(<a href="https://llvm.org/devmtg/2012-11/Zaks-Rose-Checker24Hours.pdf">Slides</a>
-, <a href="https://youtu.be/kdxlsP5QVPw">Video</a>)
-by Anna Zaks and Jordan Rose presented at the <a href="https://llvm.org/devmtg/2012-11/">
-2012 LLVM Developers' Meeting</a>).</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- FILE *F = fopen("myfile.txt", "w");
-} // warn: opened file is never closed
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test() {
- FILE *F = fopen("myfile.txt", "w");
-
- if (F)
- fclose(F);
-
- fclose(F); // warn: closing a previously closed file stream
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.unix.cstring.BufferOverlap"><div class="namedescr expandable"><span class="name">
-alpha.unix.cstring.BufferOverlap</span><span class="lang">
-(C)</span><div class="descr">
-Checks for overlap in two buffer arguments; applies to:<div class=functions>
-memcpy<br>
-mempcpy</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int a[4] = {0};
- memcpy(a + 2, a + 1, 8); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.unix.cstring.NotNullTerminated"><div class="namedescr expandable"><span class="name">
-alpha.unix.cstring.NotNullTerminated</span><span class="lang">
-(C)</span><div class="descr">
-Check for arguments which are not null-terminated strings; applies
-to:<div class=functions>
-strlen<br>
-strnlen<br>
-strcpy<br>
-strncpy<br>
-strcat<br>
-strncat</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int y = strlen((char *)&test); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="alpha.unix.cstring.OutOfBounds"><div class="namedescr expandable"><span class="name">
-alpha.unix.cstring.OutOfBounds</span><span class="lang">
-(C)</span><div class="descr">
-Check for out-of-bounds access in string functions; applies
-to:<div class=functions>
-strncpy<br>
-strncat</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(char *y) {
- char x[4];
- if (strlen(y) == 4)
- strncpy(x, y, 5); // warn
-}
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-<!-- =========================== nondeterminism alpha =========================== -->
-<h3 id="nondeterminism_alpha_checkers">Non-determinism Alpha Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="alpha.nondeterminism.PointerIteration"><div class="namedescr expandable"><span class="name">
-alpha.nondeterminism.PointerIteration</span><span class="lang">
-(C++)</span><div class="descr">
-Check for non-determinism caused by iterating unordered containers of pointers.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-// C++
-void test() {
- int a = 1, b = 2;
- std::unordered_set<int *> UnorderedPtrSet = {&a, &b};
-
- for (auto i : UnorderedPtrSet) // warn
- f(i);
-}
-</pre></div></div></td></tr>
-<tr><td><a id="alpha.nondeterminism.PointerSorting"><div class="namedescr expandable"><span class="name">
-alpha.nondeterminism.PointerSorting</span><span class="lang">
-(C++)</span><div class="descr">
-Check for non-determinism caused by sorting of pointers.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-// C++
-void test() {
- int a = 1, b = 2;
- std::vector<int *> V = {&a, &b};
- std::sort(V.begin(), V.end()); // warn
-}
-</pre></div></div></td></tr>
-</tbody></table>
+<h1>The Clang Static Analyzer checker documentation has moved to clang.llvm.org</h1>
+<p style="color:red; font-size:200%">This page is deprecated and will be removed in release 21.0</p>
+<a href="https://clang.llvm.org/docs/analyzer/checkers.html#experimental-checkers">The new site</a>
+<script>window.location='https://clang.llvm.org/docs/analyzer/checkers.html#experimental-checkers'</script>
</div> <!-- page -->
</div> <!-- content -->
diff --git a/clang/www/analyzer/available_checks.html b/clang/www/analyzer/available_checks.html
index c23865e..7be155a 100644
--- a/clang/www/analyzer/available_checks.html
+++ b/clang/www/analyzer/available_checks.html
@@ -2,7 +2,9 @@
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
- <title>Available Checkers</title>
+ <title>Available Checkers documentation has moved to clang.llvm.org</title>
+ <link rel="canonical" href="https://clang.llvm.org/docs/analyzer/checkers.html"/>
+ <meta http-equiv="refresh" content="0;url=https://clang.llvm.org/docs/analyzer/checkers.html" />
<link type="text/css" rel="stylesheet" href="menu.css">
<link type="text/css" rel="stylesheet" href="content.css">
<script type="text/javascript" src="scripts/menu.js"></script>
@@ -17,1742 +19,11 @@
<!--#include virtual="menu.html.incl"-->
<div id="content">
-<h1>Available Checkers</h1>
-The analyzer performs checks that are categorized into families or "checkers". The
-default set of checkers covers a variety of checks targeted at finding security
-and API usage bugs, dead code, and other logic errors. See the
-<a href = "#default_checkers">Default Checkers</a> list below. In addition to
-these, the analyzer contains a number of <a href = "alpha_checks.html">
-Experimental (Alpha) Checkers</a>.
-<h3>Writeups with examples of some of the bugs that the analyzer finds</h3>
-<ul>
-<li><a href="http://www.mobileorchard.com/bug-finding-with-clang-5-resources-to-get-you-started/">Bug Finding With Clang: 5 Resources To Get You Started</a></li>
-<li><a href="https://fruitstandsoftware.mrrooni.com/blog/blog/2008/08/04/finding-memory-leaks-with-the-llvmclang-static-analyzer/">Finding Memory Leaks With The LLVM/Clang Static Analyzer</a></li>
-<li><a href="https://weblog.rogueamoeba.com/2008/07/14/the-clang-static-analyzer/">Under the Microscope - The Clang Static Analyzer</a></li>
-<li><a href="https://www.mikeash.com/pyblog/friday-qa-2009-03-06-using-the-clang-static-analyzer.html">Mike Ash - Using the Clang Static Analyzer</a></li>
-</ul>
-
-<h2 id="default_checkers">Default Checkers</h2>
-<ul>
-<li><a href="#core_checkers">Core Checkers</a> model core language features and perform general-purpose checks such as division by zero, null pointer dereference, usage of uninitialized values, etc.</li>
-<li><a href="#cplusplus_checkers">C++ Checkers</a> perform C++-specific checks</li>
-<li><a href="#deadcode_checkers">Dead Code Checkers</a> check for unused code</li>
-<li><a href="#nullability_checkers">Nullability Checkers</a> </li>
-<li><a href="#optin_checkers">Optin Checkers</a> </li>
-<li><a href="#osx_checkers">OS X Checkers</a> perform Objective-C-specific checks and check the use of Apple's SDKs (OS X and iOS)</li>
-<li><a href="#security_checkers">Security Checkers</a> check for insecure API usage and perform checks based on the CERT Secure Coding Standards</li>
-<li><a href="#unix_checkers">Unix Checkers</a> check the use of Unix and POSIX APIs</li>
-</ul>
-
-<!-- =========================== core =========================== -->
-<h3 id="core_checkers">Core Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="core.CallAndMessage"><div class="namedescr expandable"><span class="name">
-core.CallAndMessage</span><span class="lang">
-(C, C++, ObjC)</span><div class="descr">
-Check for logical errors for function calls and Objective-C message expressions
-(e.g., uninitialized arguments, null function pointers).</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-// C
-struct S {
- int x;
-};
-
-void f(struct S s);
-
-void test() {
- struct S s;
- f(s); // warn: passed-by-value arg contains uninitialized data
-}
-</pre></div>
-<div class="example"><pre>
-// C
-void test() {
- void (*foo)(void);
- foo(); // warn: function pointer is uninitialized
-}
-</pre></div>
-<div class="example"><pre>
-// C
-void test() {
- void (*foo)(void);
- foo = 0;
- foo(); // warn: function pointer is null
-}
-</pre></div>
-<div class="example"><pre>
-// C++
-class C {
-public:
- void f();
-};
-
-void test() {
- C *pc;
- pc-&gt;f(); // warn: object pointer is uninitialized
-}
-</pre></div>
-<div class="example"><pre>
-// C++
-class C {
-public:
- void f();
-};
-
-void test() {
- C *pc = 0;
- pc-&gt;f(); // warn: object pointer is null
-}
-</pre></div>
-<div class="example"><pre>
-// Objective-C
-@interface MyClass : NSObject
-@property (readwrite,assign) id x;
-- (long double)longDoubleM;
-@end
-
-void test() {
- MyClass *obj1;
- long double ld1 = [obj1 longDoubleM];
- // warn: receiver is uninitialized
-}
-</pre></div>
-<div class="example"><pre>
-// Objective-C
-@interface MyClass : NSObject
-@property (readwrite,assign) id x;
-- (long double)longDoubleM;
-@end
-
-void test() {
- MyClass *obj1;
- id i = obj1.x; // warn: uninitialized object pointer
-}
-</pre></div>
-<div class="example"><pre>
-// Objective-C
-@interface Subscriptable : NSObject
-- (id)objectAtIndexedSubscript:(unsigned int)index;
-@end
-
-@interface MyClass : Subscriptable
-@property (readwrite,assign) id x;
-- (long double)longDoubleM;
-@end
-
-void test() {
- MyClass *obj1;
- id i = obj1[0]; // warn: uninitialized object pointer
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.DivideZero"><div class="namedescr expandable"><span class="name">
-core.DivideZero</span><span class="lang">
-(C, C++, ObjC)</span><div class="descr">
-Check for division by zero.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(int z) {
- if (z == 0)
- int x = 1 / z; // warn
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- int x = 1;
- int y = x % 0; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.NonNullParamChecker"><div class="namedescr expandable"><span class="name">
-core.NonNullParamChecker</span><span class="lang">
-(C, C++, ObjC)</span><div class="descr">
-Check for null pointers passed as arguments to a function whose arguments are
-marked with the <code>nonnull</code> attribute.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-int f(int *p) __attribute__((nonnull));
-
-void test(int *p) {
- if (!p)
- f(p); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.NullDereference"><div class="namedescr expandable"><span class="name">
-core.NullDereference</span><span class="lang">
-(C, C++, ObjC)</span><div class="descr">
-Check for dereferences of null pointers.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-// C
-void test(int *p) {
- if (p)
- return;
-
- int x = p[0]; // warn
-}
-</pre></div>
-<div class="example"><pre>
-// C
-void test(int *p) {
- if (!p)
- *p = 0; // warn
-}
-</pre></div>
-<div class="example"><pre>
-// C++
-class C {
-public:
- int x;
-};
-
-void test() {
- C *pc = 0;
- int k = pc->x; // warn
-}
-</pre></div>
-<div class="example"><pre>
-// Objective-C
-@interface MyClass {
-@public
- int x;
-}
-@end
-
-void test() {
- MyClass *obj = 0;
- obj-&gt;x = 1; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.StackAddressEscape"><div class="namedescr expandable"><span class="name">
-core.StackAddressEscape</span><span class="lang">
-(C)</span><div class="descr">
-Check that addresses of stack memory do not escape the function.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-char const *p;
-
-void test() {
- char const str[] = "string";
- p = str; // warn
-}
-</pre></div>
-<div class="example"><pre>
-void* test() {
- return __builtin_alloca(12); // warn
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- static int *x;
- int y;
- x = &amp;y; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.UndefinedBinaryOperatorResult"><div class="namedescr expandable"><span class="name">
-core.UndefinedBinaryOperatorResult</span><span class="lang">
-(C)</span><div class="descr">
-Check for undefined results of binary operators.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int x;
- int y = x + 1; // warn: left operand is garbage
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.VLASize"><div class="namedescr expandable"><span class="name">
-core.VLASize</span><span class="lang">
-(C)</span><div class="descr">
-Check for declarations of VLA of undefined or zero size.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int x;
- int vla1[x]; // warn: garbage as size
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- int x = 0;
- int vla2[x]; // warn: zero size
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.uninitialized.ArraySubscript"><div class="namedescr expandable"><span class="name">
-core.uninitialized.ArraySubscript</span><span class="lang">
-(C)</span><div class="descr">
-Check for uninitialized values used as array subscripts.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int i, a[10];
- int x = a[i]; // warn: array subscript is undefined
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.uninitialized.Assign"><div class="namedescr expandable"><span class="name">
-core.uninitialized.Assign</span><span class="lang">
-(C)</span><div class="descr">
-Check for assigning uninitialized values.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int x;
- x |= 1; // warn: left expression is uninitialized
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.uninitialized.Branch"><div class="namedescr expandable"><span class="name">
-core.uninitialized.Branch</span><span class="lang">
-(C)</span><div class="descr">
-Check for uninitialized values used as branch conditions.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int x;
- if (x) // warn
- return;
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.uninitialized.CapturedBlockVariable"><div class="namedescr expandable"><span class="name">
-core.uninitialized.CapturedBlockVariable</span><span class="lang">
-(C)</span><div class="descr">
-Check for blocks that capture uninitialized values.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int x;
- ^{ int y = x; }(); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="core.uninitialized.UndefReturn"><div class="namedescr expandable"><span class="name">
-core.uninitialized.UndefReturn</span><span class="lang">
-(C)</span><div class="descr">
-Check for uninitialized values being returned to the caller.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-int test() {
- int x;
- return x; // warn
-}
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-<!-- =========================== C++ =========================== -->
-<h3 id="cplusplus_checkers">C++ Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-
-<tr><td><a id="cplusplus.ArrayDelete"><div class="namedescr expandable"><span class="name">
-cplusplus.ArrayDelete</span><span class="lang">
-(C++)</span><div class="descr">
-Reports destructions of arrays of polymorphic objects that are destructed as
-their base class. If the type of an array-pointer is different from the type of
-its underlying objects, calling <code>delete[]</code> is undefined.
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-class Base {
-public:
- virtual ~Base() {}
-};
-class Derived : public Base {};
-
-Base *create() {
- Base *x = new Derived[10]; // note: Casting from 'Derived' to 'Base' here
- return x;
-}
-
-void sink(Base *x) {
- delete[] x; // warn: Deleting an array of 'Derived' objects as their base class 'Base' is undefined
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="cplusplus.NewDelete"><div class="namedescr expandable"><span class="name">
-cplusplus.NewDelete</span><span class="lang">
-(C++)</span><div class="descr">
-Check for double-free, use-after-free and offset problems involving C++ <code>
-delete</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void f(int *p);
-
-void testUseMiddleArgAfterDelete(int *p) {
- delete p;
- f(p); // warn: use after free
-}
-</pre></div>
-<div class="example"><pre>
-class SomeClass {
-public:
- void f();
-};
-
-void test() {
- SomeClass *c = new SomeClass;
- delete c;
- c-&gt;f(); // warn: use after free
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- int *p = (int *)__builtin_alloca(sizeof(int));
- delete p; // warn: deleting memory allocated by alloca
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- int *p = new int;
- delete p;
- delete p; // warn: attempt to free released
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- int i;
- delete &amp;i; // warn: delete address of local
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- int *p = new int[1];
- delete[] (++p);
- // warn: argument to 'delete[]' is offset by 4 bytes
- // from the start of memory allocated by 'new[]'
-}
-</pre></div></div></td></tr>
-
-<tr><td><a id="cplusplus.NewDeleteLeaks"><div class="namedescr expandable"><span class="name">
-cplusplus.NewDeleteLeaks</span><span class="lang">
-(C++)</span><div class="descr">
-Check for memory leaks. Traces memory managed by <code>new</code>/<code>
-delete</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int *p = new int;
-} // warn
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-<!-- =========================== dead code =========================== -->
-<h3 id="deadcode_checkers">Dead Code Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="deadcode.DeadStores"><div class="namedescr expandable"><span class="name">
-deadcode.DeadStores</span><span class="lang">
-(C)</span><div class="descr">
-Check for values stored to variables that are never read afterwards.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int x;
- x = 1; // warn
-}
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-<!-- =========================== nullability =========================== -->
-<h3 id="nullability_checkers">Nullability Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="nullability.NullPassedToNonnull"><div class="namedescr expandable"><span class="name">
-nullability.NullPassedToNonnull</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warns when a null pointer is passed to a pointer which has a
-_Nonnull type.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-if (name != nil)
- return;
-// Warning: nil passed to a callee that requires a non-null 1st parameter
-NSString *greeting = [@"Hello " stringByAppendingString:name];
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="nullability.NullReturnedFromNonnull"><div class="namedescr expandable"><span class="name">
-nullability.NullReturnedFromNonnull</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warns when a null pointer is returned from a function that has
-_Nonnull return type.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-- (nonnull id)firstChild {
- id result = nil;
- if ([_children count] > 0)
- result = _children[0];
-
- // Warning: nil returned from a method that is expected
- // to return a non-null value
- return result;
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="nullability.NullableDereferenced"><div class="namedescr expandable"><span class="name">
-nullability.NullableDereferenced</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warns when a nullable pointer is dereferenced.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-struct LinkedList {
- int data;
- struct LinkedList *next;
-};
-
-struct LinkedList * _Nullable getNext(struct LinkedList *l);
-
-void updateNextData(struct LinkedList *list, int newData) {
- struct LinkedList *next = getNext(list);
- // Warning: Nullable pointer is dereferenced
- next->data = 7;
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="nullability.NullablePassedToNonnull"><div class="namedescr expandable"><span class="name">
-nullability.NullablePassedToNonnull</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warns when a nullable pointer is passed to a pointer which has a _Nonnull type.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-typedef struct Dummy { int val; } Dummy;
-Dummy *_Nullable returnsNullable();
-void takesNonnull(Dummy *_Nonnull);
-
-void test() {
- Dummy *p = returnsNullable();
- takesNonnull(p); // warn
-}
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-<!-- =========================== optin =========================== -->
-<h3 id="optin_checkers">Optin Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tr><td><a id="cplusplus.UninitializedObject"><div class="namedescr expandable"><span class="name">
-cplusplus.UninitializedObject</span><span class="lang">
-(C++)</span><div class="descr">
-This checker reports uninitialized fields in objects created after a constructor
-call. It doesn't only find direct uninitialized fields, but rather makes a deep
-inspection of the object, analyzing all of its fields' subfields. <br>
-The checker regards inherited fields as direct fields, so one will receive
-warnings for uninitialized inherited data members as well (see the last example
-below). <br>
-<br>
-It has several options:
-<ul>
- <li>
- "<code>Pedantic</code>" (boolean). If its not set or is set to false, the
- checker won't emit warnings for objects that don't have at least one
- initialized field. This may be set with <br>
- <code>-analyzer-config cplusplus.UninitializedObject:Pedantic=true</code>.
- </li>
- <li>
- "<code>NotesAsWarnings</code>" (boolean). If set to true, the checker will
- emit a warning for each uninitialized field, as opposed to emitting one
- warning per constructor call and listing the uninitialized fields that
- belong to it in notes. Defaults to false. <br>
- <code>-analyzer-config cplusplus.UninitializedObject:NotesAsWarnings=true</code>.
- </li>
- <li>
- "<code>CheckPointeeInitialization</code>" (boolean). If set to false, the
- checker will not analyze the pointee of pointer/reference fields, and will
- only check whether the object itself is initialized. Defaults to false. <br>
- <code>-analyzer-config cplusplus.UninitializedObject:CheckPointeeInitialization=true</code>.
- </li>
- <li>
- "<code>IgnoreRecordsWithField</code>" (string). If supplied, the checker
- will not analyze structures that have a field with a name or type name that
- matches the given pattern. Defaults to <code>""</code>.
-
- <code>-analyzer-config cplusplus.UninitializedObject:IgnoreRecordsWithField="[Tt]ag|[Kk]ind"</code>.
- </li>
-</ul></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-// With Pedantic and CheckPointeeInitialization set to true
-
-struct A {
- struct B {
- int x; // note: uninitialized field 'this->b.x'
- // note: uninitialized field 'this->bptr->x'
- int y; // note: uninitialized field 'this->b.y'
- // note: uninitialized field 'this->bptr->y'
- };
- int *iptr; // note: uninitialized pointer 'this->iptr'
- B b;
- B *bptr;
- char *cptr; // note: uninitialized pointee 'this->cptr'
-
- A (B *bptr, char *cptr) : bptr(bptr), cptr(cptr) {}
-};
-
-void f() {
- A::B b;
- char c;
- A a(&b, &c); // warning: 6 uninitialized fields
- // after the constructor call
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// With Pedantic set to false and
-// CheckPointeeInitialization set to true
-// (every field is uninitialized)
-
-struct A {
- struct B {
- int x;
- int y;
- };
- int *iptr;
- B b;
- B *bptr;
- char *cptr;
-
- A (B *bptr, char *cptr) : bptr(bptr), cptr(cptr) {}
-};
-
-void f() {
- A::B b;
- char c;
- A a(&b, &c); // no warning
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// With Pedantic and CheckPointeeInitialization set to false
-// (pointees are regarded as initialized)
-
-struct A {
- struct B {
- int x; // note: uninitialized field 'this->b.x'
- int y; // note: uninitialized field 'this->b.y'
- };
- int *iptr; // note: uninitialized pointer 'this->iptr'
- B b;
- B *bptr;
- char *cptr;
-
- A (B *bptr, char *cptr) : bptr(bptr), cptr(cptr) {}
-};
-
-void f() {
- A::B b;
- char c;
- A a(&b, &c); // warning: 3 uninitialized fields
- // after the constructor call
-}
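-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-// A hedged, illustrative sketch (not taken from the original docs):
-// inherited data members are treated like direct fields, so an
-// uninitialized base-class member is reported as well.
-
-struct Base {
-  int x; // note (assumed): uninitialized field
-};
-
-struct Derived : public Base {
-  int y;
-  Derived() : y(0) {}
-};
-
-void f() {
-  Derived d; // warning (assumed): 1 uninitialized field
-             //                    after the constructor call
-}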
-</pre></div></div></td></tr>
-
-
-<tbody>
-<tr><td><a id="optin.cplusplus.VirtualCall"><div class="namedescr expandable"><span class="name">
-optin.cplusplus.VirtualCall</span><span class="lang">
-(C++)</span><div class="descr">
-Check virtual member function calls during construction or
-destruction.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-class A {
-public:
- A() {
- f(); // warn
- }
- virtual void f();
-};
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-class A {
-public:
- ~A() {
- this-&gt;f(); // warn
- }
- virtual void f();
-};
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="optin.mpi.MPI-Checker"><div class="namedescr expandable"><span class="name">
-optin.mpi.MPI-Checker</span><span class="lang">
-(C)</span><div class="descr">
-Checks MPI code</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- double buf = 0;
- MPI_Request sendReq1;
- MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM,
- 0, MPI_COMM_WORLD, &sendReq1);
-} // warn: request 'sendReq1' has no matching wait.
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test() {
- double buf = 0;
- MPI_Request sendReq;
- MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq);
- MPI_Irecv(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // warn
- MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // warn
- MPI_Wait(&sendReq, MPI_STATUS_IGNORE);
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void missingNonBlocking() {
- int rank = 0;
- MPI_Comm_rank(MPI_COMM_WORLD, &rank);
- MPI_Request sendReq1[10][10][10];
- MPI_Wait(&sendReq1[1][7][9], MPI_STATUS_IGNORE); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="optin.osx.cocoa.localizability.EmptyLocalizationContextChecker"><div class="namedescr expandable"><span class="name">
-optin.osx.cocoa.localizability.EmptyLocalizationContextChecker</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check that NSLocalizedString macros include a comment for context.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-- (void)test {
- NSString *string = NSLocalizedString(@"LocalizedString", nil); // warn
- NSString *string2 = NSLocalizedString(@"LocalizedString", @" "); // warn
- NSString *string3 = NSLocalizedStringWithDefaultValue(
- @"LocalizedString", nil, [[NSBundle alloc] init], nil,@""); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="optin.osx.cocoa.localizability.NonLocalizedStringChecker"><div class="namedescr expandable"><span class="name">
-optin.osx.cocoa.localizability.NonLocalizedStringChecker</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warns about uses of non-localized NSStrings passed to UI methods
-expecting localized NSStrings</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-NSString *alarmText =
- NSLocalizedString(@"Enabled", @"Indicates alarm is turned on");
-if (!isEnabled) {
- alarmText = @"Disabled";
-}
-UILabel *alarmStateLabel = [[UILabel alloc] init];
-
-// Warning: User-facing text should use localized string macro
-[alarmStateLabel setText:alarmText];
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-<!-- =========================== OS X =========================== -->
-<h3 id="osx_checkers">OS X Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="osx.API"><div class="namedescr expandable"><span class="name">
-osx.API</span><span class="lang">
-(C)</span><div class="descr">
-Check for proper uses of various Apple APIs:<div class=functions>
-dispatch_once</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- dispatch_once_t pred = 0;
- dispatch_once(&amp;pred, ^(){}); // warn: dispatch_once uses local
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.NumberObjectConversion"><div class="namedescr expandable"><span class="name">
-osx.NumberObjectConversion</span><span class="lang">
-(C, C++, ObjC)</span><div class="descr">
-Check for erroneous conversions of objects representing numbers
-into numbers</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-NSNumber *photoCount = [albumDescriptor objectForKey:@"PhotoCount"];
-// Warning: Comparing a pointer value of type 'NSNumber *'
-// to a scalar integer value
-if (photoCount > 0) {
- [self displayPhotos];
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.SecKeychainAPI"><div class="namedescr expandable"><span class="name">
-osx.SecKeychainAPI</span><span class="lang">
-(C)</span><div class="descr">
-Check for improper uses of the Security framework's Keychain APIs:<div class=functions>
-SecKeychainItemCopyContent<br>
-SecKeychainFindGenericPassword<br>
-SecKeychainFindInternetPassword<br>
-SecKeychainItemFreeContent<br>
-SecKeychainItemCopyAttributesAndData<br>
-SecKeychainItemFreeAttributesAndData</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- unsigned int *ptr = 0;
- UInt32 length;
-
- SecKeychainItemFreeContent(ptr, &amp;length);
- // warn: trying to free data which has not been allocated
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- unsigned int *ptr = 0;
- UInt32 *length = 0;
- void *outData;
-
- OSStatus st =
- SecKeychainItemCopyContent(2, ptr, ptr, length, outData);
- // warn: data is not released
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- unsigned int *ptr = 0;
- UInt32 *length = 0;
- void *outData;
-
- OSStatus st =
- SecKeychainItemCopyContent(2, ptr, ptr, length, &amp;outData);
-
- SecKeychainItemFreeContent(ptr, outData);
- // warn: only call free if a non-NULL buffer was returned
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- unsigned int *ptr = 0;
- UInt32 *length = 0;
- void *outData;
-
- OSStatus st =
- SecKeychainItemCopyContent(2, ptr, ptr, length, &amp;outData);
-
- st = SecKeychainItemCopyContent(2, ptr, ptr, length, &amp;outData);
- // warn: release data before another call to the allocator
-
- if (st == noErr)
- SecKeychainItemFreeContent(ptr, outData);
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- SecKeychainItemRef itemRef = 0;
- SecKeychainAttributeInfo *info = 0;
- SecItemClass *itemClass = 0;
- SecKeychainAttributeList *attrList = 0;
- UInt32 *length = 0;
- void *outData = 0;
-
- OSStatus st =
- SecKeychainItemCopyAttributesAndData(itemRef, info,
- itemClass, &amp;attrList,
- length, &amp;outData);
-
- SecKeychainItemFreeContent(attrList, outData);
- // warn: deallocator doesn't match the allocator
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.AtSync"><div class="namedescr expandable"><span class="name">
-osx.cocoa.AtSync</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check for nil pointers used as mutexes for <code>@synchronized</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(id x) {
- if (!x)
- @synchronized(x) {} // warn: nil value used as mutex
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- id y;
- @synchronized(y) {} // warn: uninitialized value used as mutex
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.ClassRelease"><div class="namedescr expandable"><span class="name">
-osx.cocoa.ClassRelease</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check for sending <code>retain</code>, <code>release</code>, or <code>
-autorelease</code> directly to a class.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@interface MyClass : NSObject
-@end
-
-void test(void) {
- [MyClass release]; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.Dealloc"><div class="namedescr expandable"><span class="name">
-osx.cocoa.Dealloc</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warn about Objective-C classes that lack a correct implementation
-of <code>-dealloc</code>.
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@interface MyObject : NSObject {
- id _myproperty;
-}
-@end
-
-@implementation MyObject // warn: lacks 'dealloc'
-@end
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-@interface MyObject : NSObject {}
-@property(assign) id myproperty;
-@end
-
-@implementation MyObject // warn: does not send 'dealloc' to super
-- (void)dealloc {
- self.myproperty = 0;
-}
-@end
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-@interface MyObject : NSObject {
- id _myproperty;
-}
-@property(retain) id myproperty;
-@end
-
-@implementation MyObject
-@synthesize myproperty = _myproperty;
- // warn: var was retained but wasn't released
-- (void)dealloc {
- [super dealloc];
-}
-@end
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-@interface MyObject : NSObject {
- id _myproperty;
-}
-@property(assign) id myproperty;
-@end
-
-@implementation MyObject
-@synthesize myproperty = _myproperty;
- // warn: var wasn't retained but was released
-- (void)dealloc {
- [_myproperty release];
- [super dealloc];
-}
-@end
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.IncompatibleMethodTypes"><div class="namedescr expandable"><span class="name">
-osx.cocoa.IncompatibleMethodTypes</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check for an incompatible type signature when overriding an Objective-C method.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@interface MyClass1 : NSObject
-- (int)foo;
-@end
-
-@implementation MyClass1
-- (int)foo { return 1; }
-@end
-
-@interface MyClass2 : MyClass1
-- (float)foo;
-@end
-
-@implementation MyClass2
-- (float)foo { return 1.0; } // warn
-@end
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.MissingSuperCall"><div class="namedescr expandable"><span class="name">
-osx.cocoa.MissingSuperCall</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warn about Objective-C methods that lack a necessary call to super. (Note: The
-compiler now has a warning for methods annotated with <code>objc_requires_super</code>
-attribute. The checker exists to check methods in the Cocoa frameworks
-that haven't yet adopted this attribute.)</div></div></a></td>
-<td><div class="example"><pre>
-@interface Test : UIViewController
-@end
-@implementation Test
-- (void)viewDidLoad {} // warn
-@end
-</pre></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.NSAutoreleasePool"><div class="namedescr expandable"><span class="name">
-osx.cocoa.NSAutoreleasePool</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warn for suboptimal uses of NSAutoreleasePool in Objective-C
-GC mode (<code>-fobjc-gc</code> compiler option).</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
- [pool release]; // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.NSError"><div class="namedescr expandable"><span class="name">
-osx.cocoa.NSError</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check usage of <code>NSError**</code> parameters.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@interface A : NSObject
-- (void)foo:(NSError **)error;
-@end
-
-@implementation A
-- (void)foo:(NSError **)error {
- // warn: method accepting NSError** should have a non-void
- // return value
-}
-@end
-</pre></div>
-<div class="example"><pre>
-@interface A : NSObject
-- (BOOL)foo:(NSError **)error;
-@end
-
-@implementation A
-- (BOOL)foo:(NSError **)error {
- *error = 0; // warn: potential null dereference
- return 0;
-}
-@end
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.NilArg"><div class="namedescr expandable"><span class="name">
-osx.cocoa.NilArg</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check for prohibited nil arguments in specific Objective-C method calls:<div class=functions>
-- caseInsensitiveCompare:<br>
-- compare:<br>
-- compare:options:<br>
-- compare:options:range:<br>
-- compare:options:range:locale:<br>
-- componentsSeparatedByCharactersInSet:<br>
-- initWithFormat:</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-NSComparisonResult test(NSString *s) {
- NSString *aString = nil;
- return [s caseInsensitiveCompare:aString];
- // warn: argument to 'NSString' method
- // 'caseInsensitiveCompare:' cannot be nil
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.ObjCGenerics"><div class="namedescr expandable"><span class="name">
-osx.cocoa.ObjCGenerics</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check for type errors when using Objective-C generics</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-NSMutableArray<NSString *> *names = [NSMutableArray array];
-NSMutableArray *birthDates = names;
-
-// Warning: Conversion from value of type 'NSDate *'
-// to incompatible type 'NSString *'
-[birthDates addObject: [NSDate date]];
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.RetainCount"><div class="namedescr expandable"><span class="name">
-osx.cocoa.RetainCount</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check for leaks and violations of the Cocoa Memory Management rules.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- NSString *s = [[NSString alloc] init]; // warn
-}
-</pre></div>
-<div class="example"><pre>
-CFStringRef test(char *bytes) {
- return CFStringCreateWithCStringNoCopy(
- 0, bytes, NSNEXTSTEPStringEncoding, 0); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.SelfInit"><div class="namedescr expandable"><span class="name">
-osx.cocoa.SelfInit</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check that <code>self</code> is properly initialized inside an initializer
-method.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@interface MyObj : NSObject {
- id x;
-}
-- (id)init;
-@end
-
-@implementation MyObj
-- (id)init {
- [super init];
- x = 0; // warn: instance variable used while 'self' is not
- // initialized
- return 0;
-}
-@end
-</pre></div>
-<div class="example"><pre>
-@interface MyObj : NSObject
-- (id)init;
-@end
-
-@implementation MyObj
-- (id)init {
- [super init];
- return self; // warn: returning uninitialized 'self'
-}
-@end
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.SuperDealloc"><div class="namedescr expandable"><span class="name">
-osx.cocoa.SuperDealloc</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warn about improper use of '[super dealloc]' in Objective-C</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@interface SuperDeallocThenReleaseIvarClass : NSObject {
- NSObject *_ivar;
-}
-@end
-
-@implementation SuperDeallocThenReleaseIvarClass
-- (void)dealloc {
- [super dealloc];
- [_ivar release]; // warn
-}
-@end
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.UnusedIvars"><div class="namedescr expandable"><span class="name">
-osx.cocoa.UnusedIvars</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warn about private ivars that are never used.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-@interface MyObj : NSObject {
-@private
- id x; // warn
-}
-@end
-
-@implementation MyObj
-@end
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.cocoa.VariadicMethodTypes"><div class="namedescr expandable"><span class="name">
-osx.cocoa.VariadicMethodTypes</span><span class="lang">
-(ObjC)</span><div class="descr">
-Check for passing non-Objective-C types to variadic collection initialization
-methods that expect only Objective-C types.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- [NSSet setWithObjects:@"Foo", "Bar", nil];
- // warn: argument should be an ObjC pointer type, not 'char *'
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.coreFoundation.CFError"><div class="namedescr expandable"><span class="name">
-osx.coreFoundation.CFError</span><span class="lang">
-(C)</span><div class="descr">
-Check usage of <code>CFErrorRef*</code> parameters.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(CFErrorRef *error) {
- // warn: function accepting CFErrorRef* should have a
- // non-void return
-}
-</pre></div>
-<div class="example"><pre>
-int foo(CFErrorRef *error) {
- *error = 0; // warn: potential null dereference
- return 0;
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.coreFoundation.CFNumber"><div class="namedescr expandable"><span class="name">
-osx.coreFoundation.CFNumber</span><span class="lang">
-(C)</span><div class="descr">
-Check for improper uses of <code>CFNumberCreate</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-CFNumberRef test(unsigned char x) {
- return CFNumberCreate(0, kCFNumberSInt16Type, &amp;x);
- // warn: 8 bit integer is used to initialize a 16 bit integer
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.coreFoundation.CFRetainRelease"><div class="namedescr expandable"><span class="name">
-osx.coreFoundation.CFRetainRelease</span><span class="lang">
-(C)</span><div class="descr">
-Check for null arguments to <code>CFRetain</code>, <code>CFRelease</code>,
-<code>CFMakeCollectable</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(CFTypeRef p) {
- if (!p)
- CFRetain(p); // warn
-}
-</pre></div>
-<div class="example"><pre>
-void test(int x, CFTypeRef p) {
- if (p)
- return;
-
- CFRelease(p); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.coreFoundation.containers.OutOfBounds"><div class="namedescr expandable"><span class="name">
-osx.coreFoundation.containers.OutOfBounds</span><span class="lang">
-(C)</span><div class="descr">
-Checks for index out-of-bounds when using <code>CFArray</code> API.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- CFArrayRef A = CFArrayCreate(0, 0, 0, &amp;kCFTypeArrayCallBacks);
- CFArrayGetValueAtIndex(A, 0); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="osx.coreFoundation.containers.PointerSizedValues"><div class="namedescr expandable"><span class="name">
-osx.coreFoundation.containers.PointerSizedValues</span><span class="lang">
-(C)</span><div class="descr">
-Warns if <code>CFArray</code>, <code>CFDictionary</code>, <code>CFSet</code> are
-created with non-pointer-size values.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int x[] = { 1 };
- CFArrayRef A = CFArrayCreate(0, (const void **)x, 1,
- &amp;kCFTypeArrayCallBacks); // warn
-}
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-<!-- =========================== security =========================== -->
-<h3 id="security_checkers">Security Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="security.FloatLoopCounter"><div class="namedescr expandable"><span class="name">
-security.FloatLoopCounter</span><span class="lang">
-(C)</span><div class="descr">
-Warn on using a floating point value as a loop counter (CERT: FLP30-C,
-FLP30-CPP).</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- for (float x = 0.1f; x <= 1.0f; x += 0.1f) {} // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="security.insecureAPI.UncheckedReturn"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.UncheckedReturn</span><span class="lang">
-(C)</span><div class="descr">
-Warn on uses of functions whose return values must always be checked:<div class=functions>
-setuid<br>
-setgid<br>
-seteuid<br>
-setegid<br>
-setreuid<br>
-setregid</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- setuid(1); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="security.insecureAPI.bcmp"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.bcmp</span><span class="lang">
-(C)</span><div class="descr">
-Warn on uses of the <code>bcmp</code> function.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- bcmp(ptr0, ptr1, n); // warn
-}
-</pre></div></div></td></tr>
-
-<tr><td><a id="security.insecureAPI.bcopy"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.bcopy</span><span class="lang">
-(C)</span><div class="descr">
-Warn on uses of the <code>bcopy</code> function.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- bcopy(src, dst, n); // warn
-}
-</pre></div></div></td></tr>
-
-<tr><td><a id="security.insecureAPI.bzero"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.bzero</span><span class="lang">
-(C)</span><div class="descr">
-Warn on uses of the <code>bzero</code> function.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- bzero(ptr, n); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="security.insecureAPI.getpw"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.getpw</span><span class="lang">
-(C)</span><div class="descr">
-Warn on uses of the <code>getpw</code> function.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- char buff[1024];
- getpw(2, buff); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="security.insecureAPI.gets"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.gets</span><span class="lang">
-(C)</span><div class="descr">
-Warn on uses of the <code>gets</code> function.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- char buff[1024];
- gets(buff); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="security.insecureAPI.mkstemp"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.mkstemp</span><span class="lang">
-(C)</span><div class="descr">
-Warn when <code>mktemp</code>, <code>mkstemp</code>, <code>mkstemps</code> or
-<code>mkdtemp</code> is passed fewer than 6
-X's in the format string.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- mkstemp("XX"); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="security.insecureAPI.mktemp"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.mktemp</span><span class="lang">
-(C)</span><div class="descr">
-Warn on uses of the <code>mktemp</code> function.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- char *x = mktemp("/tmp/zxcv"); // warn: insecure, use mkstemp
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="security.insecureAPI.rand"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.rand</span><span class="lang">
-(C)</span><div class="descr">
-Warn on uses of inferior random number generating functions (only if <code>arc4random</code>
-function is available):<div class=functions>
-drand48<br>
-erand48<br>
-jrand48<br>
-lcong48<br>
-lrand48<br>
-mrand48<br>
-nrand48<br>
-random<br>
-rand_r</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- random(); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="security.insecureAPI.strcpy"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.strcpy</span><span class="lang">
-(C)</span><div class="descr">
-Warn on uses of the <code>strcpy</code> and <code>strcat</code> functions.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- char x[4];
- char *y = "abcd";
-
- strcpy(x, y); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="security.insecureAPI.vfork"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.vfork</span><span class="lang">
-(C)</span><div class="descr">
-Warn on uses of the <code>vfork</code> function.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- vfork(); // warn
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="security.insecureAPI.decodeValueOfObjCType"><div class="namedescr expandable"><span class="name">
-security.insecureAPI.decodeValueOfObjCType</span><span class="lang">
-(ObjC)</span><div class="descr">
-Warn on uses of the <code>-[NSCoder decodeValueOfObjCType:at:]</code> method.
-The safe alternative is <code>-[NSCoder decodeValueOfObjCType:at:size:]</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test(NSCoder *decoder) {
- // This would be a vulnerability on 64-bit platforms
- // but not on 32-bit platforms.
- NSUInteger x;
- [decoder decodeValueOfObjCType:"I" at:&x]; // warn
-}
-</pre></div></div></td></tr>
-
-</tbody></table>
-
-<!-- =========================== unix =========================== -->
-<h3 id="unix_checkers">Unix Checkers</h3>
-<table class="checkers">
-<colgroup><col class="namedescr"><col class="example"></colgroup>
-<thead><tr><td>Name, Description</td><td>Example</td></tr></thead>
-
-<tbody>
-<tr><td><a id="unix.API"><div class="namedescr expandable"><span class="name">
-unix.API</span><span class="lang">
-(C)</span><div class="descr">
-Check calls to various UNIX/POSIX functions:<div class=functions>
-open<br>
-pthread_once<br>
-calloc<br>
-malloc<br>
-realloc<br>
-alloca</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-// Currently the check is performed for apple targets only.
-void test(const char *path) {
- int fd = open(path, O_CREAT);
- // warn: call to 'open' requires a third argument when the
- // 'O_CREAT' flag is set
-}
-</pre></div>
-<div class="example"><pre>
-void f();
-
-void test() {
- pthread_once_t pred = {0x30B1BCBA, {0}};
- pthread_once(&amp;pred, f);
- // warn: call to 'pthread_once' uses the local variable
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- void *p = malloc(0); // warn: allocation size of 0 bytes
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- void *p = calloc(0, 42); // warn: allocation size of 0 bytes
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- void *p = malloc(1);
- p = realloc(p, 0); // warn: allocation size of 0 bytes
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- void *p = alloca(0); // warn: allocation size of 0 bytes
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- void *p = valloc(0); // warn: allocation size of 0 bytes
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="unix.Malloc"><div class="namedescr expandable"><span class="name">
-unix.Malloc</span><span class="lang">
-(C)</span><div class="descr">
-Check for memory leaks, double free, and use-after-free and offset problems
-involving <code>malloc</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- int *p = malloc(1);
- free(p);
- free(p); // warn: attempt to free released memory
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- int *p = malloc(sizeof(int));
- free(p);
- *p = 1; // warn: use after free
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- int *p = malloc(1);
- if (p)
- return; // warn: memory is never released
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- int a[] = { 1 };
- free(a); // warn: argument is not allocated by malloc
-}
-</pre></div>
-<div class="example"><pre>
-void test() {
- int *p = malloc(sizeof(char));
- p = p - 1;
- free(p); // warn: argument to free() is offset by -4 bytes
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="unix.MallocSizeof"><div class="namedescr expandable"><span class="name">
-unix.MallocSizeof</span><span class="lang">
-(C)</span><div class="descr">
-Check for dubious <code>malloc</code>, <code>calloc</code> or
-<code>realloc</code> arguments involving <code>sizeof</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- long *p = malloc(sizeof(short));
- // warn: result is converted to 'long *', which is
- // incompatible with operand type 'short'
- free(p);
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="unix.MismatchedDeallocator"><div class="namedescr expandable"><span class="name">
-unix.MismatchedDeallocator</span><span class="lang">
-(C, C++, ObjC)</span><div class="descr">
-Check for mismatched deallocators (e.g. passing a pointer allocated
-with <code>new</code> to <code>free()</code>).</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-// C, C++
-void test() {
- int *p = (int *)malloc(sizeof(int));
- delete p; // warn
-}
-</pre></div>
-<div class="example"><pre>
-// C, C++
-void __attribute((ownership_returns(malloc))) *user_malloc(size_t);
-
-void test() {
- int *p = (int *)user_malloc(sizeof(int));
- delete p; // warn
-}
-</pre></div>
-<div class="example"><pre>
-// C, C++
-void test() {
- int *p = new int;
- free(p); // warn
-}
-</pre></div>
-<div class="example"><pre>
-// C, C++
-void test() {
- int *p = new int[1];
- realloc(p, sizeof(long)); // warn
-}
-</pre></div>
-<div class="example"><pre>
-// C, C++
-template &lt;typename T&gt;
-struct SimpleSmartPointer {
- T *ptr;
-
- explicit SimpleSmartPointer(T *p = 0) : ptr(p) {}
- ~SimpleSmartPointer() {
- delete ptr; // warn
- }
-};
-
-void test() {
- SimpleSmartPointer&lt;int&gt; a((int *)malloc(4));
-}
-</pre></div>
-<div class="example"><pre>
-// C++
-void test() {
- int *p = (int *)operator new(0);
- delete[] p; // warn
-}
-</pre></div>
-<div class="example"><pre>
-// Objective-C, C++
-void test(NSUInteger dataLength) {
- int *p = new int;
- NSData *d = [NSData dataWithBytesNoCopy:p
- length:sizeof(int) freeWhenDone:1];
- // warn +dataWithBytesNoCopy:length:freeWhenDone: cannot take
- // ownership of memory allocated by 'new'
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="unix.Vfork"><div class="namedescr expandable"><span class="name">
-unix.Vfork</span><span class="lang">
-(C)</span><div class="descr">
-Check for proper usage of <code>vfork</code>.</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-int test(int x) {
- pid_t pid = vfork(); // warn
- if (pid != 0)
- return 0;
-
- switch (x) {
- case 0:
- pid = 1;
- execl("", "", 0);
- _exit(1);
- break;
- case 1:
- x = 0; // warn: this assignment is prohibited
- break;
- case 2:
- foo(); // warn: this function call is prohibited
- break;
- default:
- return 0; // warn: return is prohibited
- }
-
- while(1);
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="unix.cstring.BadSizeArg"><div class="namedescr expandable"><span class="name">
-unix.cstring.BadSizeArg</span><span class="lang">
-(C)</span><div class="descr">
-Check the size argument passed to <code>strncat</code> for common erroneous
-patterns. Use <code>-Wno-strncat-size</code> compiler option to mute other
-<code>strncat</code>-related compiler warnings.
-</div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
- char dest[3];
- strncat(dest, "***", sizeof(dest));
- // warn: potential buffer overflow
-}
-</pre></div></div></td></tr>
-
-
-<tr><td><a id="unix.cstring.NullArg"><div class="namedescr expandable"><span class="name">
-unix.cstring.NullArg</span><span class="lang">
-(C)</span><div class="descr">
-Check for null pointers being passed as arguments to C string functions:<div class=functions>
-strlen<br>
-strnlen<br>
-strcpy<br>
-strncpy<br>
-strcat<br>
-strncat<br>
-strcmp<br>
-strncmp<br>
-strcasecmp<br>
-strncasecmp</div></div></div></a></td>
-<td><div class="example"><pre>
-int test() {
- return strlen(0); // warn
-}
-</pre></div></td></tr>
-
-</tbody></table>
+<h1>The analyzer checker documentation has moved to clang.llvm.org</h1>
+<p style="color:red; font-size:200%">This page is deprecated and will be removed in release 21.0</p>
+<a href="https://clang.llvm.org/docs/analyzer/checkers.html">The new site</a>
+<script>window.location='https://clang.llvm.org/docs/analyzer/checkers.html'</script>
</div> <!-- page -->
</div> <!-- content -->
diff --git a/compiler-rt/include/profile/MemProfData.inc b/compiler-rt/include/profile/MemProfData.inc
index 3dc8847..3f785bd 100644
--- a/compiler-rt/include/profile/MemProfData.inc
+++ b/compiler-rt/include/profile/MemProfData.inc
@@ -39,6 +39,8 @@
#define MEMPROF_RAW_SUPPORTED_VERSIONS \
{ 3ULL, 4ULL }
+#define MEMPROF_V3_MIB_SIZE 132ULL
+
#define MEMPROF_BUILDID_MAX_SIZE 32ULL
namespace llvm {
diff --git a/compiler-rt/test/fuzzer/fuzzer-leak.test b/compiler-rt/test/fuzzer/fuzzer-leak.test
index dd22fde..43d896e 100644
--- a/compiler-rt/test/fuzzer/fuzzer-leak.test
+++ b/compiler-rt/test/fuzzer/fuzzer-leak.test
@@ -1,4 +1,6 @@
REQUIRES: lsan
+// See https://github.com/llvm/llvm-project/issues/97712.
+UNSUPPORTED: target={{.*}}
RUN: %cpp_compiler %S/LeakTest.cpp -o %t-LeakTest
RUN: %cpp_compiler %S/ThreadedLeakTest.cpp -o %t-ThreadedLeakTest
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index e2b55fc..7df3905 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -72,11 +72,8 @@ void DataSharingProcessor::processStep2(mlir::Operation *op, bool isLoop) {
firOpBuilder.setInsertionPointAfter(op);
insertDeallocs();
} else {
- // insert dummy instruction to mark the insertion position
- mlir::Value undefMarker = firOpBuilder.create<fir::UndefOp>(
- op->getLoc(), firOpBuilder.getIndexType());
+ mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
insertDeallocs();
- firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp());
}
}
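
The DataSharingProcessor hunk above replaces a dummy fir::UndefOp, kept only to remember an insertion position, with mlir::OpBuilder::InsertionGuard, an RAII object that restores the builder's insertion point when it leaves scope. A minimal sketch of that guard pattern, using a hypothetical Builder type rather than MLIR's real classes:

#include <cstddef>

// Hypothetical builder with a movable insertion cursor.
struct Builder {
  std::size_t insertion_point = 0;
  void setInsertionPoint(std::size_t p) { insertion_point = p; }
};

// RAII guard: remembers the cursor on construction and restores it on scope
// exit, so callees may move the insertion point without leaking the change.
class InsertionGuard {
public:
  explicit InsertionGuard(Builder &b) : builder(b), saved(b.insertion_point) {}
  ~InsertionGuard() { builder.setInsertionPoint(saved); }
  InsertionGuard(const InsertionGuard &) = delete;
  InsertionGuard &operator=(const InsertionGuard &) = delete;

private:
  Builder &builder;
  std::size_t saved;
};

void emitDeallocsAt(Builder &b, std::size_t pos) {
  InsertionGuard guard(b);   // no dummy marker op needed
  b.setInsertionPoint(pos);  // temporary move while emitting deallocations
}                            // guard restores the original insertion point here
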
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 9ad092a1..5b2cf793 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1136,7 +1136,7 @@ static void genSingleClauses(lower::AbstractConverter &converter,
static void genTargetClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx, const List<Clause> &clauses,
- mlir::Location loc, bool processHostOnlyClauses, bool processReduction,
+ mlir::Location loc, bool processHostOnlyClauses,
mlir::omp::TargetClauseOps &clauseOps,
llvm::SmallVectorImpl<const semantics::Symbol *> &mapSyms,
llvm::SmallVectorImpl<mlir::Location> &mapLocs,
@@ -1678,7 +1678,7 @@ static mlir::omp::TargetOp
genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
mlir::Location loc, const ConstructQueue &queue,
- ConstructQueue::iterator item, bool outerCombined = false) {
+ ConstructQueue::iterator item) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
lower::StatementContext stmtCtx;
@@ -1692,10 +1692,9 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
llvm::SmallVector<mlir::Location> mapLocs, devicePtrLocs, deviceAddrLocs;
llvm::SmallVector<mlir::Type> mapTypes, devicePtrTypes, deviceAddrTypes;
genTargetClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
- processHostOnlyClauses, /*processReduction=*/outerCombined,
- clauseOps, mapSyms, mapLocs, mapTypes, deviceAddrSyms,
- deviceAddrLocs, deviceAddrTypes, devicePtrSyms,
- devicePtrLocs, devicePtrTypes);
+ processHostOnlyClauses, clauseOps, mapSyms, mapLocs,
+ mapTypes, deviceAddrSyms, deviceAddrLocs, deviceAddrTypes,
+ devicePtrSyms, devicePtrLocs, devicePtrTypes);
llvm::SmallVector<const semantics::Symbol *> privateSyms;
DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
@@ -2101,8 +2100,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item);
break;
case llvm::omp::Directive::OMPD_target:
- genTargetOp(converter, symTable, semaCtx, eval, loc, queue, item,
- /*outerCombined=*/false);
+ genTargetOp(converter, symTable, semaCtx, eval, loc, queue, item);
break;
case llvm::omp::Directive::OMPD_target_data:
genTargetDataOp(converter, symTable, semaCtx, eval, loc, queue, item);
diff --git a/libc/src/__support/FPUtil/generic/FMod.h b/libc/src/__support/FPUtil/generic/FMod.h
index f840a92..0b9244e 100644
--- a/libc/src/__support/FPUtil/generic/FMod.h
+++ b/libc/src/__support/FPUtil/generic/FMod.h
@@ -160,7 +160,8 @@ template <typename T> struct FModDivisionInvMultHelper {
template <typename T, typename U = typename FPBits<T>::StorageType,
typename DivisionHelper = FModDivisionSimpleHelper<U>>
class FMod {
- static_assert(cpp::is_floating_point_v<T> && cpp::is_unsigned_v<U> &&
+ static_assert(cpp::is_floating_point_v<T> &&
+ is_unsigned_integral_or_big_int_v<U> &&
(sizeof(U) * CHAR_BIT > FPBits<T>::FRACTION_LEN),
"FMod instantiated with invalid type.");
diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp
index 083deb8..b6483bb 100644
--- a/libc/src/__support/OSUtil/linux/fcntl.cpp
+++ b/libc/src/__support/OSUtil/linux/fcntl.cpp
@@ -22,6 +22,14 @@
namespace LIBC_NAMESPACE::internal {
int fcntl(int fd, int cmd, void *arg) {
+#ifdef SYS_fcntl
+ constexpr auto FCNTL_SYSCALL_ID = SYS_fcntl;
+#elif defined(SYS_fcntl64)
+ constexpr auto FCNTL_SYSCALL_ID = SYS_fcntl64;
+#else
+#error "fcntl and fcntl64 syscalls not available."
+#endif
+
switch (cmd) {
case F_OFD_SETLKW: {
struct flock *flk = reinterpret_cast<struct flock *>(arg);
@@ -33,7 +41,7 @@ int fcntl(int fd, int cmd, void *arg) {
flk64.l_len = flk->l_len;
flk64.l_pid = flk->l_pid;
// create a syscall
- return LIBC_NAMESPACE::syscall_impl<int>(SYS_fcntl, fd, cmd, &flk64);
+ return LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
}
case F_OFD_GETLK:
case F_OFD_SETLK: {
@@ -46,7 +54,8 @@ int fcntl(int fd, int cmd, void *arg) {
flk64.l_len = flk->l_len;
flk64.l_pid = flk->l_pid;
// create a syscall
- int retVal = LIBC_NAMESPACE::syscall_impl<int>(SYS_fcntl, fd, cmd, &flk64);
+ int retVal =
+ LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
// On failure, return
if (retVal == -1)
return -1;
@@ -67,8 +76,8 @@ int fcntl(int fd, int cmd, void *arg) {
}
case F_GETOWN: {
struct f_owner_ex fex;
- int ret =
- LIBC_NAMESPACE::syscall_impl<int>(SYS_fcntl, fd, F_GETOWN_EX, &fex);
+ int ret = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd,
+ F_GETOWN_EX, &fex);
if (ret >= 0)
return fex.type == F_OWNER_PGRP ? -fex.pid : fex.pid;
libc_errno = -ret;
@@ -77,7 +86,7 @@ int fcntl(int fd, int cmd, void *arg) {
// The general case
default: {
int retVal = LIBC_NAMESPACE::syscall_impl<int>(
- SYS_fcntl, fd, cmd, reinterpret_cast<void *>(arg));
+ FCNTL_SYSCALL_ID, fd, cmd, reinterpret_cast<void *>(arg));
if (retVal >= 0) {
return retVal;
}
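
Every fcntl path above now funnels through a single FCNTL_SYSCALL_ID resolved at preprocessing time, because 32-bit Linux targets expose only the fcntl64 syscall. A standalone, Linux-only sketch of the same selection, using the plain syscall(2) wrapper rather than LIBC_NAMESPACE::syscall_impl:

#include <sys/syscall.h>  // defines SYS_fcntl and/or SYS_fcntl64 on Linux
#include <unistd.h>       // syscall()

#ifdef SYS_fcntl
static constexpr long FCNTL_SYSCALL_ID = SYS_fcntl;
#elif defined(SYS_fcntl64)
static constexpr long FCNTL_SYSCALL_ID = SYS_fcntl64;  // 32-bit targets
#else
#error "neither fcntl nor fcntl64 syscall is available"
#endif

long do_fcntl(int fd, int cmd, void *arg) {
  // Call sites use the one resolved ID instead of hard-coding SYS_fcntl.
  return syscall(FCNTL_SYSCALL_ID, fd, cmd, arg);
}
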
diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h
index 59a3912..82a5251 100644
--- a/libc/src/__support/big_int.h
+++ b/libc/src/__support/big_int.h
@@ -731,6 +731,10 @@ public:
return *result.div(other);
}
+ LIBC_INLINE constexpr BigInt operator%=(const BigInt &other) {
+ return *this->div(other);
+ }
+
LIBC_INLINE constexpr BigInt &operator*=(const BigInt &other) {
*this = *this * other;
return *this;
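
For context on the operator%= added above: compound assignment operators conventionally mutate *this and return a reference, as the neighbouring operator*= does. A toy sketch of that shape on a hypothetical wrapper type (not llvm-libc's BigInt, whose div() contract is not shown in this hunk):

#include <cstdint>

// Toy fixed-width wrapper used only to illustrate the operator shape.
struct Wide {
  std::uint64_t value = 0;

  // Compound modulus: update *this and return a reference so that chained
  // expressions behave like they do for the built-in integer types.
  constexpr Wide &operator%=(const Wide &other) {
    value %= other.value;
    return *this;
  }

  // Binary modulus implemented on top of the compound form.
  friend constexpr Wide operator%(Wide lhs, const Wide &rhs) {
    lhs %= rhs;
    return lhs;
  }
};

static_assert((Wide{7} %= Wide{3}).value == 1, "7 mod 3 is 1");
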
diff --git a/libc/src/sched/linux/sched_rr_get_interval.cpp b/libc/src/sched/linux/sched_rr_get_interval.cpp
index 9d7d0e9..5dcac203 100644
--- a/libc/src/sched/linux/sched_rr_get_interval.cpp
+++ b/libc/src/sched/linux/sched_rr_get_interval.cpp
@@ -36,7 +36,7 @@ LLVM_LIBC_FUNCTION(int, sched_rr_get_interval,
tid, &ts32);
if (ret == 0) {
tp->tv_sec = ts32.tv_sec;
- tp->tv_nsec = ts32.tv_nsec;
+ tp->tv_nsec = static_cast<long int>(ts32.tv_nsec);
}
} else
// When tp is a nullptr, we still do the syscall to set ret and errno
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 98f263b7..e1de08e 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -239,7 +239,7 @@ if(NOT LIBC_TARGET_OS_IS_GPU)
add_custom_command(TARGET libc_str_to_float_comparison_test
POST_BUILD
- COMMAND $<TARGET_FILE:libc_str_to_float_comparison_test> ${float_test_file}
+ COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:libc_str_to_float_comparison_test> ${float_test_file}
DEPENDS ${float_test_file}
COMMENT "Test the strtof and strtod implementations against precomputed results."
VERBATIM)
diff --git a/libc/test/src/__support/time/linux/timeout_test.cpp b/libc/test/src/__support/time/linux/timeout_test.cpp
index 886d438..37b3763 100644
--- a/libc/test/src/__support/time/linux/timeout_test.cpp
+++ b/libc/test/src/__support/time/linux/timeout_test.cpp
@@ -45,7 +45,7 @@ TEST(LlvmLibcSupportLinuxTimeoutTest, NoChangeIfClockIsMonotonic) {
ensure_monotonicity(*result);
ASSERT_FALSE(result->is_realtime());
ASSERT_EQ(result->get_timespec().tv_sec, static_cast<time_t>(10000));
- ASSERT_EQ(result->get_timespec().tv_nsec, static_cast<time_t>(0));
+ ASSERT_EQ(result->get_timespec().tv_nsec, static_cast<long int>(0));
}
TEST(LlvmLibcSupportLinuxTimeoutTest, ValidAfterConversion) {
timespec ts;
diff --git a/libc/test/src/math/smoke/CopySignTest.h b/libc/test/src/math/smoke/CopySignTest.h
index 1eb323a..7dc6f5a 100644
--- a/libc/test/src/math/smoke/CopySignTest.h
+++ b/libc/test/src/math/smoke/CopySignTest.h
@@ -42,9 +42,9 @@ public:
StorageType v = 0;
for (int i = 0; i <= COUNT; ++i, v += STEP) {
FPBits x_bits = FPBits(v);
- T x = T(v);
if (x_bits.is_nan() || x_bits.is_inf())
continue;
+ T x = x_bits.get_val();
T res1 = func(x, -x);
ASSERT_FP_EQ(res1, -x);
diff --git a/libc/test/src/sys/mman/linux/CMakeLists.txt b/libc/test/src/sys/mman/linux/CMakeLists.txt
index 0762a2d..b63c76f 100644
--- a/libc/test/src/sys/mman/linux/CMakeLists.txt
+++ b/libc/test/src/sys/mman/linux/CMakeLists.txt
@@ -138,6 +138,7 @@ add_libc_unittest(
libc.include.sys_mman
libc.include.sys_syscall
libc.src.errno.errno
+ libc.src.fcntl.fcntl
libc.src.sys.mman.shm_open
libc.src.sys.mman.shm_unlink
libc.src.sys.mman.mmap
diff --git a/libc/test/src/sys/mman/linux/shm_test.cpp b/libc/test/src/sys/mman/linux/shm_test.cpp
index 3b1a2aa..4b8971f 100644
--- a/libc/test/src/sys/mman/linux/shm_test.cpp
+++ b/libc/test/src/sys/mman/linux/shm_test.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "src/__support/OSUtil/syscall.h"
+#include "src/fcntl/fcntl.h"
#include "src/sys/mman/mmap.h"
#include "src/sys/mman/munmap.h"
#include "src/sys/mman/shm_open.h"
@@ -29,9 +30,7 @@ TEST(LlvmLibcShmTest, Basic) {
returns(GE(0)).with_errno(EQ(0)));
// check that FD_CLOEXEC is set by default.
- // TODO: use fcntl when implemented.
- // https://github.com/llvm/llvm-project/issues/84968
- long flag = LIBC_NAMESPACE::syscall_impl(SYS_fcntl, fd, F_GETFD);
+ long flag = LIBC_NAMESPACE::fcntl(fd, F_GETFD);
ASSERT_GE(static_cast<int>(flag), 0);
EXPECT_NE(static_cast<int>(flag) & FD_CLOEXEC, 0);
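
The test above now goes through the library's own fcntl to confirm that shm_open sets FD_CLOEXEC. The underlying POSIX pattern is simply F_GETFD plus a bit test; a small standalone sketch using the host fcntl rather than LIBC_NAMESPACE:

#include <fcntl.h>    // open, fcntl, F_GETFD, FD_CLOEXEC, O_RDONLY, O_CLOEXEC
#include <unistd.h>   // close
#include <cstdio>

static bool has_cloexec(int fd) {
  int flags = fcntl(fd, F_GETFD);  // descriptor flags, not file status flags
  return flags >= 0 && (flags & FD_CLOEXEC) != 0;
}

int main() {
  int fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
  if (fd >= 0) {
    std::printf("close-on-exec set: %s\n", has_cloexec(fd) ? "yes" : "no");
    close(fd);
  }
  return 0;
}
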
diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index e748ff6..f617ffd 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -269,7 +269,7 @@
"`3355 <https://wg21.link/LWG3355>`__","The memory algorithms should support move-only input iterators introduced by P1207","Prague","|Complete|","15.0","|ranges|"
"`3356 <https://wg21.link/LWG3356>`__","``__cpp_lib_nothrow_convertible``\ should be ``__cpp_lib_is_nothrow_convertible``\ ","Prague","|Complete|","12.0"
"`3358 <https://wg21.link/LWG3358>`__","|sect|\ [span.cons] is mistaken that ``to_address``\ can throw","Prague","|Complete|","17.0"
-"`3359 <https://wg21.link/LWG3359>`__","``<chrono>``\ leap second support should allow for negative leap seconds","Prague","|Complete|","19.0","|chrono|"
+"`3359 <https://wg21.link/LWG3359>`__","``<chrono>``\ leap second support should allow for negative leap seconds","Prague","|In Progress|","","|chrono|"
"`3360 <https://wg21.link/LWG3360>`__","``three_way_comparable_with``\ is inconsistent with similar concepts","Prague","|Nothing To Do|","","|spaceship|"
"`3362 <https://wg21.link/LWG3362>`__","Strike ``stop_source``\ 's ``operator!=``\ ","Prague","",""
"`3363 <https://wg21.link/LWG3363>`__","``drop_while_view``\ should opt-out of ``sized_range``\ ","Prague","|Nothing To Do|","","|ranges|"
diff --git a/libcxx/src/tzdb.cpp b/libcxx/src/tzdb.cpp
index 5951e59..2d07796 100644
--- a/libcxx/src/tzdb.cpp
+++ b/libcxx/src/tzdb.cpp
@@ -626,29 +626,49 @@ static void __parse_leap_seconds(vector<leap_second>& __leap_seconds, istream&&
// seconds since 1 January 1970.
constexpr auto __offset = sys_days{1970y / January / 1} - sys_days{1900y / January / 1};
- while (true) {
- switch (__input.peek()) {
- case istream::traits_type::eof():
- return;
-
- case ' ':
- case '\t':
- case '\n':
- __input.get();
- continue;
+ struct __entry {
+ sys_seconds __timestamp;
+ seconds __value;
+ };
+ vector<__entry> __entries;
+ [&] {
+ while (true) {
+ switch (__input.peek()) {
+ case istream::traits_type::eof():
+ return;
+
+ case ' ':
+ case '\t':
+ case '\n':
+ __input.get();
+ continue;
+
+ case '#':
+ chrono::__skip_line(__input);
+ continue;
+ }
- case '#':
+ sys_seconds __date = sys_seconds{seconds{chrono::__parse_integral(__input, false)}} - __offset;
+ chrono::__skip_mandatory_whitespace(__input);
+ seconds __value{chrono::__parse_integral(__input, false)};
chrono::__skip_line(__input);
- continue;
- }
- sys_seconds __date = sys_seconds{seconds{chrono::__parse_integral(__input, false)}} - __offset;
- chrono::__skip_mandatory_whitespace(__input);
- seconds __value{chrono::__parse_integral(__input, false)};
- chrono::__skip_line(__input);
-
- __leap_seconds.emplace_back(std::__private_constructor_tag{}, __date, __value);
- }
+ __entries.emplace_back(__date, __value);
+ }
+ }();
+ // The Standard requires the leap seconds to be sorted. The file
+ // leap-seconds.list usually provides them in sorted order, but that is not
+ // guaranteed so we ensure it here.
+ ranges::sort(__entries, {}, &__entry::__timestamp);
+
+ // The database should contain the number of seconds inserted by a leap
+ // second (1 or -1). So the difference between the two elements is stored.
+ // std::ranges::views::adjacent has not been implemented yet.
+ (void)ranges::adjacent_find(__entries, [&](const __entry& __first, const __entry& __second) {
+ __leap_seconds.emplace_back(
+ std::__private_constructor_tag{}, __second.__timestamp, __second.__value - __first.__value);
+ return false;
+ });
}
void __init_tzdb(tzdb& __tzdb, __tz::__rules_storage_type& __rules) {
@@ -667,10 +687,6 @@ void __init_tzdb(tzdb& __tzdb, __tz::__rules_storage_type& __rules) {
// The latter is much easier to parse, it seems Howard shares that
// opinion.
chrono::__parse_leap_seconds(__tzdb.leap_seconds, ifstream{__root / "leap-seconds.list"});
- // The Standard requires the leap seconds to be sorted. The file
- // leap-seconds.list usually provides them in sorted order, but that is not
- // guaranteed so we ensure it here.
- std::ranges::sort(__tzdb.leap_seconds);
}
#ifdef _WIN32
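
The rewritten __parse_leap_seconds collects the cumulative offsets from leap-seconds.list, sorts them, and then records only the difference between neighbouring entries, walking adjacent pairs with adjacent_find and an always-false predicate as a stand-in for the not-yet-implemented views::adjacent. A self-contained sketch of that pairwise-difference trick on plain data (the values are illustrative, not the real IANA list):

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // (timestamp, cumulative UTC-TAI offset) as found in leap-seconds.list.
  std::vector<std::pair<long, int>> entries = {
      {63072000, 10}, {78796800, 11}, {94694400, 12}};
  std::sort(entries.begin(), entries.end());  // file order is not guaranteed

  // Visit every adjacent pair; the predicate records the delta and returns
  // false so the scan never stops early. Each delta is the inserted leap
  // second (+1, or -1 for a negative leap second).
  std::vector<std::pair<long, int>> leap_seconds;
  (void)std::adjacent_find(entries.begin(), entries.end(),
                           [&](const auto &first, const auto &second) {
                             leap_seconds.emplace_back(
                                 second.first, second.second - first.second);
                             return false;
                           });

  for (const auto &[when, value] : leap_seconds)
    std::printf("%ld: %+d\n", when, value);
}
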
diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
index 6403658..e6adda3 100644
--- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
+++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
@@ -158,6 +158,11 @@ void framework_self_test() {
template <typename T> class UncompressibleAllocator : public std::allocator<T> {
public:
char X;
+
+ template <class U>
+ struct rebind {
+ using other = UncompressibleAllocator<U>;
+ };
};
void string_test() {
diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/leap_seconds.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/leap_seconds.pass.cpp
index 25a0f00..d7ae219 100644
--- a/libcxx/test/libcxx/time/time.zone/time.zone.db/leap_seconds.pass.cpp
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/leap_seconds.pass.cpp
@@ -83,32 +83,39 @@ static void test_leap_seconds() {
2303683200 12 # 1 Jan 1973
2287785600 11 # 1 Jul 1972
2272060800 10 # 1 Jan 1972
-86400 1 # 2 Jan 1900 Dummy entry to test before 1970
+86400 9 # 2 Jan 1900 Dummy entry to test before 1970
+1 8 # 2 Jan 1900 Dummy entry to test before 1970
+
+# Fictional negative leap second
+2303769600 11 # 2 Jan 1973
# largest accepted value by the parser
-5764607523034234879 2
+5764607523034234879 12
)");
- assert(result.leap_seconds.size() == 5);
+ assert(result.leap_seconds.size() == 6);
assert(result.leap_seconds[0].date() == sys_seconds{sys_days{1900y / January / 2}});
assert(result.leap_seconds[0].value() == 1s);
assert(result.leap_seconds[1].date() == sys_seconds{sys_days{1972y / January / 1}});
- assert(result.leap_seconds[1].value() == 10s);
+ assert(result.leap_seconds[1].value() == 1s);
assert(result.leap_seconds[2].date() == sys_seconds{sys_days{1972y / July / 1}});
- assert(result.leap_seconds[2].value() == 11s);
+ assert(result.leap_seconds[2].value() == 1s);
assert(result.leap_seconds[3].date() == sys_seconds{sys_days{1973y / January / 1}});
- assert(result.leap_seconds[3].value() == 12s);
+ assert(result.leap_seconds[3].value() == 1s);
+
+ assert(result.leap_seconds[4].date() == sys_seconds{sys_days{1973y / January / 2}});
+ assert(result.leap_seconds[4].value() == -1s);
- assert(result.leap_seconds[4].date() ==
+ assert(result.leap_seconds[5].date() ==
sys_seconds{5764607523034234879s
// The database uses 1900-01-01 as epoch.
- std::chrono::duration_cast<std::chrono::seconds>(
sys_days{1970y / January / 1} - sys_days{1900y / January / 1})});
- assert(result.leap_seconds[4].value() == 2s);
+ assert(result.leap_seconds[5].value() == 1s);
}
int main(int, const char**) {
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/leap_seconds.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/leap_seconds.pass.cpp
index f873ad3..39023c3 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/leap_seconds.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/leap_seconds.pass.cpp
@@ -33,34 +33,33 @@ using namespace std::literals::chrono_literals;
// At the moment of writing that list is the actual list in the IANA database.
// If in the future more leap seconds can be added.
static const std::array leap_seconds = {
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1972y / std::chrono::January / 1}}, 10s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1972y / std::chrono::July / 1}}, 11s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1973y / std::chrono::January / 1}}, 12s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1974y / std::chrono::January / 1}}, 13s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1975y / std::chrono::January / 1}}, 14s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1976y / std::chrono::January / 1}}, 15s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1977y / std::chrono::January / 1}}, 16s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1978y / std::chrono::January / 1}}, 17s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1979y / std::chrono::January / 1}}, 18s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1980y / std::chrono::January / 1}}, 19s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1981y / std::chrono::July / 1}}, 20s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1982y / std::chrono::July / 1}}, 21s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1983y / std::chrono::July / 1}}, 22s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1985y / std::chrono::July / 1}}, 23s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1988y / std::chrono::January / 1}}, 24s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1990y / std::chrono::January / 1}}, 25s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1991y / std::chrono::January / 1}}, 26s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1992y / std::chrono::July / 1}}, 27s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1993y / std::chrono::July / 1}}, 28s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1994y / std::chrono::July / 1}}, 29s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1996y / std::chrono::January / 1}}, 30s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1997y / std::chrono::July / 1}}, 31s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1999y / std::chrono::January / 1}}, 32s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{2006y / std::chrono::January / 1}}, 33s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{2009y / std::chrono::January / 1}}, 34s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{2012y / std::chrono::July / 1}}, 35s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{2015y / std::chrono::July / 1}}, 36s),
- std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{2017y / std::chrono::January / 1}}, 37s)};
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1972y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1973y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1974y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1975y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1976y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1977y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1978y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1979y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1980y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1981y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1982y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1983y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1985y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1988y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1990y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1991y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1992y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1993y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1994y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1996y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1997y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{1999y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{2006y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{2009y / std::chrono::January / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{2012y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{2015y / std::chrono::July / 1}}, 1s),
+ std::make_pair(std::chrono::sys_seconds{std::chrono::sys_days{2017y / std::chrono::January / 1}}, 1s)};
int main(int, const char**) {
const std::chrono::tzdb& tzdb = std::chrono::get_tzdb();
diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp
index 0b40ac9..e6812e9 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp
@@ -9,7 +9,7 @@
// UNSUPPORTED: c++03
// FIXME: Why does this start to fail with GCC 14?
-// XFAIL: gcc-14
+// XFAIL: !(c++11 || c++14) && gcc-14
// See https://llvm.org/PR31384.
diff --git a/libcxx/trigger b/libcxx/trigger
deleted file mode 100644
index e69de29..0000000
--- a/libcxx/trigger
+++ /dev/null
diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp
index 1fd120a..35fd478 100644
--- a/lld/MinGW/Driver.cpp
+++ b/lld/MinGW/Driver.cpp
@@ -199,7 +199,7 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
// a GNU compatible linker. As long as an output for the -v option
// contains "GNU" or "with BFD", they recognize us as GNU-compatible.
if (args.hasArg(OPT_v) || args.hasArg(OPT_version))
- message(getLLDVersion() + ", compatible with GNU linkers");
+ message(getLLDVersion() + " (compatible with GNU linkers)");
// The behavior of -v or --version is a bit strange, but this is
// needed for compatibility with GNU linkers.
diff --git a/lld/test/MinGW/driver.test b/lld/test/MinGW/driver.test
index 44ec588..cbccd09 100644
--- a/lld/test/MinGW/driver.test
+++ b/lld/test/MinGW/driver.test
@@ -268,7 +268,9 @@ APPCONTAINER: -appcontainer
RUN: ld.lld -m i386pep --version 2>&1 | FileCheck -check-prefix=VERSION %s
RUN: ld.lld -m i386pep -v 2>&1 | FileCheck -check-prefix=VERSION %s
RUN: not ld.lld -m i386pep -v xyz 2>&1 | FileCheck -check-prefix=VERSION %s
-VERSION: LLD {{.*}}, compatible with GNU linkers
+# This literal string is required for compatibility with older Meson versions,
+# see https://github.com/mesonbuild/meson/pull/13383.
+VERSION: LLD {{.*}} (compatible with GNU linkers)
RUN: ld.lld -m i386pep --help 2>&1 | FileCheck -check-prefix=HELP %s
HELP: USAGE:
diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h
index ff7c60b..2d6e5a4 100644
--- a/lldb/include/lldb/Target/Language.h
+++ b/lldb/include/lldb/Target/Language.h
@@ -363,6 +363,10 @@ public:
return false;
}
+ /// Returns true if this Language supports exception breakpoints via a
+ /// corresponding LanguageRuntime plugin.
+ virtual bool SupportsExceptionBreakpoints() const { return false; }
+
protected:
// Classes that inherit from Language can see and modify these
diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp
index cd4c779..a5fe927 100644
--- a/lldb/source/Commands/CommandObjectBreakpoint.cpp
+++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp
@@ -308,9 +308,6 @@ public:
case eLanguageTypeC_plus_plus_14:
m_exception_language = eLanguageTypeC_plus_plus;
break;
- case eLanguageTypeObjC:
- m_exception_language = eLanguageTypeObjC;
- break;
case eLanguageTypeObjC_plus_plus:
error_context =
"Set exception breakpoints separately for c++ and objective-c";
@@ -319,6 +316,12 @@ public:
error_context = "Unknown language type for exception breakpoint";
break;
default:
+ if (Language *languagePlugin = Language::FindPlugin(language)) {
+ if (languagePlugin->SupportsExceptionBreakpoints()) {
+ m_exception_language = language;
+ break;
+ }
+ }
error_context = "Unsupported language type for exception breakpoint";
}
if (!error_context.empty())
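
Instead of whitelisting Objective-C in the switch, the command's default case now asks the language plugin itself whether it supports exception breakpoints, with the base class defaulting to false. A stripped-down sketch of that capability-query dispatch, using hypothetical types rather than LLDB's actual Language plugin API:

#include <string>

// Base interface: capabilities default to "unsupported".
struct Language {
  virtual ~Language() = default;
  virtual bool SupportsExceptionBreakpoints() const { return false; }
};

// A plugin opts in simply by overriding the query.
struct ObjCLanguage : Language {
  bool SupportsExceptionBreakpoints() const override { return true; }
};

std::string SetExceptionBreakpoint(const Language *plugin) {
  // Default case of the command: defer to whatever plugin was found.
  if (plugin && plugin->SupportsExceptionBreakpoints())
    return "breakpoint set";
  return "Unsupported language type for exception breakpoint";
}
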
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
index ad467c3..05cfa05 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
@@ -202,194 +202,6 @@ bool lldb_private::formatters::LibcxxUniquePointerSummaryProvider(
return true;
}
-/*
- (lldb) fr var ibeg --raw --ptr-depth 1
- (std::__1::__map_iterator<std::__1::__tree_iterator<std::__1::pair<int,
- std::__1::basic_string<char, std::__1::char_traits<char>,
- std::__1::allocator<char> > >, std::__1::__tree_node<std::__1::pair<int,
- std::__1::basic_string<char, std::__1::char_traits<char>,
- std::__1::allocator<char> > >, void *> *, long> >) ibeg = {
- __i_ = {
- __ptr_ = 0x0000000100103870 {
- std::__1::__tree_node_base<void *> = {
- std::__1::__tree_end_node<std::__1::__tree_node_base<void *> *> = {
- __left_ = 0x0000000000000000
- }
- __right_ = 0x0000000000000000
- __parent_ = 0x00000001001038b0
- __is_black_ = true
- }
- __value_ = {
- first = 0
- second = { std::string }
- */
-
-lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::
- LibCxxMapIteratorSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp)
- : SyntheticChildrenFrontEnd(*valobj_sp), m_pair_ptr(), m_pair_sp() {
- if (valobj_sp)
- Update();
-}
-
-lldb::ChildCacheState
-lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() {
- m_pair_sp.reset();
- m_pair_ptr = nullptr;
-
- ValueObjectSP valobj_sp = m_backend.GetSP();
- if (!valobj_sp)
- return lldb::ChildCacheState::eRefetch;
-
- TargetSP target_sp(valobj_sp->GetTargetSP());
-
- if (!target_sp)
- return lldb::ChildCacheState::eRefetch;
-
- // this must be a ValueObject* because it is a child of the ValueObject we
- // are producing children for it if were a ValueObjectSP, we would end up
- // with a loop (iterator -> synthetic -> child -> parent == iterator) and
- // that would in turn leak memory by never allowing the ValueObjects to die
- // and free their memory
- m_pair_ptr = valobj_sp
- ->GetValueForExpressionPath(
- ".__i_.__ptr_->__value_", nullptr, nullptr,
- ValueObject::GetValueForExpressionPathOptions()
- .DontCheckDotVsArrowSyntax()
- .SetSyntheticChildrenTraversal(
- ValueObject::GetValueForExpressionPathOptions::
- SyntheticChildrenTraversal::None),
- nullptr)
- .get();
-
- if (!m_pair_ptr) {
- m_pair_ptr = valobj_sp
- ->GetValueForExpressionPath(
- ".__i_.__ptr_", nullptr, nullptr,
- ValueObject::GetValueForExpressionPathOptions()
- .DontCheckDotVsArrowSyntax()
- .SetSyntheticChildrenTraversal(
- ValueObject::GetValueForExpressionPathOptions::
- SyntheticChildrenTraversal::None),
- nullptr)
- .get();
- if (m_pair_ptr) {
- auto __i_(valobj_sp->GetChildMemberWithName("__i_"));
- if (!__i_) {
- m_pair_ptr = nullptr;
- return lldb::ChildCacheState::eRefetch;
- }
- CompilerType pair_type(
- __i_->GetCompilerType().GetTypeTemplateArgument(0));
- std::string name;
- uint64_t bit_offset_ptr;
- uint32_t bitfield_bit_size_ptr;
- bool is_bitfield_ptr;
- pair_type = pair_type.GetFieldAtIndex(
- 0, name, &bit_offset_ptr, &bitfield_bit_size_ptr, &is_bitfield_ptr);
- if (!pair_type) {
- m_pair_ptr = nullptr;
- return lldb::ChildCacheState::eRefetch;
- }
-
- auto addr(m_pair_ptr->GetValueAsUnsigned(LLDB_INVALID_ADDRESS));
- m_pair_ptr = nullptr;
- if (addr && addr != LLDB_INVALID_ADDRESS) {
- auto ts = pair_type.GetTypeSystem();
- auto ast_ctx = ts.dyn_cast_or_null<TypeSystemClang>();
- if (!ast_ctx)
- return lldb::ChildCacheState::eRefetch;
-
- // Mimick layout of std::__tree_iterator::__ptr_ and read it in
- // from process memory.
- //
- // The following shows the contiguous block of memory:
- //
- // +-----------------------------+ class __tree_end_node
- // __ptr_ | pointer __left_; |
- // +-----------------------------+ class __tree_node_base
- // | pointer __right_; |
- // | __parent_pointer __parent_; |
- // | bool __is_black_; |
- // +-----------------------------+ class __tree_node
- // | __node_value_type __value_; | <<< our key/value pair
- // +-----------------------------+
- //
- CompilerType tree_node_type = ast_ctx->CreateStructForIdentifier(
- llvm::StringRef(),
- {{"ptr0",
- ast_ctx->GetBasicType(lldb::eBasicTypeVoid).GetPointerType()},
- {"ptr1",
- ast_ctx->GetBasicType(lldb::eBasicTypeVoid).GetPointerType()},
- {"ptr2",
- ast_ctx->GetBasicType(lldb::eBasicTypeVoid).GetPointerType()},
- {"cw", ast_ctx->GetBasicType(lldb::eBasicTypeBool)},
- {"payload", pair_type}});
- std::optional<uint64_t> size = tree_node_type.GetByteSize(nullptr);
- if (!size)
- return lldb::ChildCacheState::eRefetch;
- WritableDataBufferSP buffer_sp(new DataBufferHeap(*size, 0));
- ProcessSP process_sp(target_sp->GetProcessSP());
- Status error;
- process_sp->ReadMemory(addr, buffer_sp->GetBytes(),
- buffer_sp->GetByteSize(), error);
- if (error.Fail())
- return lldb::ChildCacheState::eRefetch;
- DataExtractor extractor(buffer_sp, process_sp->GetByteOrder(),
- process_sp->GetAddressByteSize());
- auto pair_sp = CreateValueObjectFromData(
- "pair", extractor, valobj_sp->GetExecutionContextRef(),
- tree_node_type);
- if (pair_sp)
- m_pair_sp = pair_sp->GetChildAtIndex(4);
- }
- }
- }
-
- return lldb::ChildCacheState::eRefetch;
-}
-
-llvm::Expected<uint32_t> lldb_private::formatters::
- LibCxxMapIteratorSyntheticFrontEnd::CalculateNumChildren() {
- return 2;
-}
-
-lldb::ValueObjectSP
-lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::GetChildAtIndex(
- uint32_t idx) {
- if (m_pair_ptr)
- return m_pair_ptr->GetChildAtIndex(idx);
- if (m_pair_sp)
- return m_pair_sp->GetChildAtIndex(idx);
- return lldb::ValueObjectSP();
-}
-
-bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::
- MightHaveChildren() {
- return true;
-}
-
-size_t lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::
- GetIndexOfChildWithName(ConstString name) {
- if (name == "first")
- return 0;
- if (name == "second")
- return 1;
- return UINT32_MAX;
-}
-
-lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::
- ~LibCxxMapIteratorSyntheticFrontEnd() {
- // this will be deleted when its parent dies (since it's a child object)
- // delete m_pair_ptr;
-}
-
-SyntheticChildrenFrontEnd *
-lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEndCreator(
- CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) {
- return (valobj_sp ? new LibCxxMapIteratorSyntheticFrontEnd(valobj_sp)
- : nullptr);
-}
-
lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd::
LibCxxUnorderedMapIteratorSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp)
: SyntheticChildrenFrontEnd(*valobj_sp) {
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
index 7fe15d1..21dba015 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
@@ -87,31 +87,6 @@ bool LibcxxContainerSummaryProvider(ValueObject &valobj, Stream &stream,
bool LibcxxSpanSummaryProvider(ValueObject &valobj, Stream &stream,
const TypeSummaryOptions &options);
-class LibCxxMapIteratorSyntheticFrontEnd : public SyntheticChildrenFrontEnd {
-public:
- LibCxxMapIteratorSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp);
-
- llvm::Expected<uint32_t> CalculateNumChildren() override;
-
- lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override;
-
- lldb::ChildCacheState Update() override;
-
- bool MightHaveChildren() override;
-
- size_t GetIndexOfChildWithName(ConstString name) override;
-
- ~LibCxxMapIteratorSyntheticFrontEnd() override;
-
-private:
- ValueObject *m_pair_ptr;
- lldb::ValueObjectSP m_pair_sp;
-};
-
-SyntheticChildrenFrontEnd *
-LibCxxMapIteratorSyntheticFrontEndCreator(CXXSyntheticChildren *,
- lldb::ValueObjectSP);
-
/// Formats libcxx's std::unordered_map iterators
///
/// In raw form a std::unordered_map::iterator is represented as follows:
@@ -248,6 +223,10 @@ LibcxxStdMapSyntheticFrontEndCreator(CXXSyntheticChildren *,
lldb::ValueObjectSP);
SyntheticChildrenFrontEnd *
+LibCxxMapIteratorSyntheticFrontEndCreator(CXXSyntheticChildren *,
+ lldb::ValueObjectSP);
+
+SyntheticChildrenFrontEnd *
LibcxxStdUnorderedMapSyntheticFrontEndCreator(CXXSyntheticChildren *,
lldb::ValueObjectSP);
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
index 6c2bc1a..c2bb355 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
@@ -208,6 +208,27 @@ private:
size_t m_count = UINT32_MAX;
std::map<size_t, MapIterator> m_iterators;
};
+
+class LibCxxMapIteratorSyntheticFrontEnd : public SyntheticChildrenFrontEnd {
+public:
+ LibCxxMapIteratorSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp);
+
+ llvm::Expected<uint32_t> CalculateNumChildren() override;
+
+ lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override;
+
+ lldb::ChildCacheState Update() override;
+
+ bool MightHaveChildren() override;
+
+ size_t GetIndexOfChildWithName(ConstString name) override;
+
+ ~LibCxxMapIteratorSyntheticFrontEnd() override;
+
+private:
+ ValueObject *m_pair_ptr;
+ lldb::ValueObjectSP m_pair_sp;
+};
} // namespace formatters
} // namespace lldb_private
@@ -456,3 +477,169 @@ lldb_private::formatters::LibcxxStdMapSyntheticFrontEndCreator(
CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) {
return (valobj_sp ? new LibcxxStdMapSyntheticFrontEnd(valobj_sp) : nullptr);
}
+
+lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::
+ LibCxxMapIteratorSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp)
+ : SyntheticChildrenFrontEnd(*valobj_sp), m_pair_ptr(), m_pair_sp() {
+ if (valobj_sp)
+ Update();
+}
+
+lldb::ChildCacheState
+lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::Update() {
+ m_pair_sp.reset();
+ m_pair_ptr = nullptr;
+
+ ValueObjectSP valobj_sp = m_backend.GetSP();
+ if (!valobj_sp)
+ return lldb::ChildCacheState::eRefetch;
+
+ TargetSP target_sp(valobj_sp->GetTargetSP());
+
+ if (!target_sp)
+ return lldb::ChildCacheState::eRefetch;
+
+ // This must be a ValueObject* because it is a child of the ValueObject we
+ // are producing children for. If it were a ValueObjectSP, we would end up
+ // with a loop (iterator -> synthetic -> child -> parent == iterator), and
+ // that would in turn leak memory by never allowing the ValueObjects to die
+ // and free their memory.
+ m_pair_ptr = valobj_sp
+ ->GetValueForExpressionPath(
+ ".__i_.__ptr_->__value_", nullptr, nullptr,
+ ValueObject::GetValueForExpressionPathOptions()
+ .DontCheckDotVsArrowSyntax()
+ .SetSyntheticChildrenTraversal(
+ ValueObject::GetValueForExpressionPathOptions::
+ SyntheticChildrenTraversal::None),
+ nullptr)
+ .get();
+
+ if (!m_pair_ptr) {
+ m_pair_ptr = valobj_sp
+ ->GetValueForExpressionPath(
+ ".__i_.__ptr_", nullptr, nullptr,
+ ValueObject::GetValueForExpressionPathOptions()
+ .DontCheckDotVsArrowSyntax()
+ .SetSyntheticChildrenTraversal(
+ ValueObject::GetValueForExpressionPathOptions::
+ SyntheticChildrenTraversal::None),
+ nullptr)
+ .get();
+ if (m_pair_ptr) {
+ auto __i_(valobj_sp->GetChildMemberWithName("__i_"));
+ if (!__i_) {
+ m_pair_ptr = nullptr;
+ return lldb::ChildCacheState::eRefetch;
+ }
+ CompilerType pair_type(
+ __i_->GetCompilerType().GetTypeTemplateArgument(0));
+ std::string name;
+ uint64_t bit_offset_ptr;
+ uint32_t bitfield_bit_size_ptr;
+ bool is_bitfield_ptr;
+ pair_type = pair_type.GetFieldAtIndex(
+ 0, name, &bit_offset_ptr, &bitfield_bit_size_ptr, &is_bitfield_ptr);
+ if (!pair_type) {
+ m_pair_ptr = nullptr;
+ return lldb::ChildCacheState::eRefetch;
+ }
+
+ auto addr(m_pair_ptr->GetValueAsUnsigned(LLDB_INVALID_ADDRESS));
+ m_pair_ptr = nullptr;
+ if (addr && addr != LLDB_INVALID_ADDRESS) {
+ auto ts = pair_type.GetTypeSystem();
+ auto ast_ctx = ts.dyn_cast_or_null<TypeSystemClang>();
+ if (!ast_ctx)
+ return lldb::ChildCacheState::eRefetch;
+
+ // Mimic the layout of std::__tree_iterator::__ptr_ and read it in
+ // from process memory.
+ //
+ // The following shows the contiguous block of memory:
+ //
+ // +-----------------------------+ class __tree_end_node
+ // __ptr_ | pointer __left_; |
+ // +-----------------------------+ class __tree_node_base
+ // | pointer __right_; |
+ // | __parent_pointer __parent_; |
+ // | bool __is_black_; |
+ // +-----------------------------+ class __tree_node
+ // | __node_value_type __value_; | <<< our key/value pair
+ // +-----------------------------+
+ //
+ CompilerType tree_node_type = ast_ctx->CreateStructForIdentifier(
+ llvm::StringRef(),
+ {{"ptr0",
+ ast_ctx->GetBasicType(lldb::eBasicTypeVoid).GetPointerType()},
+ {"ptr1",
+ ast_ctx->GetBasicType(lldb::eBasicTypeVoid).GetPointerType()},
+ {"ptr2",
+ ast_ctx->GetBasicType(lldb::eBasicTypeVoid).GetPointerType()},
+ {"cw", ast_ctx->GetBasicType(lldb::eBasicTypeBool)},
+ {"payload", pair_type}});
+ std::optional<uint64_t> size = tree_node_type.GetByteSize(nullptr);
+ if (!size)
+ return lldb::ChildCacheState::eRefetch;
+ WritableDataBufferSP buffer_sp(new DataBufferHeap(*size, 0));
+ ProcessSP process_sp(target_sp->GetProcessSP());
+ Status error;
+ process_sp->ReadMemory(addr, buffer_sp->GetBytes(),
+ buffer_sp->GetByteSize(), error);
+ if (error.Fail())
+ return lldb::ChildCacheState::eRefetch;
+ DataExtractor extractor(buffer_sp, process_sp->GetByteOrder(),
+ process_sp->GetAddressByteSize());
+ auto pair_sp = CreateValueObjectFromData(
+ "pair", extractor, valobj_sp->GetExecutionContextRef(),
+ tree_node_type);
+ if (pair_sp)
+ m_pair_sp = pair_sp->GetChildAtIndex(4);
+ }
+ }
+ }
+
+ return lldb::ChildCacheState::eRefetch;
+}
+
+llvm::Expected<uint32_t> lldb_private::formatters::
+ LibCxxMapIteratorSyntheticFrontEnd::CalculateNumChildren() {
+ return 2;
+}
+
+lldb::ValueObjectSP
+lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::GetChildAtIndex(
+ uint32_t idx) {
+ if (m_pair_ptr)
+ return m_pair_ptr->GetChildAtIndex(idx);
+ if (m_pair_sp)
+ return m_pair_sp->GetChildAtIndex(idx);
+ return lldb::ValueObjectSP();
+}
+
+bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::
+ MightHaveChildren() {
+ return true;
+}
+
+size_t lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::
+ GetIndexOfChildWithName(ConstString name) {
+ if (name == "first")
+ return 0;
+ if (name == "second")
+ return 1;
+ return UINT32_MAX;
+}
+
+lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::
+ ~LibCxxMapIteratorSyntheticFrontEnd() {
+ // this will be deleted when its parent dies (since it's a child object)
+ // delete m_pair_ptr;
+}
+
+SyntheticChildrenFrontEnd *
+lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEndCreator(
+ CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) {
+ return (valobj_sp ? new LibCxxMapIteratorSyntheticFrontEnd(valobj_sp)
+ : nullptr);
+}
diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h
index a50f4b0..a61d0f1 100644
--- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h
+++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h
@@ -194,6 +194,8 @@ public:
llvm::StringRef GetInstanceVariableName() override { return "self"; }
+ bool SupportsExceptionBreakpoints() const override { return true; }
+
// PluginInterface protocol
llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
};
diff --git a/lldb/test/API/python_api/thread/TestThreadAPI.py b/lldb/test/API/python_api/thread/TestThreadAPI.py
index d5fc775..a743022 100644
--- a/lldb/test/API/python_api/thread/TestThreadAPI.py
+++ b/lldb/test/API/python_api/thread/TestThreadAPI.py
@@ -51,7 +51,8 @@ class ThreadAPITestCase(TestBase):
"""Test SBThread.frame with negative indexes."""
self.build()
self.validate_negative_indexing()
-
+
+ @expectedFailureAll(oslist=["windows"])
def test_StepInstruction(self):
"""Test that StepInstruction preserves the plan stack."""
self.build()
diff --git a/lldb/unittests/ValueObject/DumpValueObjectOptionsTests.cpp b/lldb/unittests/ValueObject/DumpValueObjectOptionsTests.cpp
index af6fa55..950e981 100644
--- a/lldb/unittests/ValueObject/DumpValueObjectOptionsTests.cpp
+++ b/lldb/unittests/ValueObject/DumpValueObjectOptionsTests.cpp
@@ -18,6 +18,8 @@
#include "gtest/gtest.h"
+#include <type_traits>
+
using namespace lldb;
using namespace lldb_private;
@@ -70,29 +72,12 @@ public:
m_type_system = m_holder->GetAST();
}
- CompilerType
- MakeEnumType(const std::vector<std::pair<const char *, int>> enumerators,
- bool is_signed) {
- CompilerType int_type = m_type_system->GetBuiltinTypeForEncodingAndBitSize(
- is_signed ? lldb::eEncodingSint : lldb::eEncodingUint, 32);
- CompilerType enum_type = m_type_system->CreateEnumerationType(
- "TestEnum", m_type_system->GetTranslationUnitDecl(),
- OptionalClangModuleID(), Declaration(), int_type, false);
-
- m_type_system->StartTagDeclarationDefinition(enum_type);
- Declaration decl;
- for (auto [name, value] : enumerators)
- m_type_system->AddEnumerationValueToEnumerationType(enum_type, decl, name,
- value, 32);
- m_type_system->CompleteTagDeclarationDefinition(enum_type);
-
- return enum_type;
- }
-
- void TestDumpValueObject(
- CompilerType enum_type,
- const std::vector<
- std::tuple<uint32_t, DumpValueObjectOptions, const char *>> &tests) {
+ template <typename UnderlyingType>
+ void TestDumpEnum(
+ const std::vector<std::pair<const char *, UnderlyingType>> enumerators,
+ const std::vector<std::tuple<UnderlyingType, DumpValueObjectOptions,
+ const char *>> &tests) {
+ CompilerType enum_type = MakeEnumType(enumerators);
StreamString strm;
ConstString var_name("test_var");
ByteOrder endian = endian::InlHostByteOrder();
@@ -108,6 +93,27 @@ public:
}
}
+ template <typename UnderlyingType>
+ CompilerType MakeEnumType(
+ const std::vector<std::pair<const char *, UnderlyingType>> enumerators) {
+ CompilerType int_type = m_type_system->GetBuiltinTypeForEncodingAndBitSize(
+ std::is_same<UnderlyingType, int>::value ? lldb::eEncodingSint
+ : lldb::eEncodingUint,
+ 32);
+ CompilerType enum_type = m_type_system->CreateEnumerationType(
+ "TestEnum", m_type_system->GetTranslationUnitDecl(),
+ OptionalClangModuleID(), Declaration(), int_type, false);
+
+ m_type_system->StartTagDeclarationDefinition(enum_type);
+ Declaration decl;
+ for (auto [name, value] : enumerators)
+ m_type_system->AddEnumerationValueToEnumerationType(enum_type, decl, name,
+ value, 32);
+ m_type_system->CompleteTagDeclarationDefinition(enum_type);
+
+ return enum_type;
+ }
+
ExecutionContext m_exe_ctx;
TypeSystemClang *m_type_system;
@@ -126,25 +132,23 @@ private:
TEST_F(ValueObjectMockProcessTest, EmptyEnum) {
// All values of an empty enum should be shown as plain numbers.
- TestDumpValueObject(MakeEnumType({}, false),
- {{0, {}, "(TestEnum) test_var = 0\n"},
- {1, {}, "(TestEnum) test_var = 1\n"},
- {2, {}, "(TestEnum) test_var = 2\n"}});
-
- TestDumpValueObject(MakeEnumType({}, true),
- {{-2, {}, "(TestEnum) test_var = -2\n"},
- {-1, {}, "(TestEnum) test_var = -1\n"},
- {0, {}, "(TestEnum) test_var = 0\n"},
- {1, {}, "(TestEnum) test_var = 1\n"},
- {2, {}, "(TestEnum) test_var = 2\n"}});
+ TestDumpEnum<unsigned>({}, {{0, {}, "(TestEnum) test_var = 0\n"},
+ {1, {}, "(TestEnum) test_var = 1\n"},
+ {2, {}, "(TestEnum) test_var = 2\n"}});
+
+ TestDumpEnum<int>({}, {{-2, {}, "(TestEnum) test_var = -2\n"},
+ {-1, {}, "(TestEnum) test_var = -1\n"},
+ {0, {}, "(TestEnum) test_var = 0\n"},
+ {1, {}, "(TestEnum) test_var = 1\n"},
+ {2, {}, "(TestEnum) test_var = 2\n"}});
}
TEST_F(ValueObjectMockProcessTest, Enum) {
// This is not a bitfield-like enum, so values are printed as decimal by
// default. Also we only show the enumerator name if the value is an
// exact match.
- TestDumpValueObject(
- MakeEnumType({{"test_2", 2}, {"test_3", 3}}, false),
+ TestDumpEnum<unsigned>(
+ {{"test_2", 2}, {"test_3", 3}},
{{0, {}, "(TestEnum) test_var = 0\n"},
{1, {}, "(TestEnum) test_var = 1\n"},
{2, {}, "(TestEnum) test_var = test_2\n"},
@@ -167,8 +171,8 @@ TEST_F(ValueObjectMockProcessTest, BitFieldLikeEnum) {
// set. lldb treats this as a "bitfield like enum". This means we show values
// as hex, and values without exact matches are shown as a combination of
// enumerators and any remaining value left over.
- TestDumpValueObject(
- MakeEnumType({{"test_2", 2}, {"test_4", 4}}, false),
+ TestDumpEnum<unsigned>(
+ {{"test_2", 2}, {"test_4", 4}},
{
{0, {}, "(TestEnum) test_var = 0x0\n"},
{1, {}, "(TestEnum) test_var = 0x1\n"},
diff --git a/llvm/cmake/config.guess b/llvm/cmake/config.guess
index 2444ed7..96cc554 100644
--- a/llvm/cmake/config.guess
+++ b/llvm/cmake/config.guess
@@ -1028,11 +1028,7 @@ EOF
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
exit ;;
s390:Linux:*:* | s390x:Linux:*:*)
- if [ "$(grep -Ei 'debian|ubuntu' /etc/lsb-release)" ]; then
- echo ${UNAME_MACHINE}-linux-gnu
- else
- echo ${UNAME_MACHINE}-ibm-linux
- fi
+ echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
sh64*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-gnu
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index c98332d..b9f02d6 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19441,6 +19441,37 @@ will be on any later loop iteration.
This intrinsic will only return 0 if the input count is also 0. A non-zero input
count will produce a non-zero result.
+'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
+ declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
+ declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
+ declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.partial.reduce.add.*``' intrinsics reduce the
+concatenation of the two vector operands down to the number of elements dictated
+by the result type. The result type is a vector type that matches the type of the
+first operand vector.
+
+Arguments:
+""""""""""
+
+Both arguments must be vectors of matching element types. The first argument type must
+match the result type, while the second argument type must have a vector length that is a
+positive integer multiple of that of the first vector/result type. The arguments must be
+either both fixed or both scalable vectors.
+
+
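As a concrete illustration of the semantics described above (and of the
generic lowering added to SelectionDAGBuilder later in this patch), the
fixed-width v4i32/v16i32 overload behaves like the following scalar sketch;
the function name partialReduceAdd is hypothetical:

  // Sketch only: the wide operand is split into result-width chunks that
  // are accumulated element-wise onto the first operand.
  #include <array>
  #include <cstdint>

  std::array<int32_t, 4> partialReduceAdd(std::array<int32_t, 4> A,
                                          const std::array<int32_t, 16> &B) {
    for (unsigned Chunk = 0; Chunk < 16 / 4; ++Chunk) // 4 chunks of 4 elements
      for (unsigned I = 0; I < 4; ++I)
        A[I] += B[Chunk * 4 + I];
    return A;
  }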
'``llvm.experimental.vector.histogram.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 07c5319..a6bfd55 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -369,10 +369,6 @@ Changes to the LLVM tools
jumping in reverse direction with shift+L/R/B). (`#95662
<https://github.com/llvm/llvm-project/pull/95662>`_).
-* llvm-config now quotes and escapes paths emitted in stdout, to account for
- spaces or other special characters in path.
- (`#97305 <https://github.com/llvm/llvm-project/pull/97305>`_).
-
Changes to LLDB
---------------------------------
diff --git a/llvm/include/llvm/Analysis/LazyValueInfo.h b/llvm/include/llvm/Analysis/LazyValueInfo.h
index 1ac355e..3ca3e5a 100644
--- a/llvm/include/llvm/Analysis/LazyValueInfo.h
+++ b/llvm/include/llvm/Analysis/LazyValueInfo.h
@@ -60,32 +60,29 @@ namespace llvm {
return *this;
}
- /// This is used to return true/false/dunno results.
- enum Tristate { Unknown = -1, False = 0, True = 1 };
-
// Public query interface.
/// Determine whether the specified value comparison with a constant is
/// known to be true or false on the specified CFG edge. Pred is a CmpInst
/// predicate.
- Tristate getPredicateOnEdge(CmpInst::Predicate Pred, Value *V, Constant *C,
- BasicBlock *FromBB, BasicBlock *ToBB,
- Instruction *CxtI = nullptr);
+ Constant *getPredicateOnEdge(CmpInst::Predicate Pred, Value *V, Constant *C,
+ BasicBlock *FromBB, BasicBlock *ToBB,
+ Instruction *CxtI = nullptr);
/// Determine whether the specified value comparison with a constant is
/// known to be true or false at the specified instruction. \p Pred is a
/// CmpInst predicate. If \p UseBlockValue is true, the block value is also
/// taken into account.
- Tristate getPredicateAt(CmpInst::Predicate Pred, Value *V, Constant *C,
- Instruction *CxtI, bool UseBlockValue);
+ Constant *getPredicateAt(CmpInst::Predicate Pred, Value *V, Constant *C,
+ Instruction *CxtI, bool UseBlockValue);
/// Determine whether the specified value comparison is known to be true
/// or false at the specified instruction. While this takes two Value's,
/// it still requires that one of them is a constant.
/// \p Pred is a CmpInst predicate.
/// If \p UseBlockValue is true, the block value is also taken into account.
- Tristate getPredicateAt(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
- Instruction *CxtI, bool UseBlockValue);
+ Constant *getPredicateAt(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
+ Instruction *CxtI, bool UseBlockValue);
/// Determine whether the specified value is known to be a constant at the
/// specified instruction. Return null if not.
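With the Tristate enum gone, callers of getPredicateAt/getPredicateOnEdge
branch on the returned Constant instead: nullptr plays the role of Unknown,
and a true/false i1 (or splat) constant replaces True/False. A minimal
adaptation sketch, assuming a hypothetical helper named queryPredicate:

  #include "llvm/Analysis/LazyValueInfo.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/InstrTypes.h"
  using namespace llvm;

  // Sketch only: returns 1 (known true), 0 (known false) or -1 (unknown).
  static int queryPredicate(LazyValueInfo &LVI, CmpInst::Predicate Pred,
                            Value *V, Constant *C, Instruction *CxtI) {
    Constant *Res =
        LVI.getPredicateAt(Pred, V, C, CxtI, /*UseBlockValue=*/true);
    if (!Res)
      return -1;              // formerly Tristate::Unknown
    if (Res->isOneValue())
      return 1;               // formerly Tristate::True
    if (Res->isNullValue())
      return 0;               // formerly Tristate::False
    return -1;                // e.g. a non-uniform vector result
  }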
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 7a54fe5..c74e766 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -269,6 +269,11 @@ public:
const Loop *getInnermostLoop() const { return InnermostLoop; }
+ DenseMap<const SCEV *, std::pair<const SCEV *, const SCEV *>> &
+ getPointerBounds() {
+ return PointerBounds;
+ }
+
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and
/// applies dynamic knowledge to simplify SCEV expressions and convert them
@@ -327,6 +332,10 @@ private:
/// backwards-vectorizable or unknown (triggering a runtime check).
unsigned MaxTargetVectorWidthInBits = 0;
+ /// Mapping of SCEV expressions to their expanded pointer bounds (pair of
+ /// start and end pointer expressions).
+ DenseMap<const SCEV *, std::pair<const SCEV *, const SCEV *>> PointerBounds;
+
/// Check whether there is a plausible dependence between the two
/// accesses.
///
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 0ded98f..01624de 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -728,6 +728,9 @@ public:
switch (ICA.getID()) {
default:
break;
+ case Intrinsic::experimental_vector_histogram_add:
+ // For now, we want explicit support from the target for histograms.
+ return InstructionCost::getInvalid();
case Intrinsic::allow_runtime_check:
case Intrinsic::allow_ubsan_check:
case Intrinsic::annotation:
diff --git a/llvm/include/llvm/CodeGen/IndirectThunks.h b/llvm/include/llvm/CodeGen/IndirectThunks.h
index 9b064ab..6c16b32 100644
--- a/llvm/include/llvm/CodeGen/IndirectThunks.h
+++ b/llvm/include/llvm/CodeGen/IndirectThunks.h
@@ -1,4 +1,4 @@
-//===---- IndirectThunks.h - Indirect Thunk Base Class ----------*- C++ -*-===//
+//===---- IndirectThunks.h - Indirect thunk insertion helpers ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -7,7 +7,9 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// Contains a base class for Passes that inject an MI thunk.
+/// Contains a base ThunkInserter class that simplifies injection of MI thunks
+/// as well as a default implementation of MachineFunctionPass wrapping
+/// several `ThunkInserter`s for targets to extend.
///
//===----------------------------------------------------------------------===//
@@ -15,26 +17,95 @@
#define LLVM_CODEGEN_INDIRECTTHUNKS_H
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
namespace llvm {
+/// This class assists in inserting MI thunk functions into the module and
+/// rewriting the existing machine functions to call these thunks.
+///
+/// One of the common cases is implementing security mitigations that involve
+/// replacing some machine code patterns with calls to special thunk functions.
+///
+/// Inserting a module pass late in the codegen pipeline may increase memory
+/// usage, as it serializes the transformations and forces preceding passes to
+/// produce machine code for all functions before running the module pass.
+/// For that reason, ThunkInserter can be driven by a MachineFunctionPass by
+/// passing one MachineFunction at a time to its `run(MMI, MF)` method.
+/// Then, the derived class should
+/// * call createThunkFunction from its insertThunks method exactly once for
+/// each of the thunk functions to be inserted
+/// * populate the thunk in its populateThunk method
+///
+/// Note that if some other pass is responsible for rewriting the functions,
+/// the insertThunks method may simply create all possible thunks at once,
+/// probably postponed until the first occurrence of a possibly affected MF.
+///
+/// Alternatively, the insertThunks method can rewrite MF by itself and only insert
+/// the thunks being called. In that case, the InsertedThunks variable can be used
+/// to track which thunks were already inserted.
+///
+/// In any case, the thunk function has to be inserted on behalf of some other
+/// function and then populated on its own "iteration" later - this is because
+/// MachineFunctionPass will see the newly created functions, but they first
+/// have to go through the preceding passes from the same pass manager,
+/// possibly even through the instruction selector.
+//
+// FIXME Maybe implement a documented and less surprising way of modifying
+// the module from a MachineFunctionPass that is restricted to inserting
+// completely new functions to the module.
template <typename Derived, typename InsertedThunksTy = bool>
class ThunkInserter {
Derived &getDerived() { return *static_cast<Derived *>(this); }
-protected:
  // A variable used to track whether (and possibly which) thunks have been
// inserted so far. InsertedThunksTy is usually a bool, but can be other types
// to represent more than one type of thunk. Requires an |= operator to
// accumulate results.
InsertedThunksTy InsertedThunks;
- void doInitialization(Module &M) {}
+
+protected:
+ // Interface for subclasses to use.
+
+ /// Create an empty thunk function.
+ ///
+ /// The new function will eventually be passed to populateThunk. If multiple
+ /// thunks are created, populateThunk can distinguish them by their names.
void createThunkFunction(MachineModuleInfo &MMI, StringRef Name,
bool Comdat = true, StringRef TargetAttrs = "");
+protected:
+ // Interface for subclasses to implement.
+ //
+ // Note: all functions are non-virtual and are called via getDerived().
+ // Note: only doInitialization() has an implementation.
+
+ /// Initializes thunk inserter.
+ void doInitialization(Module &M) {}
+
+  /// Returns the common prefix for thunk functions' names.
+ const char *getThunkPrefix(); // undefined
+
+ /// Checks if MF may use thunks (true - maybe, false - definitely not).
+ bool mayUseThunk(const MachineFunction &MF); // undefined
+
+ /// Rewrites the function if necessary, returns the set of thunks added.
+ InsertedThunksTy insertThunks(MachineModuleInfo &MMI, MachineFunction &MF,
+ InsertedThunksTy ExistingThunks); // undefined
+
+ /// Populate the thunk function with instructions.
+ ///
+ /// If multiple thunks are created, the content that must be inserted in the
+ /// thunk function body should be derived from the MF's name.
+ ///
+ /// Depending on the preceding passes in the pass manager, by the time
+ /// populateThunk is called, MF may have a few target-specific instructions
+ /// (such as a single MBB containing the return instruction).
+ void populateThunk(MachineFunction &MF); // undefined
+
public:
void init(Module &M) {
InsertedThunks = InsertedThunksTy{};
@@ -53,7 +124,7 @@ void ThunkInserter<Derived, InsertedThunksTy>::createThunkFunction(
Module &M = const_cast<Module &>(*MMI.getModule());
LLVMContext &Ctx = M.getContext();
- auto Type = FunctionType::get(Type::getVoidTy(Ctx), false);
+ auto *Type = FunctionType::get(Type::getVoidTy(Ctx), false);
Function *F = Function::Create(Type,
Comdat ? GlobalValue::LinkOnceODRLinkage
: GlobalValue::InternalLinkage,
@@ -95,19 +166,15 @@ bool ThunkInserter<Derived, InsertedThunksTy>::run(MachineModuleInfo &MMI,
MachineFunction &MF) {
// If MF is not a thunk, check to see if we need to insert a thunk.
if (!MF.getName().starts_with(getDerived().getThunkPrefix())) {
- // Only add a thunk if one of the functions has the corresponding feature
- // enabled in its subtarget, and doesn't enable external thunks. The target
- // can use InsertedThunks to detect whether relevant thunks have already
- // been inserted.
- // FIXME: Conditionalize on indirect calls so we don't emit a thunk when
- // nothing will end up calling it.
- // FIXME: It's a little silly to look at every function just to enumerate
- // the subtargets, but eventually we'll want to look at them for indirect
- // calls, so maybe this is OK.
- if (!getDerived().mayUseThunk(MF, InsertedThunks))
+ // Only add thunks if one of the functions may use them.
+ if (!getDerived().mayUseThunk(MF))
return false;
- InsertedThunks |= getDerived().insertThunks(MMI, MF);
+ // The target can use InsertedThunks to detect whether relevant thunks
+ // have already been inserted.
+      // FIXME: Provide a way for insertThunks to notify us whether it changed
+ // the MF, instead of conservatively assuming it did.
+ InsertedThunks |= getDerived().insertThunks(MMI, MF, InsertedThunks);
return true;
}
@@ -116,6 +183,40 @@ bool ThunkInserter<Derived, InsertedThunksTy>::run(MachineModuleInfo &MMI,
return true;
}
+/// Basic implementation of MachineFunctionPass wrapping one or more
+/// `ThunkInserter`s passed as type parameters.
+template <typename... Inserters>
+class ThunkInserterPass : public MachineFunctionPass {
+protected:
+ std::tuple<Inserters...> TIs;
+
+ ThunkInserterPass(char &ID) : MachineFunctionPass(ID) {}
+
+public:
+ bool doInitialization(Module &M) override {
+ initTIs(M, TIs);
+ return false;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return runTIs(MMI, MF, TIs);
+ }
+
+private:
+ template <typename... ThunkInserterT>
+ static void initTIs(Module &M,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ (..., std::get<ThunkInserterT>(ThunkInserters).init(M));
+ }
+
+ template <typename... ThunkInserterT>
+ static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ return (0 | ... | std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF));
+ }
+};
+
} // namespace llvm
#endif
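A hedged sketch of the extension pattern the comments above describe;
MyThunkInserter, MyThunkInserterPass and the thunk name are made-up examples,
and only the hook signatures come from the header:

  #include "llvm/CodeGen/IndirectThunks.h"
  using namespace llvm;

  // Sketch only: a minimal CRTP inserter wired into the default pass.
  class MyThunkInserter : public ThunkInserter<MyThunkInserter> {
  public:
    const char *getThunkPrefix() { return "__my_thunk"; }
    bool mayUseThunk(const MachineFunction &MF) {
      return true; // e.g. query a subtarget feature here
    }
    bool insertThunks(MachineModuleInfo &MMI, MachineFunction &MF,
                      bool ExistingThunks) {
      if (!ExistingThunks)
        createThunkFunction(MMI, "__my_thunk");
      return true; // accumulated into InsertedThunks by the base class
    }
    void populateThunk(MachineFunction &MF) {
      // Emit the target-specific thunk body into MF.
    }
  };

  class MyThunkInserterPass : public ThunkInserterPass<MyThunkInserter> {
  public:
    static char ID;
    MyThunkInserterPass() : ThunkInserterPass(ID) {}
  };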
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 7b0e5e7..8e189e9 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -680,10 +680,8 @@ public:
bool isTarget = false, bool isOpaque = false);
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL,
bool isTarget = false);
- SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL,
- bool LegalTypes = true);
- SDValue getShiftAmountConstant(const APInt &Val, EVT VT, const SDLoc &DL,
- bool LegalTypes = true);
+ SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL);
+ SDValue getShiftAmountConstant(const APInt &Val, EVT VT, const SDLoc &DL);
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL,
bool isTarget = false);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9a0df8b..55b60b0 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -400,14 +400,10 @@ public:
virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const;
/// Returns the type for the shift amount of a shift opcode. For vectors,
- /// returns the input type. For scalars, behavior depends on \p LegalTypes. If
- /// \p LegalTypes is true, calls getScalarShiftAmountTy, otherwise uses
- /// pointer type. If getScalarShiftAmountTy or pointer type cannot represent
- /// all possible shift amounts, returns MVT::i32. In general, \p LegalTypes
- /// should be set to true for calls during type legalization and after type
- /// legalization has been completed.
- EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
- bool LegalTypes = true) const;
+ /// returns the input type. For scalars, calls getScalarShiftAmountTy.
+ /// If getScalarShiftAmountTy type cannot represent all possible shift
+ /// amounts, returns MVT::i32.
+ EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const;
/// Return the preferred type to use for a shift opcode, given the shifted
/// amount type is \p ShiftValueTy.
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index c7d383a..95dbd28 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2640,6 +2640,12 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType
[llvm_anyvector_ty],
[IntrNoMem]>;
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
+ [llvm_anyvector_ty, llvm_anyvector_ty],
+ [IntrNoMem]>;
+
//===----------------- Pointer Authentication Intrinsics ------------------===//
//
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index b30e6a6..674c47eb 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -1370,6 +1370,7 @@ LazyValueInfoImpl::getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
// If V is the condition of the branch itself, then we know exactly what
// it is.
+ // NB: The condition on a `br` can't be a vector type.
if (Condition == Val)
return ValueLatticeElement::get(ConstantInt::get(
Type::getInt1Ty(Val->getContext()), isTrueDest));
@@ -1723,7 +1724,7 @@ Constant *LazyValueInfo::getConstant(Value *V, Instruction *CxtI) {
if (Result.isConstantRange()) {
const ConstantRange &CR = Result.getConstantRange();
if (const APInt *SingleVal = CR.getSingleElement())
- return ConstantInt::get(V->getContext(), *SingleVal);
+ return ConstantInt::get(V->getType(), *SingleVal);
}
return nullptr;
}
@@ -1758,7 +1759,7 @@ Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB,
if (Result.isConstantRange()) {
const ConstantRange &CR = Result.getConstantRange();
if (const APInt *SingleVal = CR.getSingleElement())
- return ConstantInt::get(V->getContext(), *SingleVal);
+ return ConstantInt::get(V->getType(), *SingleVal);
}
return nullptr;
}
@@ -1774,45 +1775,26 @@ ConstantRange LazyValueInfo::getConstantRangeOnEdge(Value *V,
return toConstantRange(Result, V->getType(), /*UndefAllowed*/ true);
}
-static LazyValueInfo::Tristate
-getPredicateResult(CmpInst::Predicate Pred, Constant *C,
- const ValueLatticeElement &Val, const DataLayout &DL) {
+static Constant *getPredicateResult(CmpInst::Predicate Pred, Constant *C,
+ const ValueLatticeElement &Val,
+ const DataLayout &DL) {
// If we know the value is a constant, evaluate the conditional.
- Constant *Res = nullptr;
- if (Val.isConstant()) {
- Res = ConstantFoldCompareInstOperands(Pred, Val.getConstant(), C, DL);
- if (ConstantInt *ResCI = dyn_cast_or_null<ConstantInt>(Res))
- return ResCI->isZero() ? LazyValueInfo::False : LazyValueInfo::True;
- return LazyValueInfo::Unknown;
- }
+ if (Val.isConstant())
+ return ConstantFoldCompareInstOperands(Pred, Val.getConstant(), C, DL);
+ Type *ResTy = CmpInst::makeCmpResultType(C->getType());
if (Val.isConstantRange()) {
ConstantInt *CI = dyn_cast<ConstantInt>(C);
- if (!CI) return LazyValueInfo::Unknown;
+ if (!CI)
+ return nullptr;
const ConstantRange &CR = Val.getConstantRange();
- if (Pred == ICmpInst::ICMP_EQ) {
- if (!CR.contains(CI->getValue()))
- return LazyValueInfo::False;
-
- if (CR.isSingleElement())
- return LazyValueInfo::True;
- } else if (Pred == ICmpInst::ICMP_NE) {
- if (!CR.contains(CI->getValue()))
- return LazyValueInfo::True;
-
- if (CR.isSingleElement())
- return LazyValueInfo::False;
- } else {
- // Handle more complex predicates.
- ConstantRange TrueValues =
- ConstantRange::makeExactICmpRegion(Pred, CI->getValue());
- if (TrueValues.contains(CR))
- return LazyValueInfo::True;
- if (TrueValues.inverse().contains(CR))
- return LazyValueInfo::False;
- }
- return LazyValueInfo::Unknown;
+ ConstantRange RHS(CI->getValue());
+ if (CR.icmp(Pred, RHS))
+ return ConstantInt::getTrue(ResTy);
+ if (CR.icmp(CmpInst::getInversePredicate(Pred), RHS))
+ return ConstantInt::getFalse(ResTy);
+ return nullptr;
}
if (Val.isNotConstant()) {
@@ -1820,29 +1802,29 @@ getPredicateResult(CmpInst::Predicate Pred, Constant *C,
// "V != C1".
if (Pred == ICmpInst::ICMP_EQ) {
// !C1 == C -> false iff C1 == C.
- Res = ConstantFoldCompareInstOperands(ICmpInst::ICMP_NE,
- Val.getNotConstant(), C, DL);
+ Constant *Res = ConstantFoldCompareInstOperands(
+ ICmpInst::ICMP_NE, Val.getNotConstant(), C, DL);
if (Res && Res->isNullValue())
- return LazyValueInfo::False;
+ return ConstantInt::getFalse(ResTy);
} else if (Pred == ICmpInst::ICMP_NE) {
// !C1 != C -> true iff C1 == C.
- Res = ConstantFoldCompareInstOperands(ICmpInst::ICMP_NE,
- Val.getNotConstant(), C, DL);
+ Constant *Res = ConstantFoldCompareInstOperands(
+ ICmpInst::ICMP_NE, Val.getNotConstant(), C, DL);
if (Res && Res->isNullValue())
- return LazyValueInfo::True;
+ return ConstantInt::getTrue(ResTy);
}
- return LazyValueInfo::Unknown;
+ return nullptr;
}
- return LazyValueInfo::Unknown;
+ return nullptr;
}
/// Determine whether the specified value comparison with a constant is known to
/// be true or false on the specified CFG edge. Pred is a CmpInst predicate.
-LazyValueInfo::Tristate
-LazyValueInfo::getPredicateOnEdge(CmpInst::Predicate Pred, Value *V,
- Constant *C, BasicBlock *FromBB,
- BasicBlock *ToBB, Instruction *CxtI) {
+Constant *LazyValueInfo::getPredicateOnEdge(CmpInst::Predicate Pred, Value *V,
+ Constant *C, BasicBlock *FromBB,
+ BasicBlock *ToBB,
+ Instruction *CxtI) {
Module *M = FromBB->getModule();
ValueLatticeElement Result =
getOrCreateImpl(M).getValueOnEdge(V, FromBB, ToBB, CxtI);
@@ -1850,10 +1832,9 @@ LazyValueInfo::getPredicateOnEdge(CmpInst::Predicate Pred, Value *V,
return getPredicateResult(Pred, C, Result, M->getDataLayout());
}
-LazyValueInfo::Tristate LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred,
- Value *V, Constant *C,
- Instruction *CxtI,
- bool UseBlockValue) {
+Constant *LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred, Value *V,
+ Constant *C, Instruction *CxtI,
+ bool UseBlockValue) {
// Is or is not NonNull are common predicates being queried. If
// isKnownNonZero can tell us the result of the predicate, we can
// return it quickly. But this is only a fastpath, and falling
@@ -1862,18 +1843,19 @@ LazyValueInfo::Tristate LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred,
const DataLayout &DL = M->getDataLayout();
if (V->getType()->isPointerTy() && C->isNullValue() &&
isKnownNonZero(V->stripPointerCastsSameRepresentation(), DL)) {
+ Type *ResTy = CmpInst::makeCmpResultType(C->getType());
if (Pred == ICmpInst::ICMP_EQ)
- return LazyValueInfo::False;
+ return ConstantInt::getFalse(ResTy);
else if (Pred == ICmpInst::ICMP_NE)
- return LazyValueInfo::True;
+ return ConstantInt::getTrue(ResTy);
}
auto &Impl = getOrCreateImpl(M);
ValueLatticeElement Result =
UseBlockValue ? Impl.getValueInBlock(V, CxtI->getParent(), CxtI)
: Impl.getValueAt(V, CxtI);
- Tristate Ret = getPredicateResult(Pred, C, Result, DL);
- if (Ret != Unknown)
+ Constant *Ret = getPredicateResult(Pred, C, Result, DL);
+ if (Ret)
return Ret;
// Note: The following bit of code is somewhat distinct from the rest of LVI;
@@ -1904,7 +1886,7 @@ LazyValueInfo::Tristate LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred,
// analysis below.
pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
if (PI == PE)
- return Unknown;
+ return nullptr;
// If V is a PHI node in the same block as the context, we need to ask
// questions about the predicate as applied to the incoming value along
@@ -1912,23 +1894,23 @@ LazyValueInfo::Tristate LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred,
// known along all incoming edges.
if (auto *PHI = dyn_cast<PHINode>(V))
if (PHI->getParent() == BB) {
- Tristate Baseline = Unknown;
+ Constant *Baseline = nullptr;
for (unsigned i = 0, e = PHI->getNumIncomingValues(); i < e; i++) {
Value *Incoming = PHI->getIncomingValue(i);
BasicBlock *PredBB = PHI->getIncomingBlock(i);
// Note that PredBB may be BB itself.
- Tristate Result =
+ Constant *Result =
getPredicateOnEdge(Pred, Incoming, C, PredBB, BB, CxtI);
// Keep going as long as we've seen a consistent known result for
// all inputs.
Baseline = (i == 0) ? Result /* First iteration */
: (Baseline == Result ? Baseline
- : Unknown); /* All others */
- if (Baseline == Unknown)
+ : nullptr); /* All others */
+ if (!Baseline)
break;
}
- if (Baseline != Unknown)
+ if (Baseline)
return Baseline;
}
@@ -1939,11 +1921,11 @@ LazyValueInfo::Tristate LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred,
// For predecessor edge, determine if the comparison is true or false
// on that edge. If they're all true or all false, we can conclude
// the value of the comparison in this block.
- Tristate Baseline = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI);
- if (Baseline != Unknown) {
+ Constant *Baseline = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI);
+ if (Baseline) {
// Check that all remaining incoming values match the first one.
while (++PI != PE) {
- Tristate Ret = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI);
+ Constant *Ret = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI);
if (Ret != Baseline)
break;
}
@@ -1954,13 +1936,12 @@ LazyValueInfo::Tristate LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred,
}
}
- return Unknown;
+ return nullptr;
}
-LazyValueInfo::Tristate LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred,
- Value *LHS, Value *RHS,
- Instruction *CxtI,
- bool UseBlockValue) {
+Constant *LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred, Value *LHS,
+ Value *RHS, Instruction *CxtI,
+ bool UseBlockValue) {
if (auto *C = dyn_cast<Constant>(RHS))
return getPredicateAt(Pred, LHS, C, CxtI, UseBlockValue);
if (auto *C = dyn_cast<Constant>(LHS))
@@ -1975,19 +1956,14 @@ LazyValueInfo::Tristate LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred,
ValueLatticeElement L =
getOrCreateImpl(M).getValueInBlock(LHS, CxtI->getParent(), CxtI);
if (L.isOverdefined())
- return LazyValueInfo::Unknown;
+ return nullptr;
ValueLatticeElement R =
getOrCreateImpl(M).getValueInBlock(RHS, CxtI->getParent(), CxtI);
Type *Ty = CmpInst::makeCmpResultType(LHS->getType());
- if (Constant *Res = L.getCompare(Pred, Ty, R, M->getDataLayout())) {
- if (Res->isNullValue())
- return LazyValueInfo::False;
- if (Res->isOneValue())
- return LazyValueInfo::True;
- }
+ return L.getCompare(Pred, Ty, R, M->getDataLayout());
}
- return LazyValueInfo::Unknown;
+ return nullptr;
}
void LazyValueInfo::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 38bf6d8..f132e45 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -203,11 +203,18 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
///
/// There is no conflict when the intervals are disjoint:
/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
-static std::pair<const SCEV *, const SCEV *>
-getStartAndEndForAccess(const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
- PredicatedScalarEvolution &PSE) {
+static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
+ const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
+ PredicatedScalarEvolution &PSE,
+ DenseMap<const SCEV *, std::pair<const SCEV *, const SCEV *>>
+ &PointerBounds) {
ScalarEvolution *SE = PSE.getSE();
+ auto [Iter, Ins] = PointerBounds.insert(
+ {PtrExpr, {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
+ if (!Ins)
+ return Iter->second;
+
const SCEV *ScStart;
const SCEV *ScEnd;
@@ -244,7 +251,8 @@ getStartAndEndForAccess(const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
- return {ScStart, ScEnd};
+ Iter->second = {ScStart, ScEnd};
+ return Iter->second;
}
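The PointerBounds map added above turns getStartAndEndForAccess into a
memoized lookup: a placeholder entry is inserted first, and only a miss pays
for the SCEV expansion. The same idiom, reduced to standard containers with
stand-in key and value types, looks like this sketch:

  #include <map>
  #include <utility>

  using Bounds = std::pair<long, long>;

  // Sketch only: insert a placeholder, reuse it on a repeat query.
  Bounds getBounds(long Key, std::map<long, Bounds> &Cache) {
    auto [Iter, Inserted] = Cache.insert({Key, {0, 0}});
    if (!Inserted)
      return Iter->second;            // cached result from an earlier call
    Bounds Computed = {Key, Key + 1}; // stand-in for the real computation
    Iter->second = Computed;          // overwrite the placeholder
    return Iter->second;
  }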
/// Calculate Start and End points of memory access using
@@ -254,8 +262,8 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
unsigned DepSetId, unsigned ASId,
PredicatedScalarEvolution &PSE,
bool NeedsFreeze) {
- const auto &[ScStart, ScEnd] =
- getStartAndEndForAccess(Lp, PtrExpr, AccessTy, PSE);
+ const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
+ Lp, PtrExpr, AccessTy, PSE, DC.getPointerBounds());
assert(!isa<SCEVCouldNotCompute>(ScStart) &&
!isa<SCEVCouldNotCompute>(ScEnd) &&
"must be able to compute both start and end expressions");
@@ -1964,10 +1972,9 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
if (SE.isLoopInvariant(Src, InnermostLoop) ||
SE.isLoopInvariant(Sink, InnermostLoop)) {
const auto &[SrcStart, SrcEnd] =
- getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE);
+ getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE, PointerBounds);
const auto &[SinkStart, SinkEnd] =
- getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE);
-
+ getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE, PointerBounds);
if (!isa<SCEVCouldNotCompute>(SrcStart) &&
!isa<SCEVCouldNotCompute>(SrcEnd) &&
!isa<SCEVCouldNotCompute>(SinkStart) &&
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 5476dc5..85abf00 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1739,6 +1739,20 @@ static void computeKnownBitsFromOperator(const Operator *I,
Known &= Known2.anyextOrTrunc(BitWidth);
break;
}
+ case Intrinsic::x86_sse2_pmulh_w:
+ case Intrinsic::x86_avx2_pmulh_w:
+ case Intrinsic::x86_avx512_pmulh_w_512:
+ computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
+ Known = KnownBits::mulhs(Known, Known2);
+ break;
+ case Intrinsic::x86_sse2_pmulhu_w:
+ case Intrinsic::x86_avx2_pmulhu_w:
+ case Intrinsic::x86_avx512_pmulhu_w_512:
+ computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
+ Known = KnownBits::mulhu(Known, Known2);
+ break;
case Intrinsic::x86_sse42_crc32_64_64:
Known.Zero.setBitsFrom(32);
break;
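The new cases above let known-bits analysis model the x86 pmulh/pmulhu
intrinsics, which return the high 16 bits of a widening 16x16-bit multiply;
KnownBits::mulhs and KnownBits::mulhu compute exactly that. A scalar sketch
of the per-element behaviour being modelled (function names are illustrative):

  #include <cstdint>

  // Sketch only: per-element behaviour of pmulh.w (signed) and pmulhu.w.
  int16_t pmulh_w(int16_t A, int16_t B) {
    return static_cast<int16_t>((int32_t(A) * int32_t(B)) >> 16);
  }
  uint16_t pmulhu_w(uint16_t A, uint16_t B) {
    return static_cast<uint16_t>((uint32_t(A) * uint32_t(B)) >> 16);
  }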
@@ -6403,9 +6417,10 @@ const Value *llvm::getUnderlyingObject(const Value *V, unsigned MaxLookup) {
V = GEP->getPointerOperand();
} else if (Operator::getOpcode(V) == Instruction::BitCast ||
Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
- V = cast<Operator>(V)->getOperand(0);
- if (!V->getType()->isPointerTy())
+ Value *NewV = cast<Operator>(V)->getOperand(0);
+ if (!NewV->getType()->isPointerTy())
return V;
+ V = NewV;
} else if (auto *GA = dyn_cast<GlobalAlias>(V)) {
if (GA->isInterposable())
return V;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d81a54d..b13f6bc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -843,11 +843,9 @@ namespace {
SelectionDAG &getDAG() const { return DAG; }
- /// Returns a type large enough to hold any valid shift amount - before type
- /// legalization these can be huge.
+ /// Convenience wrapper around TargetLowering::getShiftAmountTy.
EVT getShiftAmountTy(EVT LHSTy) {
- assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
- return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout(), LegalTypes);
+ return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout());
}
/// This method returns true if we are running before type legalization or
@@ -4395,14 +4393,13 @@ template <class MatchContextClass> SDValue DAGCombiner::visitMUL(SDNode *N) {
// fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isNegatedPowerOf2()) {
unsigned Log2Val = (-ConstValue1).logBase2();
- EVT ShiftVT = getShiftAmountTy(N0.getValueType());
// FIXME: If the input is something that is easily negated (e.g. a
// single-use add), we should put the negate there.
return Matcher.getNode(
ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Matcher.getNode(ISD::SHL, DL, VT, N0,
- DAG.getConstant(Log2Val, DL, ShiftVT)));
+ DAG.getShiftAmountConstant(Log2Val, VT, DL)));
}
// Attempt to reuse an existing umul_lohi/smul_lohi node, but only if the
@@ -5101,9 +5098,9 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
// fold (mulhs x, 1) -> (sra x, size(x)-1)
if (isOneConstant(N1))
- return DAG.getNode(ISD::SRA, DL, VT, N0,
- DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL,
- getShiftAmountTy(VT)));
+ return DAG.getNode(
+ ISD::SRA, DL, VT, N0,
+ DAG.getShiftAmountConstant(N0.getScalarValueSizeInBits() - 1, VT, DL));
// fold (mulhs x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
@@ -5121,8 +5118,7 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
- DAG.getConstant(SimpleSize, DL,
- getShiftAmountTy(N1.getValueType())));
+ DAG.getShiftAmountConstant(SimpleSize, NewVT, DL));
return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
}
}
@@ -5192,8 +5188,7 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
- DAG.getConstant(SimpleSize, DL,
- getShiftAmountTy(N1.getValueType())));
+ DAG.getShiftAmountConstant(SimpleSize, NewVT, DL));
return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
}
}
@@ -5404,8 +5399,7 @@ SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
// Compute the high part as N1.
Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
- DAG.getConstant(SimpleSize, DL,
- getShiftAmountTy(Lo.getValueType())));
+ DAG.getShiftAmountConstant(SimpleSize, NewVT, DL));
Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
// Compute the low part as N0.
Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
@@ -5458,8 +5452,7 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
// Compute the high part as N1.
Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
- DAG.getConstant(SimpleSize, DL,
- getShiftAmountTy(Lo.getValueType())));
+ DAG.getShiftAmountConstant(SimpleSize, NewVT, DL));
Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
// Compute the low part as N0.
Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
@@ -7484,8 +7477,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
if (OpSizeInBits > 16) {
SDLoc DL(N);
Res = DAG.getNode(ISD::SRL, DL, VT, Res,
- DAG.getConstant(OpSizeInBits - 16, DL,
- getShiftAmountTy(VT)));
+ DAG.getShiftAmountConstant(OpSizeInBits - 16, VT, DL));
}
return Res;
}
@@ -7603,7 +7595,7 @@ static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {
// (rotr (bswap A), 16)
static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI,
SelectionDAG &DAG, SDNode *N, SDValue N0,
- SDValue N1, EVT VT, EVT ShiftAmountTy) {
+ SDValue N1, EVT VT) {
assert(N->getOpcode() == ISD::OR && VT == MVT::i32 &&
"MatchBSwapHWordOrAndAnd: expecting i32");
if (!TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
@@ -7635,7 +7627,7 @@ static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI,
SDLoc DL(N);
SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Shift0.getOperand(0));
- SDValue ShAmt = DAG.getConstant(16, DL, ShiftAmountTy);
+ SDValue ShAmt = DAG.getShiftAmountConstant(16, VT, DL);
return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
}
@@ -7655,13 +7647,11 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
return SDValue();
- if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT,
- getShiftAmountTy(VT)))
+ if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT))
return BSwap;
// Try again with commuted operands.
- if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT,
- getShiftAmountTy(VT)))
+ if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT))
return BSwap;
@@ -7698,7 +7688,7 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
// Result of the bswap should be rotated by 16. If it's not legal, then
// do (x << 16) | (x >> 16).
- SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
+ SDValue ShAmt = DAG.getShiftAmountConstant(16, VT, DL);
if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt);
if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
@@ -9301,11 +9291,10 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
return NewLoad;
SDValue ShiftedLoad =
- NeedsZext
- ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
- DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT,
- SDLoc(N), LegalOperations))
- : NewLoad;
+ NeedsZext ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
+ DAG.getShiftAmountConstant(ZeroExtendedBytes * 8,
+ VT, SDLoc(N)))
+ : NewLoad;
return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
}
@@ -10430,8 +10419,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
TLI.isTruncateFree(VT, TruncVT)) {
- SDValue Amt = DAG.getConstant(ShiftAmt, DL,
- getShiftAmountTy(N0.getOperand(0).getValueType()));
+ SDValue Amt = DAG.getShiftAmountConstant(ShiftAmt, VT, DL);
SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
N0.getOperand(0), Amt);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
@@ -10679,10 +10667,9 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
uint64_t ShiftAmt = N1C->getZExtValue();
SDLoc DL0(N0);
- SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
- N0.getOperand(0),
- DAG.getConstant(ShiftAmt, DL0,
- getShiftAmountTy(SmallVT)));
+ SDValue SmallShift =
+ DAG.getNode(ISD::SRL, DL0, SmallVT, N0.getOperand(0),
+ DAG.getShiftAmountConstant(ShiftAmt, SmallVT, DL0));
AddToWorklist(SmallShift.getNode());
APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
return DAG.getNode(ISD::AND, DL, VT,
@@ -10726,8 +10713,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (ShAmt) {
SDLoc DL(N0);
Op = DAG.getNode(ISD::SRL, DL, VT, Op,
- DAG.getConstant(ShAmt, DL,
- getShiftAmountTy(Op.getValueType())));
+ DAG.getShiftAmountConstant(ShAmt, VT, DL));
AddToWorklist(Op.getNode());
}
return DAG.getNode(ISD::XOR, DL, VT, Op, DAG.getConstant(1, DL, VT));
@@ -11086,7 +11072,7 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) {
SDValue Res = N0.getOperand(0);
if (uint64_t NewShAmt = (ShAmt->getZExtValue() - (BW / 2)))
Res = DAG.getNode(ISD::SHL, DL, VT, Res,
- DAG.getConstant(NewShAmt, DL, getShiftAmountTy(VT)));
+ DAG.getShiftAmountConstant(NewShAmt, VT, DL));
Res = DAG.getZExtOrTrunc(Res, DL, HalfVT);
Res = DAG.getNode(ISD::BSWAP, DL, HalfVT, Res);
return DAG.getZExtOrTrunc(Res, DL, VT);
@@ -12316,9 +12302,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
return DAG.getNode(ISD::ABS, DL, VT, LHS);
- SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS,
- DAG.getConstant(VT.getScalarSizeInBits() - 1,
- DL, getShiftAmountTy(VT)));
+ SDValue Shift = DAG.getNode(
+ ISD::SRA, DL, VT, LHS,
+ DAG.getShiftAmountConstant(VT.getScalarSizeInBits() - 1, VT, DL));
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
AddToWorklist(Shift.getNode());
AddToWorklist(Add.getNode());
@@ -14625,9 +14611,6 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
// Shift the result left, if we've swallowed a left shift.
SDValue Result = Load;
if (ShLeftAmt != 0) {
- EVT ShImmTy = getShiftAmountTy(Result.getValueType());
- if (!isUIntN(ShImmTy.getScalarSizeInBits(), ShLeftAmt))
- ShImmTy = VT;
// If the shift amount is as large as the result size (but, presumably,
// no larger than the source) then the useful bits of the result are
// zero; we can't simply return the shortened shift, because the result
@@ -14635,8 +14618,8 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
if (ShLeftAmt >= VT.getScalarSizeInBits())
Result = DAG.getConstant(0, DL, VT);
else
- Result = DAG.getNode(ISD::SHL, DL, VT,
- Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
+ Result = DAG.getNode(ISD::SHL, DL, VT, Result,
+ DAG.getShiftAmountConstant(ShLeftAmt, VT, DL));
}
if (ShiftedOffset != 0) {
@@ -16898,7 +16881,7 @@ SDValue DAGCombiner::combineFMulOrFDivWithIntPow2(SDNode *N) {
// Perform actual transform.
SDValue MantissaShiftCnt =
- DAG.getConstant(*Mantissa, DL, getShiftAmountTy(NewIntVT));
+ DAG.getShiftAmountConstant(*Mantissa, NewIntVT, DL);
// TODO: Sometimes Log2 is of form `(X + C)`. `(X + C) << C1` should fold to
// `(X << C1) + (C << C1)`, but that isn't always the case because of the
  // cast. We could implement that here by handling the casts.
@@ -19811,9 +19794,9 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
// shifted by ByteShift and truncated down to NumBytes.
if (ByteShift) {
SDLoc DL(IVal);
- IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
- DAG.getConstant(ByteShift*8, DL,
- DC->getShiftAmountTy(IVal.getValueType())));
+ IVal = DAG.getNode(
+ ISD::SRL, DL, IVal.getValueType(), IVal,
+ DAG.getShiftAmountConstant(ByteShift * 8, IVal.getValueType(), DL));
}
// Figure out the offset for the store and the alignment of the access.
@@ -27422,12 +27405,11 @@ SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
// and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit
// constant.
- EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) {
unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1;
if (!TLI.shouldAvoidTransformToShift(XType, ShCt)) {
- SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
+ SDValue ShiftAmt = DAG.getShiftAmountConstant(ShCt, XType, DL);
SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt);
AddToWorklist(Shift.getNode());
@@ -27447,7 +27429,7 @@ SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
if (TLI.shouldAvoidTransformToShift(XType, ShCt))
return SDValue();
- SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
+ SDValue ShiftAmt = DAG.getShiftAmountConstant(ShCt, XType, DL);
SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt);
AddToWorklist(Shift.getNode());
@@ -27661,16 +27643,13 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
const APInt &AndMask = ConstAndRHS->getAPIntValue();
if (TLI.shouldFoldSelectWithSingleBitTest(VT, AndMask)) {
unsigned ShCt = AndMask.getBitWidth() - 1;
- SDValue ShlAmt =
- DAG.getConstant(AndMask.countl_zero(), SDLoc(AndLHS),
- getShiftAmountTy(AndLHS.getValueType()));
+ SDValue ShlAmt = DAG.getShiftAmountConstant(AndMask.countl_zero(), VT,
+ SDLoc(AndLHS));
SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);
// Now arithmetic right shift it all the way over, so the result is
// either all-ones, or zero.
- SDValue ShrAmt =
- DAG.getConstant(ShCt, SDLoc(Shl),
- getShiftAmountTy(Shl.getValueType()));
+ SDValue ShrAmt = DAG.getShiftAmountConstant(ShCt, VT, SDLoc(Shl));
SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);
return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
@@ -27718,9 +27697,9 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
return SDValue();
// shl setcc result by log2 n2c
- return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
- DAG.getConstant(ShCt, SDLoc(Temp),
- getShiftAmountTy(Temp.getValueType())));
+ return DAG.getNode(
+ ISD::SHL, DL, N2.getValueType(), Temp,
+ DAG.getShiftAmountConstant(ShCt, N2.getValueType(), SDLoc(Temp)));
}
// select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a058b50..38f8f07 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -5724,21 +5724,20 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
SDValue InOp0 = N->getOperand(0);
if (getTypeAction(InOp0.getValueType()) == TargetLowering::TypePromoteInteger)
- InOp0 = GetPromotedInteger(N->getOperand(0));
+ InOp0 = GetPromotedInteger(InOp0);
EVT InVT = InOp0.getValueType();
+ EVT InSVT = InVT.getVectorElementType();
unsigned OutNumElems = OutVT.getVectorNumElements();
SmallVector<SDValue, 8> Ops;
Ops.reserve(OutNumElems);
for (unsigned i = 0; i != OutNumElems; ++i) {
-
// Extract the element from the original vector.
- SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(),
- BaseIdx, DAG.getConstant(i, dl, BaseIdx.getValueType()));
- SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- InVT.getVectorElementType(), N->getOperand(0), Index);
-
+ SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(), BaseIdx,
+ DAG.getConstant(i, dl, BaseIdx.getValueType()));
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InSVT,
+ N->getOperand(0), Index);
SDValue Op = DAG.getAnyExtOrTrunc(Ext, dl, NOutVTElem);
// Insert the converted element to the new vector.
Ops.push_back(Op);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9624230..9d2deb4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1752,16 +1752,16 @@ SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
}
SDValue SelectionDAG::getShiftAmountConstant(uint64_t Val, EVT VT,
- const SDLoc &DL, bool LegalTypes) {
+ const SDLoc &DL) {
assert(VT.isInteger() && "Shift amount is not an integer type!");
- EVT ShiftVT = TLI->getShiftAmountTy(VT, getDataLayout(), LegalTypes);
+ EVT ShiftVT = TLI->getShiftAmountTy(VT, getDataLayout());
return getConstant(Val, DL, ShiftVT);
}
SDValue SelectionDAG::getShiftAmountConstant(const APInt &Val, EVT VT,
- const SDLoc &DL, bool LegalTypes) {
+ const SDLoc &DL) {
assert(Val.ult(VT.getScalarSizeInBits()) && "Out of range shift");
- return getShiftAmountConstant(Val.getZExtValue(), VT, DL, LegalTypes);
+ return getShiftAmountConstant(Val.getZExtValue(), VT, DL);
}
SDValue SelectionDAG::getVectorIdxConstant(uint64_t Val, const SDLoc &DL,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 8db2708..cc55d53 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -104,6 +104,7 @@
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cstddef>
+#include <deque>
#include <iterator>
#include <limits>
#include <optional>
@@ -7976,6 +7977,37 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, Trunc);
return;
}
+ case Intrinsic::experimental_vector_partial_reduce_add: {
+ SDValue OpNode = getValue(I.getOperand(1));
+ EVT ReducedTy = EVT::getEVT(I.getType());
+ EVT FullTy = OpNode.getValueType();
+
+ unsigned Stride = ReducedTy.getVectorMinNumElements();
+ unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
+
+ // Collect all of the subvectors
+ std::deque<SDValue> Subvectors;
+ Subvectors.push_back(getValue(I.getOperand(0)));
+ for (unsigned i = 0; i < ScaleFactor; i++) {
+ auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
+ Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy,
+ {OpNode, SourceIndex}));
+ }
+
+ // Flatten the subvector tree
+ while (Subvectors.size() > 1) {
+ Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy,
+ {Subvectors[0], Subvectors[1]}));
+ Subvectors.pop_front();
+ Subvectors.pop_front();
+ }
+
+ assert(Subvectors.size() == 1 &&
+ "There should only be one subvector after tree flattening");
+
+ setValue(&I, Subvectors[0]);
+ return;
+ }
case Intrinsic::experimental_cttz_elts: {
auto DL = getCurSDLoc();
SDValue Op = getValue(I.getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index fdd9f1f..458f962 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1880,8 +1880,8 @@ bool TargetLowering::SimplifyDemandedBits(
Flags.setNoSignedWrap(IsNSW);
Flags.setNoUnsignedWrap(IsNUW);
SDValue NewOp = TLO.DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Op0);
- SDValue NewShiftAmt = TLO.DAG.getShiftAmountConstant(
- ShAmt, HalfVT, dl, TLO.LegalTypes());
+ SDValue NewShiftAmt =
+ TLO.DAG.getShiftAmountConstant(ShAmt, HalfVT, dl);
SDValue NewShift = TLO.DAG.getNode(ISD::SHL, dl, HalfVT, NewOp,
NewShiftAmt, Flags);
SDValue NewExt =
@@ -1977,8 +1977,8 @@ bool TargetLowering::SimplifyDemandedBits(
((InDemandedMask.countLeadingZeros() >= (BitWidth / 2)) ||
TLO.DAG.MaskedValueIsZero(Op0, HiBits))) {
SDValue NewOp = TLO.DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Op0);
- SDValue NewShiftAmt = TLO.DAG.getShiftAmountConstant(
- ShAmt, HalfVT, dl, TLO.LegalTypes());
+ SDValue NewShiftAmt =
+ TLO.DAG.getShiftAmountConstant(ShAmt, HalfVT, dl);
SDValue NewShift =
TLO.DAG.getNode(ISD::SRL, dl, HalfVT, NewOp, NewShiftAmt);
return TLO.CombineTo(
@@ -2600,8 +2600,7 @@ bool TargetLowering::SimplifyDemandedBits(
if (!(HighBits & DemandedBits)) {
// None of the shifted in bits are needed. Add a truncate of the
// shift input, then shift it.
- SDValue NewShAmt =
- TLO.DAG.getShiftAmountConstant(ShVal, VT, dl, TLO.LegalTypes());
+ SDValue NewShAmt = TLO.DAG.getShiftAmountConstant(ShVal, VT, dl);
SDValue NewTrunc =
TLO.DAG.getNode(ISD::TRUNCATE, dl, VT, Src.getOperand(0));
return TLO.CombineTo(
@@ -4254,8 +4253,7 @@ SDValue TargetLowering::foldSetCCWithBinOp(EVT VT, SDValue N0, SDValue N1,
return SDValue();
// (X - Y) == Y --> X == Y << 1
- SDValue One =
- DAG.getShiftAmountConstant(1, OpVT, DL, !DCI.isBeforeLegalize());
+ SDValue One = DAG.getShiftAmountConstant(1, OpVT, DL);
SDValue YShl1 = DAG.getNode(ISD::SHL, DL, N1.getValueType(), Y, One);
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(YShl1.getNode());
@@ -5113,8 +5111,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
return DAG.getNode(
ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::SRL, dl, ShValTy, N0,
- DAG.getShiftAmountConstant(
- ShCt, ShValTy, dl, !DCI.isBeforeLegalize())));
+ DAG.getShiftAmountConstant(ShCt, ShValTy, dl)));
}
} else if (Cond == ISD::SETEQ && C1 == AndRHS->getAPIntValue()) {
// (X & 8) == 8 --> (X & 8) >> 3
@@ -5125,8 +5122,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
return DAG.getNode(
ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::SRL, dl, ShValTy, N0,
- DAG.getShiftAmountConstant(
- ShCt, ShValTy, dl, !DCI.isBeforeLegalize())));
+ DAG.getShiftAmountConstant(ShCt, ShValTy, dl)));
}
}
}
@@ -5144,8 +5140,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (!TLI.shouldAvoidTransformToShift(ShValTy, ShiftBits)) {
SDValue Shift = DAG.getNode(
ISD::SRL, dl, ShValTy, N0.getOperand(0),
- DAG.getShiftAmountConstant(ShiftBits, ShValTy, dl,
- !DCI.isBeforeLegalize()));
+ DAG.getShiftAmountConstant(ShiftBits, ShValTy, dl));
SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), dl, ShValTy);
return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond);
}
@@ -5174,8 +5169,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
!TLI.shouldAvoidTransformToShift(ShValTy, ShiftBits)) {
SDValue Shift =
DAG.getNode(ISD::SRL, dl, ShValTy, N0,
- DAG.getShiftAmountConstant(ShiftBits, ShValTy, dl,
- !DCI.isBeforeLegalize()));
+ DAG.getShiftAmountConstant(ShiftBits, ShValTy, dl));
SDValue CmpRHS = DAG.getConstant(NewC, dl, ShValTy);
return DAG.getSetCC(dl, VT, Shift, CmpRHS, NewCond);
}
@@ -6639,7 +6633,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
EVT VT = REMNode.getValueType();
EVT SVT = VT.getScalarType();
- EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout(), !DCI.isBeforeLegalize());
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
EVT ShSVT = ShVT.getScalarType();
// If MUL is unavailable, we cannot proceed in any case.
@@ -6897,7 +6891,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
EVT VT = REMNode.getValueType();
EVT SVT = VT.getScalarType();
- EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout(), !DCI.isBeforeLegalize());
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
EVT ShSVT = ShVT.getScalarType();
// If we are after ops legalization, and MUL is unavailable, we can not
@@ -9599,9 +9593,8 @@ TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
unsigned ShiftIntoIdx =
(DAG.getDataLayout().isBigEndian() ? (NumElem - 1) - Idx : Idx);
- SDValue ShiftAmount =
- DAG.getShiftAmountConstant(ShiftIntoIdx * SrcEltVT.getSizeInBits(),
- LoadVT, SL, /*LegalTypes=*/false);
+ SDValue ShiftAmount = DAG.getShiftAmountConstant(
+ ShiftIntoIdx * SrcEltVT.getSizeInBits(), LoadVT, SL);
SDValue ShiftedElt = DAG.getNode(ISD::SRL, SL, LoadVT, Load, ShiftAmount);
SDValue Elt =
DAG.getNode(ISD::AND, SL, LoadVT, ShiftedElt, SrcEltBitMask);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index ff684c7c..353b0ca 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1052,13 +1052,12 @@ MVT TargetLoweringBase::getScalarShiftAmountTy(const DataLayout &DL,
return MVT::getIntegerVT(DL.getPointerSizeInBits(0));
}
-EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
- bool LegalTypes) const {
+EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy,
+ const DataLayout &DL) const {
assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
if (LHSTy.isVector())
return LHSTy;
- MVT ShiftVT =
- LegalTypes ? getScalarShiftAmountTy(DL, LHSTy) : getPointerTy(DL);
+ MVT ShiftVT = getScalarShiftAmountTy(DL, LHSTy);
  // If any possible shift value won't fit in the preferred type, just use
// something safe. Assume it will be legalized when the shift is expanded.
if (ShiftVT.getSizeInBits() < Log2_32_Ceil(LHSTy.getSizeInBits()))
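
The size check on the last line above exists because the preferred scalar shift-amount type must still be able to encode every shift amount 0..bits-1 of LHSTy. A hedged sketch of that bound, assuming only that Log2_32_Ceil behaves like ceil(log2(n)):

  #include <cstdint>

  // Minimum number of bits needed to represent any shift amount for a value
  // of the given width, i.e. ceil(log2(BitWidth)). For a 256-bit integer the
  // amounts range over 0..255 and need 8 bits, so an i8 shift-amount type is
  // wide enough, while a 4-bit type would trigger the fallback path above.
  static unsigned bitsForShiftAmount(std::uint32_t BitWidth) {
    unsigned Bits = 0;
    while ((std::uint64_t{1} << Bits) < BitWidth)
      ++Bits;
    return Bits;
  }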
diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp
index e18de9a..c5835e8 100644
--- a/llvm/lib/Demangle/MicrosoftDemangle.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp
@@ -2343,12 +2343,13 @@ Demangler::demangleTemplateParameterList(std::string_view &MangledName) {
TP.N = TPRN = Arena.alloc<TemplateParameterReferenceNode>();
TPRN->Symbol = parse(MangledName);
TPRN->Affinity = PointerAffinity::Reference;
- } else if (llvm::itanium_demangle::starts_with(MangledName, "$F") ||
- llvm::itanium_demangle::starts_with(MangledName, "$G")) {
+ } else if (startsWith(MangledName, "$F", "F", !IsAutoNTTP) ||
+ startsWith(MangledName, "$G", "G", !IsAutoNTTP)) {
TP.N = TPRN = Arena.alloc<TemplateParameterReferenceNode>();
// Data member pointer.
- MangledName.remove_prefix(1);
+ if (!IsAutoNTTP)
+ MangledName.remove_prefix(1); // Remove leading '$'
char InheritanceSpecifier = MangledName.front();
MangledName.remove_prefix(1);
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index a9aaff4..79a190f 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -1466,8 +1466,10 @@ void jitLinkForORC(
return;
}
- if (auto Err = OnLoaded(*O.getBinary(), *Info, RTDyld.getSymbolTable()))
+ if (auto Err = OnLoaded(*O.getBinary(), *Info, RTDyld.getSymbolTable())) {
OnEmitted(std::move(O), std::move(Info), std::move(Err));
+ return;
+ }
RuntimeDyldImpl::finalizeAsync(std::move(RTDyld.Dyld), std::move(OnEmitted),
std::move(O), std::move(Info));
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index 8522747..b2ee758 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -652,9 +652,10 @@ static const Value *stripPointerCastsAndOffsets(
}
V = GEP->getPointerOperand();
} else if (Operator::getOpcode(V) == Instruction::BitCast) {
- V = cast<Operator>(V)->getOperand(0);
- if (!V->getType()->isPointerTy())
+ Value *NewV = cast<Operator>(V)->getOperand(0);
+ if (!NewV->getType()->isPointerTy())
return V;
+ V = NewV;
} else if (StripKind != PSK_ZeroIndicesSameRepresentation &&
Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
// TODO: If we know an address space cast will not change the
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c98f61d..44982f5 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6143,6 +6143,20 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
}
break;
}
+ case Intrinsic::experimental_vector_partial_reduce_add: {
+ VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType());
+ VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType());
+
+ unsigned VecWidth = VecTy->getElementCount().getKnownMinValue();
+ unsigned AccWidth = AccTy->getElementCount().getKnownMinValue();
+
+ Check((VecWidth % AccWidth) == 0,
+ "Invalid vector widths for partial "
+ "reduction. The width of the input vector "
+ "must be a positive integer multiple of "
+ "the width of the accumulator vector.");
+ break;
+ }
case Intrinsic::experimental_noalias_scope_decl: {
NoAliasScopeDecls.push_back(cast<IntrinsicInst>(&Call));
break;
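
The verifier rule added above only constrains the relative element counts of the two operands. A minimal sketch of that predicate in isolation, with the known-minimum element counts passed as plain integers (parameter names are illustrative, not the LLVM API):

  #include <cassert>

  // Returns true when reducing an input vector with VecWidth elements into an
  // accumulator with AccWidth elements is well-formed, i.e. the input width is
  // a whole multiple of the accumulator width. For example <16 x i32> into
  // <4 x i32> is accepted (16 % 4 == 0), while <12 x i32> into <5 x i32> is
  // rejected (12 % 5 != 0).
  static bool isValidPartialReduction(unsigned VecWidth, unsigned AccWidth) {
    assert(AccWidth != 0 && "accumulator must have at least one element");
    return VecWidth % AccWidth == 0;
  }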
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index e6e6b7d..a30a64b 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -511,12 +511,6 @@ static void CheckBundleSubtargets(const MCSubtargetInfo *OldSTI,
void MCELFStreamer::emitInstToData(const MCInst &Inst,
const MCSubtargetInfo &STI) {
MCAssembler &Assembler = getAssembler();
- SmallVector<MCFixup, 4> Fixups;
- SmallString<256> Code;
- Assembler.getEmitter().encodeInstruction(Inst, Code, Fixups, STI);
-
- for (auto &Fixup : Fixups)
- fixSymbolsInTLSFixups(Fixup.getValue());
// There are several possibilities here:
//
@@ -526,9 +520,7 @@ void MCELFStreamer::emitInstToData(const MCInst &Inst,
//
// If bundling is enabled:
// - If we're not in a bundle-locked group, emit the instruction into a
- // fragment of its own. If there are no fixups registered for the
- // instruction, emit a MCCompactEncodedInstFragment. Otherwise, emit a
- // MCDataFragment.
+ // fragment of its own.
// - If we're in a bundle-locked group, append the instruction to the current
// data fragment because we want all the instructions in a group to get into
// the same fragment. Be careful not to do that for the first instruction in
@@ -542,16 +534,6 @@ void MCELFStreamer::emitInstToData(const MCInst &Inst,
// The bundle-locking directive ensures this is a new data fragment.
DF = cast<MCDataFragment>(getCurrentFragment());
CheckBundleSubtargets(DF->getSubtargetInfo(), &STI);
- } else if (!isBundleLocked() && Fixups.size() == 0) {
- // Optimize memory usage by emitting the instruction to a
- // MCCompactEncodedInstFragment when not in a bundle-locked group and
- // there are no fixups registered.
- MCCompactEncodedInstFragment *CEIF =
- getContext().allocFragment<MCCompactEncodedInstFragment>();
- insert(CEIF);
- CEIF->getContents().append(Code.begin(), Code.end());
- CEIF->setHasInstructions(STI);
- return;
} else {
DF = getContext().allocFragment<MCDataFragment>();
insert(DF);
@@ -571,17 +553,22 @@ void MCELFStreamer::emitInstToData(const MCInst &Inst,
DF = getOrCreateDataFragment(&STI);
}
- // Add the fixups and data.
+ // Emit instruction directly into data fragment.
+ size_t FixupStartIndex = DF->getFixups().size();
+ size_t CodeOffset = DF->getContents().size();
+ Assembler.getEmitter().encodeInstruction(Inst, DF->getContents(),
+ DF->getFixups(), STI);
+
+ auto Fixups = MutableArrayRef(DF->getFixups()).slice(FixupStartIndex);
for (auto &Fixup : Fixups) {
- Fixup.setOffset(Fixup.getOffset() + DF->getContents().size());
- DF->getFixups().push_back(Fixup);
+ Fixup.setOffset(Fixup.getOffset() + CodeOffset);
+ fixSymbolsInTLSFixups(Fixup.getValue());
}
DF->setHasInstructions(STI);
if (!Fixups.empty() && Fixups.back().getTargetKind() ==
getAssembler().getBackend().RelaxFixupKind)
DF->setLinkerRelaxable();
- DF->getContents().append(Code.begin(), Code.end());
}
void MCELFStreamer::emitBundleAlignMode(Align Alignment) {
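
The refactor above encodes the instruction straight into the data fragment's buffers and then rebases only the fixups appended by that encode, using the contents size recorded beforehand. A sketch of the same bookkeeping with plain containers; Fixup here is a stand-in for MCFixup, not the real type:

  #include <cstddef>
  #include <string>
  #include <vector>

  struct Fixup {
    std::size_t Offset; // relative to the start of whatever buffer owns it
  };

  // Append Code and its Fixups (whose offsets are relative to Code) into a
  // shared fragment buffer, rewriting only the new fixups so their offsets
  // become relative to the fragment instead.
  static void appendToFragment(std::string &FragContents,
                               std::vector<Fixup> &FragFixups,
                               const std::string &Code,
                               const std::vector<Fixup> &NewFixups) {
    std::size_t CodeOffset = FragContents.size(); // size before this append
    std::size_t FixupStart = FragFixups.size();
    FragContents += Code;
    FragFixups.insert(FragFixups.end(), NewFixups.begin(), NewFixups.end());
    for (std::size_t I = FixupStart; I != FragFixups.size(); ++I)
      FragFixups[I].Offset += CodeOffset;
  }

The TLS symbol rewriting then only has to walk the freshly appended slice, which is what the MutableArrayRef::slice call above does.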
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index afe5da6..a72e34f 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -393,10 +393,8 @@ void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
getContext().allocFragment<MCRelaxableFragment>(Inst, STI);
insert(IF);
- SmallString<128> Code;
- getAssembler().getEmitter().encodeInstruction(Inst, Code, IF->getFixups(),
- STI);
- IF->getContents().append(Code.begin(), Code.end());
+ getAssembler().getEmitter().encodeInstruction(Inst, IF->getContents(),
+ IF->getFixups(), STI);
}
#ifndef NDEBUG
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index 47e1ae4..ea2f157 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -103,6 +103,7 @@ const char *Instruction::getOpcodeName(Opcode Opc) {
#define DEF_INSTR(ID, OPC, CLASS) OPC
#include "llvm/SandboxIR/SandboxIRValues.def"
}
+ llvm_unreachable("Unknown Opcode");
}
bool Instruction::classof(const sandboxir::Value *From) {
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 3017a9b..3664de7 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -4118,6 +4118,199 @@ namespace {
exp += FirstSignificant;
buffer.erase(&buffer[0], &buffer[FirstSignificant]);
}
+
+ void toStringImpl(SmallVectorImpl<char> &Str, const bool isNeg, int exp,
+ APInt significand, unsigned FormatPrecision,
+ unsigned FormatMaxPadding, bool TruncateZero) {
+ const int semanticsPrecision = significand.getBitWidth();
+
+ if (isNeg)
+ Str.push_back('-');
+
+ // Set FormatPrecision if zero. We want to do this before we
+ // truncate trailing zeros, as those are part of the precision.
+ if (!FormatPrecision) {
+ // We use enough digits so the number can be round-tripped back to an
+ // APFloat. The formula comes from "How to Print Floating-Point Numbers
+ // Accurately" by Steele and White.
+ // FIXME: Using a formula based purely on the precision is conservative;
+ // we can print fewer digits depending on the actual value being printed.
+
+ // FormatPrecision = 2 + floor(significandBits / lg_2(10))
+ FormatPrecision = 2 + semanticsPrecision * 59 / 196;
+ }
+
+ // Ignore trailing binary zeros.
+ int trailingZeros = significand.countr_zero();
+ exp += trailingZeros;
+ significand.lshrInPlace(trailingZeros);
+
+ // Change the exponent from 2^e to 10^e.
+ if (exp == 0) {
+ // Nothing to do.
+ } else if (exp > 0) {
+ // Just shift left.
+ significand = significand.zext(semanticsPrecision + exp);
+ significand <<= exp;
+ exp = 0;
+ } else { /* exp < 0 */
+ int texp = -exp;
+
+ // We transform this using the identity:
+ // (N)(2^-e) == (N)(5^e)(10^-e)
+ // This means we have to multiply N (the significand) by 5^e.
+ // To avoid overflow, we have to operate on numbers large
+ // enough to store N * 5^e:
+ // log2(N * 5^e) == log2(N) + e * log2(5)
+ // <= semantics->precision + e * 137 / 59
+ // (log_2(5) ~ 2.321928 < 2.322034 ~ 137/59)
+
+ unsigned precision = semanticsPrecision + (137 * texp + 136) / 59;
+
+ // Multiply significand by 5^e.
+ // N * 5^0101 == N * 5^(1*1) * 5^(0*2) * 5^(1*4) * 5^(0*8)
+ significand = significand.zext(precision);
+ APInt five_to_the_i(precision, 5);
+ while (true) {
+ if (texp & 1)
+ significand *= five_to_the_i;
+
+ texp >>= 1;
+ if (!texp)
+ break;
+ five_to_the_i *= five_to_the_i;
+ }
+ }
+
+ AdjustToPrecision(significand, exp, FormatPrecision);
+
+ SmallVector<char, 256> buffer;
+
+ // Fill the buffer.
+ unsigned precision = significand.getBitWidth();
+ if (precision < 4) {
+ // We need enough precision to store the value 10.
+ precision = 4;
+ significand = significand.zext(precision);
+ }
+ APInt ten(precision, 10);
+ APInt digit(precision, 0);
+
+ bool inTrail = true;
+ while (significand != 0) {
+ // digit <- significand % 10
+ // significand <- significand / 10
+ APInt::udivrem(significand, ten, significand, digit);
+
+ unsigned d = digit.getZExtValue();
+
+ // Drop trailing zeros.
+ if (inTrail && !d)
+ exp++;
+ else {
+ buffer.push_back((char) ('0' + d));
+ inTrail = false;
+ }
+ }
+
+ assert(!buffer.empty() && "no characters in buffer!");
+
+ // Drop down to FormatPrecision.
+ // TODO: don't do more precise calculations above than are required.
+ AdjustToPrecision(buffer, exp, FormatPrecision);
+
+ unsigned NDigits = buffer.size();
+
+ // Check whether we should use scientific notation.
+ bool FormatScientific;
+ if (!FormatMaxPadding)
+ FormatScientific = true;
+ else {
+ if (exp >= 0) {
+ // 765e3 --> 765000
+ // ^^^
+ // But we shouldn't make the number look more precise than it is.
+ FormatScientific = ((unsigned) exp > FormatMaxPadding ||
+ NDigits + (unsigned) exp > FormatPrecision);
+ } else {
+ // Power of the most significant digit.
+ int MSD = exp + (int) (NDigits - 1);
+ if (MSD >= 0) {
+ // 765e-2 == 7.65
+ FormatScientific = false;
+ } else {
+ // 765e-5 == 0.00765
+ // ^ ^^
+ FormatScientific = ((unsigned) -MSD) > FormatMaxPadding;
+ }
+ }
+ }
+
+ // Scientific formatting is pretty straightforward.
+ if (FormatScientific) {
+ exp += (NDigits - 1);
+
+ Str.push_back(buffer[NDigits-1]);
+ Str.push_back('.');
+ if (NDigits == 1 && TruncateZero)
+ Str.push_back('0');
+ else
+ for (unsigned I = 1; I != NDigits; ++I)
+ Str.push_back(buffer[NDigits-1-I]);
+ // Fill with zeros up to FormatPrecision.
+ if (!TruncateZero && FormatPrecision > NDigits - 1)
+ Str.append(FormatPrecision - NDigits + 1, '0');
+ // For !TruncateZero we use lower 'e'.
+ Str.push_back(TruncateZero ? 'E' : 'e');
+
+ Str.push_back(exp >= 0 ? '+' : '-');
+ if (exp < 0)
+ exp = -exp;
+ SmallVector<char, 6> expbuf;
+ do {
+ expbuf.push_back((char) ('0' + (exp % 10)));
+ exp /= 10;
+ } while (exp);
+ // Exponent always at least two digits if we do not truncate zeros.
+ if (!TruncateZero && expbuf.size() < 2)
+ expbuf.push_back('0');
+ for (unsigned I = 0, E = expbuf.size(); I != E; ++I)
+ Str.push_back(expbuf[E-1-I]);
+ return;
+ }
+
+ // Non-scientific, positive exponents.
+ if (exp >= 0) {
+ for (unsigned I = 0; I != NDigits; ++I)
+ Str.push_back(buffer[NDigits-1-I]);
+ for (unsigned I = 0; I != (unsigned) exp; ++I)
+ Str.push_back('0');
+ return;
+ }
+
+ // Non-scientific, negative exponents.
+
+ // The number of digits to the left of the decimal point.
+ int NWholeDigits = exp + (int) NDigits;
+
+ unsigned I = 0;
+ if (NWholeDigits > 0) {
+ for (; I != (unsigned) NWholeDigits; ++I)
+ Str.push_back(buffer[NDigits-I-1]);
+ Str.push_back('.');
+ } else {
+ unsigned NZeros = 1 + (unsigned) -NWholeDigits;
+
+ Str.push_back('0');
+ Str.push_back('.');
+ for (unsigned Z = 1; Z != NZeros; ++Z)
+ Str.push_back('0');
+ }
+
+ for (; I != NDigits; ++I)
+ Str.push_back(buffer[NDigits-I-1]);
+
+ }
} // namespace
void IEEEFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision,
@@ -4152,193 +4345,15 @@ void IEEEFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision,
break;
}
- if (isNegative())
- Str.push_back('-');
-
// Decompose the number into an APInt and an exponent.
int exp = exponent - ((int) semantics->precision - 1);
APInt significand(
semantics->precision,
ArrayRef(significandParts(), partCountForBits(semantics->precision)));
- // Set FormatPrecision if zero. We want to do this before we
- // truncate trailing zeros, as those are part of the precision.
- if (!FormatPrecision) {
- // We use enough digits so the number can be round-tripped back to an
- // APFloat. The formula comes from "How to Print Floating-Point Numbers
- // Accurately" by Steele and White.
- // FIXME: Using a formula based purely on the precision is conservative;
- // we can print fewer digits depending on the actual value being printed.
-
- // FormatPrecision = 2 + floor(significandBits / lg_2(10))
- FormatPrecision = 2 + semantics->precision * 59 / 196;
- }
-
- // Ignore trailing binary zeros.
- int trailingZeros = significand.countr_zero();
- exp += trailingZeros;
- significand.lshrInPlace(trailingZeros);
-
- // Change the exponent from 2^e to 10^e.
- if (exp == 0) {
- // Nothing to do.
- } else if (exp > 0) {
- // Just shift left.
- significand = significand.zext(semantics->precision + exp);
- significand <<= exp;
- exp = 0;
- } else { /* exp < 0 */
- int texp = -exp;
-
- // We transform this using the identity:
- // (N)(2^-e) == (N)(5^e)(10^-e)
- // This means we have to multiply N (the significand) by 5^e.
- // To avoid overflow, we have to operate on numbers large
- // enough to store N * 5^e:
- // log2(N * 5^e) == log2(N) + e * log2(5)
- // <= semantics->precision + e * 137 / 59
- // (log_2(5) ~ 2.321928 < 2.322034 ~ 137/59)
-
- unsigned precision = semantics->precision + (137 * texp + 136) / 59;
-
- // Multiply significand by 5^e.
- // N * 5^0101 == N * 5^(1*1) * 5^(0*2) * 5^(1*4) * 5^(0*8)
- significand = significand.zext(precision);
- APInt five_to_the_i(precision, 5);
- while (true) {
- if (texp & 1) significand *= five_to_the_i;
-
- texp >>= 1;
- if (!texp) break;
- five_to_the_i *= five_to_the_i;
- }
- }
-
- AdjustToPrecision(significand, exp, FormatPrecision);
-
- SmallVector<char, 256> buffer;
-
- // Fill the buffer.
- unsigned precision = significand.getBitWidth();
- if (precision < 4) {
- // We need enough precision to store the value 10.
- precision = 4;
- significand = significand.zext(precision);
- }
- APInt ten(precision, 10);
- APInt digit(precision, 0);
-
- bool inTrail = true;
- while (significand != 0) {
- // digit <- significand % 10
- // significand <- significand / 10
- APInt::udivrem(significand, ten, significand, digit);
-
- unsigned d = digit.getZExtValue();
-
- // Drop trailing zeros.
- if (inTrail && !d) exp++;
- else {
- buffer.push_back((char) ('0' + d));
- inTrail = false;
- }
- }
-
- assert(!buffer.empty() && "no characters in buffer!");
-
- // Drop down to FormatPrecision.
- // TODO: don't do more precise calculations above than are required.
- AdjustToPrecision(buffer, exp, FormatPrecision);
-
- unsigned NDigits = buffer.size();
-
- // Check whether we should use scientific notation.
- bool FormatScientific;
- if (!FormatMaxPadding)
- FormatScientific = true;
- else {
- if (exp >= 0) {
- // 765e3 --> 765000
- // ^^^
- // But we shouldn't make the number look more precise than it is.
- FormatScientific = ((unsigned) exp > FormatMaxPadding ||
- NDigits + (unsigned) exp > FormatPrecision);
- } else {
- // Power of the most significant digit.
- int MSD = exp + (int) (NDigits - 1);
- if (MSD >= 0) {
- // 765e-2 == 7.65
- FormatScientific = false;
- } else {
- // 765e-5 == 0.00765
- // ^ ^^
- FormatScientific = ((unsigned) -MSD) > FormatMaxPadding;
- }
- }
- }
-
- // Scientific formatting is pretty straightforward.
- if (FormatScientific) {
- exp += (NDigits - 1);
-
- Str.push_back(buffer[NDigits-1]);
- Str.push_back('.');
- if (NDigits == 1 && TruncateZero)
- Str.push_back('0');
- else
- for (unsigned I = 1; I != NDigits; ++I)
- Str.push_back(buffer[NDigits-1-I]);
- // Fill with zeros up to FormatPrecision.
- if (!TruncateZero && FormatPrecision > NDigits - 1)
- Str.append(FormatPrecision - NDigits + 1, '0');
- // For !TruncateZero we use lower 'e'.
- Str.push_back(TruncateZero ? 'E' : 'e');
-
- Str.push_back(exp >= 0 ? '+' : '-');
- if (exp < 0) exp = -exp;
- SmallVector<char, 6> expbuf;
- do {
- expbuf.push_back((char) ('0' + (exp % 10)));
- exp /= 10;
- } while (exp);
- // Exponent always at least two digits if we do not truncate zeros.
- if (!TruncateZero && expbuf.size() < 2)
- expbuf.push_back('0');
- for (unsigned I = 0, E = expbuf.size(); I != E; ++I)
- Str.push_back(expbuf[E-1-I]);
- return;
- }
-
- // Non-scientific, positive exponents.
- if (exp >= 0) {
- for (unsigned I = 0; I != NDigits; ++I)
- Str.push_back(buffer[NDigits-1-I]);
- for (unsigned I = 0; I != (unsigned) exp; ++I)
- Str.push_back('0');
- return;
- }
-
- // Non-scientific, negative exponents.
-
- // The number of digits to the left of the decimal point.
- int NWholeDigits = exp + (int) NDigits;
-
- unsigned I = 0;
- if (NWholeDigits > 0) {
- for (; I != (unsigned) NWholeDigits; ++I)
- Str.push_back(buffer[NDigits-I-1]);
- Str.push_back('.');
- } else {
- unsigned NZeros = 1 + (unsigned) -NWholeDigits;
-
- Str.push_back('0');
- Str.push_back('.');
- for (unsigned Z = 1; Z != NZeros; ++Z)
- Str.push_back('0');
- }
+ toStringImpl(Str, isNegative(), exp, significand, FormatPrecision,
+ FormatMaxPadding, TruncateZero);
- for (; I != NDigits; ++I)
- Str.push_back(buffer[NDigits-I-1]);
}
bool IEEEFloat::getExactInverse(APFloat *inv) const {
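
The extraction of toStringImpl above is a pure code move, but the two numeric tricks it carries over are easy to miss: the default precision 2 + bits*59/196 approximates 2 + bits*log10(2) (59/196 is roughly 0.30102, close to log10(2)), and the negative-exponent path multiplies the significand by 5^e with square-and-multiply. A hedged sketch of the latter on a plain 64-bit integer, assuming the result does not overflow; the real code widens the APInt first.

  #include <cstdint>

  // Compute N * 5^E by binary exponentiation, mirroring the loop in
  // toStringImpl that rewrites N * 2^-E as (N * 5^E) * 10^-E.
  static std::uint64_t mulPow5(std::uint64_t N, unsigned E) {
    std::uint64_t FiveToTheI = 5; // holds 5^(2^i) on iteration i
    while (E) {
      if (E & 1)
        N *= FiveToTheI;
      E >>= 1;
      if (E)
        FiveToTheI *= FiveToTheI;
    }
    return N;
  }

For example mulPow5(3, 3) walks 3 -> 15 -> 375, i.e. 3 * 125, the same sequence the APInt loop produces.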
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 6dc0c86..ecc487a 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -1733,9 +1733,9 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
} else if (!ConsumeAfterOpt) {
// Positional args have already been handled if ConsumeAfter is specified.
unsigned ValNo = 0, NumVals = static_cast<unsigned>(PositionalVals.size());
- for (size_t i = 0, e = PositionalOpts.size(); i != e; ++i) {
- if (RequiresValue(PositionalOpts[i])) {
- ProvidePositionalOption(PositionalOpts[i], PositionalVals[ValNo].first,
+ for (Option *Opt : PositionalOpts) {
+ if (RequiresValue(Opt)) {
+ ProvidePositionalOption(Opt, PositionalVals[ValNo].first,
PositionalVals[ValNo].second);
ValNo++;
--NumPositionalRequired; // We fulfilled our duty...
@@ -1745,16 +1745,15 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
// do not give it values that others need. 'Done' controls whether the
// option even _WANTS_ any more.
//
- bool Done = PositionalOpts[i]->getNumOccurrencesFlag() == cl::Required;
+ bool Done = Opt->getNumOccurrencesFlag() == cl::Required;
while (NumVals - ValNo > NumPositionalRequired && !Done) {
- switch (PositionalOpts[i]->getNumOccurrencesFlag()) {
+ switch (Opt->getNumOccurrencesFlag()) {
case cl::Optional:
Done = true; // Optional arguments want _at most_ one value
[[fallthrough]];
case cl::ZeroOrMore: // Zero or more will take all they can get...
case cl::OneOrMore: // One or more will take all they can get...
- ProvidePositionalOption(PositionalOpts[i],
- PositionalVals[ValNo].first,
+ ProvidePositionalOption(Opt, PositionalVals[ValNo].first,
PositionalVals[ValNo].second);
ValNo++;
break;
@@ -1767,11 +1766,10 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
} else {
assert(ConsumeAfterOpt && NumPositionalRequired <= PositionalVals.size());
unsigned ValNo = 0;
- for (size_t J = 0, E = PositionalOpts.size(); J != E; ++J)
- if (RequiresValue(PositionalOpts[J])) {
- ErrorParsing |= ProvidePositionalOption(PositionalOpts[J],
- PositionalVals[ValNo].first,
- PositionalVals[ValNo].second);
+ for (Option *Opt : PositionalOpts)
+ if (RequiresValue(Opt)) {
+ ErrorParsing |= ProvidePositionalOption(
+ Opt, PositionalVals[ValNo].first, PositionalVals[ValNo].second);
ValNo++;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
index 41bbc00..7660de5 100644
--- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
@@ -183,15 +183,12 @@ static const struct ThunkNameAndReg {
namespace {
struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> {
const char *getThunkPrefix() { return SLSBLRNamePrefix; }
- bool mayUseThunk(const MachineFunction &MF, bool InsertedThunks) {
- if (InsertedThunks)
- return false;
+ bool mayUseThunk(const MachineFunction &MF) {
ComdatThunks &= !MF.getSubtarget<AArch64Subtarget>().hardenSlsNoComdat();
- // FIXME: This could also check if there are any BLRs in the function
- // to more accurately reflect if a thunk will be needed.
return MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr();
}
- bool insertThunks(MachineModuleInfo &MMI, MachineFunction &MF);
+ bool insertThunks(MachineModuleInfo &MMI, MachineFunction &MF,
+ bool ExistingThunks);
void populateThunk(MachineFunction &MF);
private:
@@ -200,7 +197,10 @@ private:
} // namespace
bool SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI,
- MachineFunction &MF) {
+ MachineFunction &MF,
+ bool ExistingThunks) {
+ if (ExistingThunks)
+ return false;
// FIXME: It probably would be possible to filter which thunks to produce
// based on which registers are actually used in BLR instructions in this
// function. But would that be a worthwhile optimization?
@@ -210,6 +210,8 @@ bool SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI,
}
void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) {
+ assert(MF.getFunction().hasComdat() == ComdatThunks &&
+ "ComdatThunks value changed since MF creation");
// FIXME: How to better communicate Register number, rather than through
// name and lookup table?
assert(MF.getName().starts_with(getThunkPrefix()));
@@ -411,30 +413,13 @@ FunctionPass *llvm::createAArch64SLSHardeningPass() {
}
namespace {
-class AArch64IndirectThunks : public MachineFunctionPass {
+class AArch64IndirectThunks : public ThunkInserterPass<SLSBLRThunkInserter> {
public:
static char ID;
- AArch64IndirectThunks() : MachineFunctionPass(ID) {}
+ AArch64IndirectThunks() : ThunkInserterPass(ID) {}
StringRef getPassName() const override { return "AArch64 Indirect Thunks"; }
-
- bool doInitialization(Module &M) override;
- bool runOnMachineFunction(MachineFunction &MF) override;
-
-private:
- std::tuple<SLSBLRThunkInserter> TIs;
-
- template <typename... ThunkInserterT>
- static void initTIs(Module &M,
- std::tuple<ThunkInserterT...> &ThunkInserters) {
- (..., std::get<ThunkInserterT>(ThunkInserters).init(M));
- }
- template <typename... ThunkInserterT>
- static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
- std::tuple<ThunkInserterT...> &ThunkInserters) {
- return (0 | ... | std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF));
- }
};
} // end anonymous namespace
@@ -444,14 +429,3 @@ char AArch64IndirectThunks::ID = 0;
FunctionPass *llvm::createAArch64IndirectThunks() {
return new AArch64IndirectThunks();
}
-
-bool AArch64IndirectThunks::doInitialization(Module &M) {
- initTIs(M, TIs);
- return false;
-}
-
-bool AArch64IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << getPassName() << '\n');
- auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
- return runTIs(MMI, MF, TIs);
-}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index eb60b96..0ee8136 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -61,6 +61,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
cl::init(true), cl::Hidden);
+// A complete guess as to a reasonable cost.
+static cl::opt<unsigned>
+ BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
+ cl::desc("The cost of a histcnt instruction"));
+
namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
@@ -508,11 +513,39 @@ static bool isUnpackedVectorVT(EVT VecVT) {
VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}
+static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
+ Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
+ Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
+
+ // Only allow (32b and 64b) integers or pointers for now...
+ if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
+ (EltTy->getScalarSizeInBits() != 32 &&
+ EltTy->getScalarSizeInBits() != 64))
+ return InstructionCost::getInvalid();
+
+ // FIXME: Hacky check for legal vector types. We can promote smaller types
+ // but we cannot legalize vectors via splitting for histcnt.
+ // FIXME: We should be able to generate histcnt for fixed-length vectors
+ // using ptrue with a specific VL.
+ if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy))
+ if ((VTy->getElementCount().getKnownMinValue() != 2 &&
+ VTy->getElementCount().getKnownMinValue() != 4) ||
+ VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
+ !VTy->isScalableTy())
+ return InstructionCost::getInvalid();
+
+ return InstructionCost(BaseHistCntCost);
+}
+
InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
auto *RetTy = ICA.getReturnType();
switch (ICA.getID()) {
+ case Intrinsic::experimental_vector_histogram_add:
+ if (!ST->hasSVE2())
+ return InstructionCost::getInvalid();
+ return getHistogramCost(ICA);
case Intrinsic::umin:
case Intrinsic::umax:
case Intrinsic::smin:
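
getHistogramCost above only returns the tunable base cost when both the bucket element type and the bucket-pointer vector shape can realistically be selected for SVE2 histcnt. A hedged restatement of that gating with the type queries flattened into plain parameters, assuming the bucket pointers are a vector type; the parameter names are illustrative, not LLVM API:

  #include <optional>

  static std::optional<unsigned>
  histogramCost(bool EltIsIntOrPtr, unsigned EltBits, bool PtrsAreScalable,
                unsigned PtrsMinElts, unsigned PtrsMinBits, unsigned BaseCost) {
    // Bucket elements must be 32- or 64-bit integers or pointers.
    if (!EltIsIntOrPtr || (EltBits != 32 && EltBits != 64))
      return std::nullopt; // invalid cost
    // The pointer vector must be scalable with 2 or 4 elements and at most
    // 128 known-minimum bits; anything else cannot be legalized for histcnt.
    if ((PtrsMinElts != 2 && PtrsMinElts != 4) || PtrsMinBits > 128 ||
        !PtrsAreScalable)
      return std::nullopt;
    return BaseCost;
  }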
diff --git a/llvm/lib/Target/ARM/ARMSLSHardening.cpp b/llvm/lib/Target/ARM/ARMSLSHardening.cpp
index d9ff14e..d77db17 100644
--- a/llvm/lib/Target/ARM/ARMSLSHardening.cpp
+++ b/llvm/lib/Target/ARM/ARMSLSHardening.cpp
@@ -163,7 +163,7 @@ static const struct ThunkNameRegMode {
// An enum for tracking whether Arm and Thumb thunks have been inserted into the
// current module so far.
-enum ArmInsertedThunks { ArmThunk = 1, ThumbThunk = 2 };
+enum ArmInsertedThunks { NoThunk = 0, ArmThunk = 1, ThumbThunk = 2 };
inline ArmInsertedThunks &operator|=(ArmInsertedThunks &X,
ArmInsertedThunks Y) {
@@ -174,19 +174,12 @@ namespace {
struct SLSBLRThunkInserter
: ThunkInserter<SLSBLRThunkInserter, ArmInsertedThunks> {
const char *getThunkPrefix() { return SLSBLRNamePrefix; }
- bool mayUseThunk(const MachineFunction &MF,
- ArmInsertedThunks InsertedThunks) {
- if ((InsertedThunks & ArmThunk &&
- !MF.getSubtarget<ARMSubtarget>().isThumb()) ||
- (InsertedThunks & ThumbThunk &&
- MF.getSubtarget<ARMSubtarget>().isThumb()))
- return false;
+ bool mayUseThunk(const MachineFunction &MF) {
ComdatThunks &= !MF.getSubtarget<ARMSubtarget>().hardenSlsNoComdat();
- // FIXME: This could also check if there are any indirect calls in the
- // function to more accurately reflect if a thunk will be needed.
return MF.getSubtarget<ARMSubtarget>().hardenSlsBlr();
}
- ArmInsertedThunks insertThunks(MachineModuleInfo &MMI, MachineFunction &MF);
+ ArmInsertedThunks insertThunks(MachineModuleInfo &MMI, MachineFunction &MF,
+ ArmInsertedThunks InsertedThunks);
void populateThunk(MachineFunction &MF);
private:
@@ -194,8 +187,14 @@ private:
};
} // namespace
-ArmInsertedThunks SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI,
- MachineFunction &MF) {
+ArmInsertedThunks
+SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI, MachineFunction &MF,
+ ArmInsertedThunks InsertedThunks) {
+ if ((InsertedThunks & ArmThunk &&
+ !MF.getSubtarget<ARMSubtarget>().isThumb()) ||
+ (InsertedThunks & ThumbThunk &&
+ MF.getSubtarget<ARMSubtarget>().isThumb()))
+ return NoThunk;
// FIXME: It probably would be possible to filter which thunks to produce
// based on which registers are actually used in indirect calls in this
// function. But would that be a worthwhile optimization?
@@ -208,6 +207,8 @@ ArmInsertedThunks SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI,
}
void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) {
+ assert(MF.getFunction().hasComdat() == ComdatThunks &&
+ "ComdatThunks value changed since MF creation");
// FIXME: How to better communicate Register number, rather than through
// name and lookup table?
assert(MF.getName().starts_with(getThunkPrefix()));
@@ -384,38 +385,14 @@ FunctionPass *llvm::createARMSLSHardeningPass() {
}
namespace {
-class ARMIndirectThunks : public MachineFunctionPass {
+class ARMIndirectThunks : public ThunkInserterPass<SLSBLRThunkInserter> {
public:
static char ID;
- ARMIndirectThunks() : MachineFunctionPass(ID) {}
+ ARMIndirectThunks() : ThunkInserterPass(ID) {}
StringRef getPassName() const override { return "ARM Indirect Thunks"; }
-
- bool doInitialization(Module &M) override;
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineModuleInfoWrapperPass>();
- AU.addPreserved<MachineModuleInfoWrapperPass>();
- }
-
-private:
- std::tuple<SLSBLRThunkInserter> TIs;
-
- template <typename... ThunkInserterT>
- static void initTIs(Module &M,
- std::tuple<ThunkInserterT...> &ThunkInserters) {
- (..., std::get<ThunkInserterT>(ThunkInserters).init(M));
- }
- template <typename... ThunkInserterT>
- static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
- std::tuple<ThunkInserterT...> &ThunkInserters) {
- return (0 | ... | std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF));
- }
};
-
} // end anonymous namespace
char ARMIndirectThunks::ID = 0;
@@ -423,14 +400,3 @@ char ARMIndirectThunks::ID = 0;
FunctionPass *llvm::createARMIndirectThunks() {
return new ARMIndirectThunks();
}
-
-bool ARMIndirectThunks::doInitialization(Module &M) {
- initTIs(M, TIs);
- return false;
-}
-
-bool ARMIndirectThunks::runOnMachineFunction(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << getPassName() << '\n');
- auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
- return runTIs(MMI, MF, TIs);
-}
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 7b3057b..697a22b 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1262,6 +1262,10 @@ def FeatureUnalignedVectorMem
def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
"UsePostRAScheduler", "true", "Schedule again after register allocation">;
+def FeaturePredictableSelectIsExpensive
+ : SubtargetFeature<"predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+ "Prefer likely predicted branches over selects">;
+
def TuneNoOptimizedZeroStrideLoad
: SubtargetFeature<"no-optimized-zero-stride-load", "HasOptimizedZeroStrideLoad",
"false", "Hasn't optimized (perform fewer memory operations)"
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 45368a0..022b8bc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1518,6 +1518,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// Disable strict node mutation.
IsStrictFPEnabled = true;
+
+ // Let the subtarget decide if a predictable select is more expensive than the
+ // corresponding branch. This information is used in CGP/SelectOpt to decide
+ // when to convert selects into branches.
+ PredictableSelectIsExpensive = Subtarget.predictableSelectIsExpensive();
}
EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL,
@@ -7685,7 +7690,11 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (SDValue NewSel = foldBinOpIntoSelectIfProfitable(*Op->use_begin(),
DAG, Subtarget)) {
DAG.ReplaceAllUsesWith(BinOp, &NewSel);
- return lowerSELECT(NewSel, DAG);
+ // Opcode check is necessary because foldBinOpIntoSelectIfProfitable
+ // may return a constant node and cause a crash in lowerSELECT.
+ if (NewSel.getOpcode() == ISD::SELECT)
+ return lowerSELECT(NewSel, DAG);
+ return NewSel;
}
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index a1ae8a1..9c76bb1 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -956,6 +956,12 @@ void RISCVInsertVSETVLI::forwardVSETVLIAVL(VSETVLIInfo &Info) const {
VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI);
if (!DefInstrInfo.hasSameVLMAX(Info))
return;
+ // If the AVL is a register with multiple definitions, don't forward it. We
+ // might not be able to extend its LiveInterval without clobbering other val
+ // nums.
+ if (DefInstrInfo.hasAVLReg() &&
+ !LIS->getInterval(DefInstrInfo.getAVLReg()).containsOneValue())
+ return;
Info.setAVL(DefInstrInfo);
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 176d0e7..d603138 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -959,6 +959,32 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return Cost * LT.first;
break;
}
+ // vp integer arithmetic ops.
+ case Intrinsic::vp_add:
+ case Intrinsic::vp_and:
+ case Intrinsic::vp_ashr:
+ case Intrinsic::vp_lshr:
+ case Intrinsic::vp_mul:
+ case Intrinsic::vp_or:
+ case Intrinsic::vp_sdiv:
+ case Intrinsic::vp_shl:
+ case Intrinsic::vp_srem:
+ case Intrinsic::vp_sub:
+ case Intrinsic::vp_udiv:
+ case Intrinsic::vp_urem:
+ case Intrinsic::vp_xor:
+ // vp float arithmetic ops.
+ case Intrinsic::vp_fadd:
+ case Intrinsic::vp_fsub:
+ case Intrinsic::vp_fmul:
+ case Intrinsic::vp_fdiv:
+ case Intrinsic::vp_frem: {
+ std::optional<unsigned> FOp =
+ VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
+ if (FOp)
+ return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
+ break;
+ }
}
if (ST->hasVInstructions() && RetTy->isVectorTy()) {
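
The new vp_* cases above all reduce to a single rule: if VPIntrinsic::getFunctionalOpcodeForVP finds an unpredicated arithmetic counterpart, the predicated intrinsic is costed exactly like that counterpart. A toy sketch of the shape of that dispatch; the string keys stand in for the real intrinsic IDs and opcodes and are not LLVM enumerators:

  #include <optional>
  #include <string>

  // Map a vector-predicated op name to its unpredicated counterpart. Unknown
  // names fall through, matching the `break` in the switch above.
  static std::optional<std::string> functionalOpcode(const std::string &VPOp) {
    if (VPOp == "vp.add") return std::string("add");
    if (VPOp == "vp.sub") return std::string("sub");
    if (VPOp == "vp.mul") return std::string("mul");
    if (VPOp == "vp.fadd") return std::string("fadd");
    if (VPOp == "vp.fdiv") return std::string("fdiv");
    return std::nullopt;
  }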
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index a6285a2..b161eed 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -172,7 +172,6 @@ uint64_t SystemZMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNum,
uint32_t BitOffset = MIBitSize - RawBitOffset - OpBitSize;
Fixups.push_back(MCFixup::create(BitOffset >> 3, MO.getExpr(),
(MCFixupKind)Kind, MI.getLoc()));
- assert(Fixups.size() <= 2 && "More than two memory operands in MI?");
return 0;
}
llvm_unreachable("Unexpected operand type!");
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index dbea42d..e49e96c 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3849,6 +3849,14 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
return Warning(Ops[0]->getStartLoc(), "mask, index, and destination "
"registers should be distinct");
}
+ } else if (isTCMMIMFP16PS(Opcode) || isTCMMRLFP16PS(Opcode) ||
+ isTDPBF16PS(Opcode) || isTDPFP16PS(Opcode) || isTDPBSSD(Opcode) ||
+ isTDPBSUD(Opcode) || isTDPBUSD(Opcode) || isTDPBUUD(Opcode)) {
+ unsigned SrcDest = Inst.getOperand(0).getReg();
+ unsigned Src1 = Inst.getOperand(2).getReg();
+ unsigned Src2 = Inst.getOperand(3).getReg();
+ if (SrcDest == Src1 || SrcDest == Src2 || Src1 == Src2)
+ return Error(Ops[0]->getStartLoc(), "all tmm registers must be distinct");
}
// Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index 394947b..ab8b3dc 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -128,7 +128,7 @@ FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
char X86FlagsCopyLoweringPass::ID = 0;
void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addUsedIfAvailable<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -258,13 +258,32 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TII = Subtarget->getInstrInfo();
TRI = Subtarget->getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
PromoteRC = &X86::GR8RegClass;
if (MF.empty())
// Nothing to do for a degenerate empty function...
return false;
+ if (none_of(MRI->def_instructions(X86::EFLAGS), [](const MachineInstr &MI) {
+ return MI.getOpcode() == TargetOpcode::COPY;
+ }))
+ return false;
+
+ // We change the code, so we don't preserve the dominator tree anyway. If we
+ // got a valid MDT from the pass manager, use that, otherwise construct one
+ // now. This is an optimization that avoids unnecessary MDT construction for
+ // functions that have no flag copies.
+
+ auto MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+ std::unique_ptr<MachineDominatorTree> OwnedMDT;
+ if (MDTWrapper) {
+ MDT = &MDTWrapper->getDomTree();
+ } else {
+ OwnedMDT = std::make_unique<MachineDominatorTree>();
+ OwnedMDT->getBase().recalculate(MF);
+ MDT = OwnedMDT.get();
+ }
+
// Collect the copies in RPO so that when there are chains where a copy is in
// turn copied again we visit the first one first. This ensures we can find
// viable locations for testing the original EFLAGS that dominate all the
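
Two things change above: the pass bails out early when EFLAGS is never defined by a COPY, and the dominator tree becomes an opportunistic dependency that is rebuilt locally only when actually needed. The second part is a reusable pattern; a hedged generic sketch, assuming only that the tree type exposes a recalculate(Fn) method:

  #include <memory>

  // Use the analysis result if the pass manager already has one, otherwise
  // build and own a fresh tree. DomTreeT stands in for MachineDominatorTree.
  template <typename DomTreeT, typename FunctionT>
  DomTreeT *getOrBuildDomTree(DomTreeT *FromPassManager,
                              std::unique_ptr<DomTreeT> &Owned, FunctionT &Fn) {
    if (FromPassManager)
      return FromPassManager;            // reuse the cached analysis
    Owned = std::make_unique<DomTreeT>(); // build (and own) one on demand
    Owned->recalculate(Fn);
    return Owned.get();
  }

This keeps functions without flag copies from paying for a dominator tree they never use, while still preserving nothing the pass would have invalidated anyway.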
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index f2b3855..c91bd57 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2745,7 +2745,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
Src = Src.getOperand(0);
}
- if (Src.getOpcode() == ISD::SHL && Src.hasOneUse()) {
+ if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
// Give up if the shift is not a valid scale factor [1,2,3].
SDValue ShlSrc = Src.getOperand(0);
SDValue ShlAmt = Src.getOperand(1);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index de26ce2..e03edf9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56041,18 +56041,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
};
auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
bool AllConstants = true;
- bool AllSubVectors = true;
+ bool AllSubs = true;
+ unsigned VecSize = VT.getSizeInBits();
for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
- SDValue Sub = SubOps[I].getOperand(Op);
- unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
- SDValue BC = peekThroughBitcasts(Sub);
+ SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
+ unsigned SubSize = BC.getValueSizeInBits();
+ unsigned EltSize = BC.getScalarValueSizeInBits();
AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
- AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- Sub.getOperand(0).getValueType() == VT &&
- Sub.getConstantOperandAPInt(1) == (I * NumSubElts);
+ AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ BC.getOperand(0).getValueSizeInBits() == VecSize &&
+ (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
}
- return AllConstants || AllSubVectors;
+ return AllConstants || AllSubs;
};
switch (Op0.getOpcode()) {
diff --git a/llvm/lib/Target/X86/X86IndirectThunks.cpp b/llvm/lib/Target/X86/X86IndirectThunks.cpp
index ecc5260..4f4a8d8 100644
--- a/llvm/lib/Target/X86/X86IndirectThunks.cpp
+++ b/llvm/lib/Target/X86/X86IndirectThunks.cpp
@@ -61,26 +61,26 @@ static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11";
namespace {
struct RetpolineThunkInserter : ThunkInserter<RetpolineThunkInserter> {
const char *getThunkPrefix() { return RetpolineNamePrefix; }
- bool mayUseThunk(const MachineFunction &MF, bool InsertedThunks) {
- if (InsertedThunks)
- return false;
+ bool mayUseThunk(const MachineFunction &MF) {
const auto &STI = MF.getSubtarget<X86Subtarget>();
return (STI.useRetpolineIndirectCalls() ||
STI.useRetpolineIndirectBranches()) &&
!STI.useRetpolineExternalThunk();
}
- bool insertThunks(MachineModuleInfo &MMI, MachineFunction &MF);
+ bool insertThunks(MachineModuleInfo &MMI, MachineFunction &MF,
+ bool ExistingThunks);
void populateThunk(MachineFunction &MF);
};
struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> {
const char *getThunkPrefix() { return LVIThunkNamePrefix; }
- bool mayUseThunk(const MachineFunction &MF, bool InsertedThunks) {
- if (InsertedThunks)
- return false;
+ bool mayUseThunk(const MachineFunction &MF) {
return MF.getSubtarget<X86Subtarget>().useLVIControlFlowIntegrity();
}
- bool insertThunks(MachineModuleInfo &MMI, MachineFunction &MF) {
+ bool insertThunks(MachineModuleInfo &MMI, MachineFunction &MF,
+ bool ExistingThunks) {
+ if (ExistingThunks)
+ return false;
createThunkFunction(MMI, R11LVIThunkName);
return true;
}
@@ -104,36 +104,23 @@ struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> {
}
};
-class X86IndirectThunks : public MachineFunctionPass {
+class X86IndirectThunks
+ : public ThunkInserterPass<RetpolineThunkInserter, LVIThunkInserter> {
public:
static char ID;
- X86IndirectThunks() : MachineFunctionPass(ID) {}
+ X86IndirectThunks() : ThunkInserterPass(ID) {}
StringRef getPassName() const override { return "X86 Indirect Thunks"; }
-
- bool doInitialization(Module &M) override;
- bool runOnMachineFunction(MachineFunction &MF) override;
-
-private:
- std::tuple<RetpolineThunkInserter, LVIThunkInserter> TIs;
-
- template <typename... ThunkInserterT>
- static void initTIs(Module &M,
- std::tuple<ThunkInserterT...> &ThunkInserters) {
- (..., std::get<ThunkInserterT>(ThunkInserters).init(M));
- }
- template <typename... ThunkInserterT>
- static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
- std::tuple<ThunkInserterT...> &ThunkInserters) {
- return (0 | ... | std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF));
- }
};
} // end anonymous namespace
bool RetpolineThunkInserter::insertThunks(MachineModuleInfo &MMI,
- MachineFunction &MF) {
+ MachineFunction &MF,
+ bool ExistingThunks) {
+ if (ExistingThunks)
+ return false;
if (MMI.getTarget().getTargetTriple().getArch() == Triple::x86_64)
createThunkFunction(MMI, R11RetpolineName);
else
@@ -259,14 +246,3 @@ FunctionPass *llvm::createX86IndirectThunksPass() {
}
char X86IndirectThunks::ID = 0;
-
-bool X86IndirectThunks::doInitialization(Module &M) {
- initTIs(M, TIs);
- return false;
-}
-
-bool X86IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << getPassName() << '\n');
- auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
- return runTIs(MMI, MF, TIs);
-}
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 2bfae0e..20f5dba 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -109,14 +109,8 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
if (!Op1)
return nullptr;
- LazyValueInfo::Tristate Result = LVI->getPredicateAt(
- C->getPredicate(), Op0, Op1, At, /*UseBlockValue=*/false);
- if (Result == LazyValueInfo::Unknown)
- return nullptr;
-
- return (Result == LazyValueInfo::True)
- ? ConstantInt::getTrue(C->getContext())
- : ConstantInt::getFalse(C->getContext());
+ return LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At,
+ /*UseBlockValue=*/false);
}
static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
@@ -243,15 +237,17 @@ static Value *getValueOnEdge(LazyValueInfo *LVI, Value *Incoming,
// The "false" case
if (auto *C = dyn_cast<Constant>(SI->getFalseValue()))
- if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI) ==
- LazyValueInfo::False)
+ if (auto *Res = dyn_cast_or_null<ConstantInt>(
+ LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI));
+ Res && Res->isZero())
return SI->getTrueValue();
// The "true" case,
// similar to the select "false" case, but try the select "true" value
if (auto *C = dyn_cast<Constant>(SI->getTrueValue()))
- if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI) ==
- LazyValueInfo::False)
+ if (auto *Res = dyn_cast_or_null<ConstantInt>(
+ LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI));
+ Res && Res->isZero())
return SI->getFalseValue();
return nullptr;
@@ -320,16 +316,13 @@ static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) {
static bool constantFoldCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
Value *Op0 = Cmp->getOperand(0);
Value *Op1 = Cmp->getOperand(1);
- LazyValueInfo::Tristate Result =
- LVI->getPredicateAt(Cmp->getPredicate(), Op0, Op1, Cmp,
- /*UseBlockValue=*/true);
- if (Result == LazyValueInfo::Unknown)
+ Constant *Res = LVI->getPredicateAt(Cmp->getPredicate(), Op0, Op1, Cmp,
+ /*UseBlockValue=*/true);
+ if (!Res)
return false;
++NumCmps;
- Constant *TorF =
- ConstantInt::get(CmpInst::makeCmpResultType(Op0->getType()), Result);
- Cmp->replaceAllUsesWith(TorF);
+ Cmp->replaceAllUsesWith(Res);
Cmp->eraseFromParent();
return true;
}
@@ -371,11 +364,11 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
ConstantInt *Case = CI->getCaseValue();
- LazyValueInfo::Tristate State =
+ auto *Res = dyn_cast_or_null<ConstantInt>(
LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I,
- /* UseBlockValue */ true);
+ /* UseBlockValue */ true));
- if (State == LazyValueInfo::False) {
+ if (Res && Res->isZero()) {
// This case never fires - remove it.
BasicBlock *Succ = CI->getCaseSuccessor();
Succ->removePredecessor(BB);
@@ -392,7 +385,7 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
DTU.applyUpdatesPermissive({{DominatorTree::Delete, BB, Succ}});
continue;
}
- if (State == LazyValueInfo::True) {
+ if (Res && Res->isOne()) {
// This case always fires. Arrange for the switch to be turned into an
// unconditional branch by replacing the switch condition with the case
// value.
@@ -716,11 +709,12 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
// relatively expensive analysis for constants which are obviously either
// null or non-null to start with.
if (Type && !CB.paramHasAttr(ArgNo, Attribute::NonNull) &&
- !isa<Constant>(V) &&
- LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
- ConstantPointerNull::get(Type), &CB,
- /*UseBlockValue=*/false) == LazyValueInfo::False)
- ArgNos.push_back(ArgNo);
+ !isa<Constant>(V))
+ if (auto *Res = dyn_cast_or_null<ConstantInt>(LVI->getPredicateAt(
+ ICmpInst::ICMP_EQ, V, ConstantPointerNull::get(Type), &CB,
+ /*UseBlockValue=*/false));
+ Res && Res->isZero())
+ ArgNos.push_back(ArgNo);
ArgNo++;
}
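
Throughout this file (and JumpThreading below) the LVI predicate queries now hand back a Constant * rather than a Tristate: nullptr still means "unknown", and a known result arrives as a ConstantInt 0/1 (or a constant vector for vector compares). A small sketch of the scalar decoding the call sites perform, with an int pointer standing in for the returned constant:

  #include <optional>

  // nullptr  -> predicate value unknown, the caller must bail out
  // *P == 0  -> predicate is known false
  // *P == 1  -> predicate is known true
  static std::optional<bool> decodePredicate(const int *P) {
    if (!P)
      return std::nullopt;
    return *P != 0;
  }

That is exactly the role the dyn_cast_or_null<ConstantInt> plus isZero()/isOne() checks play in the rewritten call sites.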
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 36c7a3f..7a0b661 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -596,11 +596,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
CmpInst::Predicate Pred;
Value *Val;
Constant *Cst;
- if (!PredCst && match(V, m_Cmp(Pred, m_Value(Val), m_Constant(Cst)))) {
- auto Res = LVI->getPredicateOnEdge(Pred, Val, Cst, P, BB, CxtI);
- if (Res != LazyValueInfo::Unknown)
- PredCst = ConstantInt::getBool(V->getContext(), Res);
- }
+ if (!PredCst && match(V, m_Cmp(Pred, m_Value(Val), m_Constant(Cst))))
+ PredCst = LVI->getPredicateOnEdge(Pred, Val, Cst, P, BB, CxtI);
if (Constant *KC = getKnownConstant(PredCst, Preference))
Result.emplace_back(KC, P);
}
@@ -780,13 +777,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
if (LHSInst && LHSInst->getParent() == BB)
continue;
- LazyValueInfo::Tristate
- ResT = LVI->getPredicateOnEdge(Pred, LHS,
- cast<Constant>(RHS), PredBB, BB,
- CxtI ? CxtI : Cmp);
- if (ResT == LazyValueInfo::Unknown)
- continue;
- Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
+ Res = LVI->getPredicateOnEdge(Pred, LHS, cast<Constant>(RHS), PredBB,
+ BB, CxtI ? CxtI : Cmp);
}
if (Constant *KC = getKnownConstant(Res, WantInteger))
@@ -806,14 +798,10 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
- LazyValueInfo::Tristate Res =
- LVI->getPredicateOnEdge(Pred, CmpLHS,
- CmpConst, P, BB, CxtI ? CxtI : Cmp);
- if (Res == LazyValueInfo::Unknown)
- continue;
-
- Constant *ResC = ConstantInt::get(CmpType, Res);
- Result.emplace_back(ResC, P);
+ Constant *Res = LVI->getPredicateOnEdge(Pred, CmpLHS, CmpConst, P, BB,
+ CxtI ? CxtI : Cmp);
+ if (Constant *KC = getKnownConstant(Res, WantInteger))
+ Result.emplace_back(KC, P);
}
return !Result.empty();
@@ -1082,11 +1070,11 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) {
// it's value at the branch instruction. We only handle comparisons
// against a constant at this time.
if (Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1))) {
- LazyValueInfo::Tristate Ret =
+ Constant *Res =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, BB->getTerminator(),
/*UseBlockValue=*/false);
- if (Ret != LazyValueInfo::Unknown) {
+ if (Res) {
// We can safely replace *some* uses of the CondInst if it has
// exactly one value as returned by LVI. RAUW is incorrect in the
// presence of guards and assumes, that have the `Cond` as the use. This
@@ -1094,10 +1082,7 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) {
// at the end of block, but RAUW unconditionally replaces all uses
// including the guards/assumes themselves and the uses before the
// guard/assume.
- auto *CI = Ret == LazyValueInfo::True ?
- ConstantInt::getTrue(CondCmp->getType()) :
- ConstantInt::getFalse(CondCmp->getType());
- if (replaceFoldableUses(CondCmp, CI, BB))
+ if (replaceFoldableUses(CondCmp, Res, BB))
return true;
}
@@ -2891,15 +2876,13 @@ bool JumpThreadingPass::tryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
// Now check if one of the select values would allow us to constant fold the
// terminator in BB. We don't do the transform if both sides fold, those
// cases will be threaded in any case.
- LazyValueInfo::Tristate LHSFolds =
+ Constant *LHSRes =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
CondRHS, Pred, BB, CondCmp);
- LazyValueInfo::Tristate RHSFolds =
+ Constant *RHSRes =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
CondRHS, Pred, BB, CondCmp);
- if ((LHSFolds != LazyValueInfo::Unknown ||
- RHSFolds != LazyValueInfo::Unknown) &&
- LHSFolds != RHSFolds) {
+ if ((LHSRes || RHSRes) && LHSRes != RHSRes) {
unfoldSelectInstr(Pred, BB, SI, CondLHS, I);
return true;
}
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 715f63b..d5e91d3 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -783,8 +783,10 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
// Replace the inner loop backedge with an unconditional branch to the exit.
BasicBlock *InnerExitBlock = FI.InnerLoop->getExitBlock();
BasicBlock *InnerExitingBlock = FI.InnerLoop->getExitingBlock();
- InnerExitingBlock->getTerminator()->eraseFromParent();
- BranchInst::Create(InnerExitBlock, InnerExitingBlock);
+ Instruction *Term = InnerExitingBlock->getTerminator();
+ Instruction *BI = BranchInst::Create(InnerExitBlock, InnerExitingBlock);
+ BI->setDebugLoc(Term->getDebugLoc());
+ Term->eraseFromParent();
// Update the DomTree and MemorySSA.
DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index 939c361..bd7895f 100644
--- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -85,8 +85,11 @@ static bool replaceConditionalBranchesOnConstant(Instruction *II,
if (Target && Target != Other) {
BasicBlock *Source = BI->getParent();
Other->removePredecessor(Source);
+
+ Instruction *NewBI = BranchInst::Create(Target, Source);
+ NewBI->setDebugLoc(BI->getDebugLoc());
BI->eraseFromParent();
- BranchInst::Create(Target, Source);
+
if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, Source, Other}});
if (pred_empty(Other))
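
The LoopFlatten and LowerConstantIntrinsics hunks above apply the same debug-info fix: build the replacement unconditional branch while the old terminator still exists, copy its DebugLoc, and only then erase it; creating the branch after the erase, as the old code did, left it without a source location. A minimal sketch of that ordering (the helper name is illustrative, not from the patch):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Replace BB's terminator with an unconditional branch to Dest without
// dropping the terminator's debug location.
static void replaceTerminatorWithBranch(BasicBlock *BB, BasicBlock *Dest) {
  Instruction *OldTerm = BB->getTerminator();
  // Create the branch first so the old terminator's location is still
  // available to copy onto it.
  Instruction *NewBr = BranchInst::Create(Dest, BB);
  NewBr->setDebugLoc(OldTerm->getDebugLoc());
  OldTerm->eraseFromParent();
}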
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 6e021a5..cd5ab55 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -525,7 +525,14 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
SmallSet<DbgAssignIntrinsic *, 8> *DbgAssignsToDelete,
SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) {
StoreInst *OnlyStore = Info.OnlyStore;
- bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
+ Value *ReplVal = OnlyStore->getOperand(0);
+ // Loads may either load the stored value or uninitialized memory (undef).
+ // If the stored value may be poison, then replacing an uninitialized memory
+ // load with it would be incorrect.
+ if (!isGuaranteedNotToBePoison(ReplVal))
+ return false;
+
+ bool StoringGlobalVal = !isa<Instruction>(ReplVal);
BasicBlock *StoreBB = OnlyStore->getParent();
int StoreIndex = -1;
@@ -565,7 +572,6 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
}
// Otherwise, we *can* safely rewrite this load.
- Value *ReplVal = OnlyStore->getOperand(0);
// If the replacement value is the load, this must occur in unreachable
// code.
if (ReplVal == LI)
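
The early return added above encodes the refinement rule spelled out in the new comment: a load that executes before the single store reads uninitialized memory, i.e. undef, and undef must not be replaced by a value that might be poison. A minimal sketch of the guard (illustrative helper, not code from the patch):

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Only forward the single store's value to the alloca's loads when it is
// known not to be poison; otherwise mem2reg falls back to its general
// promotion path.
static bool canForwardSingleStore(StoreInst *OnlyStore) {
  Value *ReplVal = OnlyStore->getValueOperand();
  // Conservative ValueTracking query: constants, freeze results and other
  // provably non-poison values pass; anything uncertain does not.
  return isGuaranteedNotToBePoison(ReplVal);
}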
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f9c0c66..2c69566 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6813,6 +6813,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// First-order recurrences are replaced by vector shuffles inside the loop.
if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
+ // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
+ // penultimate value of the recurrence.
+ // TODO: Consider vscale_range info.
+ if (VF.isScalable() && VF.getKnownMinValue() == 1)
+ return InstructionCost::getInvalid();
SmallVector<int> Mask(VF.getKnownMinValue());
std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
@@ -9006,6 +9011,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}
}
Builder.setInsertPoint(&*LatchVPBB->begin());
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
+ VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
@@ -9114,8 +9122,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// also modeled in VPlan.
auto *FinalReductionResult = new VPInstruction(
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
- cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
- ->appendRecipe(FinalReductionResult);
+ FinalReductionResult->insertBefore(*MiddleVPBB, IP);
OrigExitingVPV->replaceUsesWithIf(
FinalReductionResult,
[](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
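
The first LoopVectorize hunk above (the cost-model guard) rejects VF = <vscale x 1 x ...> for fixed-order recurrences by reporting an invalid cost: when vscale is 1 such a vector has a single lane, so there is no penultimate lane left to extract for the recurrence, and the splice mask built right below the guard degenerates to selecting only the last lane of the previous vector. A standalone illustration of that mask construction (it mirrors the std::iota call in the hunk; not code from the patch):

#include <cstdio>
#include <numeric>
#include <vector>

// Splice mask of KnownMinValue elements starting at KnownMinValue - 1, as in
// the fixed-order recurrence shuffle cost above.
static std::vector<int> buildSpliceMask(unsigned KnownMinValue) {
  std::vector<int> Mask(KnownMinValue);
  std::iota(Mask.begin(), Mask.end(), KnownMinValue - 1);
  return Mask;
}

int main() {
  for (unsigned KMV : {4u, 2u, 1u}) {
    std::printf("KMV=%u:", KMV);
    for (int Lane : buildSpliceMask(KMV))
      std::printf(" %d", Lane); // 4 -> 3 4 5 6; 2 -> 1 2; 1 -> 0 (no penultimate lane)
    std::printf("\n");
  }
  return 0;
}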
diff --git a/llvm/test/Analysis/BasicAA/ptr-vector.ll b/llvm/test/Analysis/BasicAA/ptr-vector.ll
new file mode 100644
index 0000000..7dea24f
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/ptr-vector.ll
@@ -0,0 +1,12 @@
+; RUN: opt -print-all-alias-modref-info -passes=aa-eval -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: MayAlias: i8* %b, i8* %p
+; CHECK: Just Ref: Ptr: i8* %p <-> %v1p = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %a, i32 8, <1 x i1> %c, <1 x ptr> poison)
+; CHECK: Just Ref: Ptr: i8* %b <-> %v1p = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %a, i32 8, <1 x i1> %c, <1 x ptr> poison)
+define void @test(ptr %a, ptr %b, <1 x i1> %c) {
+ %v1p = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %a, i32 8, <1 x i1> %c, <1 x ptr> poison)
+ %p = bitcast <1 x ptr> %v1p to ptr
+ load i8, ptr %p
+ store i8 0, ptr %b
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 1ff280d..1993023 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -909,6 +909,123 @@ define void @masked_scatter_v1i128(<1 x i128> %data, <1 x ptr> %ptrs, <1 x i1> %
ret void
}
+define void @histogram_nxv2i64(<vscale x 2 x ptr> %buckets, <vscale x 2 x i1> %mask) #3 {
+; CHECK-LABEL: 'histogram_nxv2i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 1, <vscale x 2 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'histogram_nxv2i64'
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 1, <vscale x 2 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 1, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @histogram_nxv4i32(<vscale x 4 x ptr> %buckets, <vscale x 4 x i1> %mask) #3 {
+; CHECK-LABEL: 'histogram_nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'histogram_nxv4i32'
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_nxv8i16(<vscale x 8 x ptr> %buckets, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: 'histogram_nxv8i16'
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'histogram_nxv8i16'
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @histogram_nxv16i8(<vscale x 16 x ptr> %buckets, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: 'histogram_nxv16i8'
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'histogram_nxv16i8'
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
+ ret void
+}
+
+define void @histogram_v2i64(<2 x ptr> %buckets, <2 x i1> %mask) {
+; CHECK-LABEL: 'histogram_v2i64'
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.v2p0.i64(<2 x ptr> %buckets, i64 1, <2 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'histogram_v2i64'
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.v2p0.i64(<2 x ptr> %buckets, i64 1, <2 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.experimental.vector.histogram.add.v2p0.i64(<2 x ptr> %buckets, i64 1, <2 x i1> %mask)
+ ret void
+}
+
+define void @histogram_v4i32(<4 x ptr> %buckets, <4 x i1> %mask) {
+; CHECK-LABEL: 'histogram_v4i32'
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.v4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'histogram_v4i32'
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.v4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.experimental.vector.histogram.add.v4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_v8i16(<8 x ptr> %buckets, <8 x i1> %mask) {
+; CHECK-LABEL: 'histogram_v8i16'
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.v8p0.i16(<8 x ptr> %buckets, i16 1, <8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'histogram_v8i16'
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.v8p0.i16(<8 x ptr> %buckets, i16 1, <8 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.experimental.vector.histogram.add.v8p0.i16(<8 x ptr> %buckets, i16 1, <8 x i1> %mask)
+ ret void
+}
+
+define void @histogram_v16i8(<16 x ptr> %buckets, <16 x i1> %mask) {
+; CHECK-LABEL: 'histogram_v16i8'
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.v16p0.i8(<16 x ptr> %buckets, i8 1, <16 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'histogram_v16i8'
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.v16p0.i8(<16 x ptr> %buckets, i8 1, <16 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.experimental.vector.histogram.add.v16p0.i8(<16 x ptr> %buckets, i8 1, <16 x i1> %mask)
+ ret void
+}
+
+define void @histogram_nxv4i64(<vscale x 4 x ptr> %buckets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: 'histogram_nxv4i64'
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'histogram_nxv4i64'
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
+ ret void
+}
+
declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64)
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
@@ -949,3 +1066,4 @@ declare void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs,
attributes #0 = { "target-features"="+sve,+bf16" }
attributes #1 = { "target-features"="+sve" vscale_range(1,16) }
attributes #2 = { "target-features"="+sve" vscale_range(2, 16) }
+attributes #3 = { "target-features"="+sve,+sve2" vscale_range(1,16) }
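
The new histogram tests above check that the SVE2 histogram intrinsic gets a finite cost only for the nxv2i64 and nxv4i32 forms compiled under attribute #3 (+sve2), and an invalid cost otherwise. A sketch of how the same number could be queried programmatically through the cost-model API (illustrative; the types, cost kind and helper name are assumptions, not part of the patch):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Cost of the <vscale x 2 x i64> histogram update checked by the first test.
static InstructionCost histogramAddCost(const TargetTransformInfo &TTI,
                                        LLVMContext &Ctx) {
  auto *PtrVecTy = ScalableVectorType::get(PointerType::getUnqual(Ctx), 2);
  auto *MaskTy = ScalableVectorType::get(Type::getInt1Ty(Ctx), 2);
  Type *ArgTys[] = {PtrVecTy, Type::getInt64Ty(Ctx), MaskTy};
  IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                              Type::getVoidTy(Ctx), ArgTys);
  return TTI.getIntrinsicInstrCost(ICA,
                                   TargetTransformInfo::TCK_RecipThroughput);
}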
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index a23ea00..87ffb23 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v --type-based-intrinsic-cost=true | FileCheck %s --check-prefixes=TYPEBASED
define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
; CHECK-LABEL: 'unsupported_fp_ops'
@@ -7,6 +8,11 @@ define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'unsupported_fp_ops'
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
%powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
@@ -18,6 +24,10 @@ define void @powi(<vscale x 4 x float> %vec) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'powi'
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
ret void
}
@@ -27,6 +37,10 @@ define void @fshr(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i3
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'fshr'
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %1 = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
call <vscale x 1 x i32> @llvm.fshr.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
ret void
}
@@ -36,6 +50,10 @@ define void @fshl(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i3
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <vscale x 1 x i32> @llvm.fshl.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'fshl'
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %1 = call <vscale x 1 x i32> @llvm.fshl.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
call <vscale x 1 x i32> @llvm.fshl.nxv4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
ret void
}
@@ -82,6 +100,47 @@ define void @vp_fshr() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %38 = call <vscale x 8 x i64> @llvm.vp.fshr.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'vp_fshr'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <2 x i8> @llvm.vp.fshr.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %2 = call <4 x i8> @llvm.vp.fshr.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %3 = call <8 x i8> @llvm.vp.fshr.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %4 = call <16 x i8> @llvm.vp.fshr.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %5 = call <vscale x 1 x i8> @llvm.vp.fshr.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> undef, <vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %6 = call <vscale x 2 x i8> @llvm.vp.fshr.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %7 = call <vscale x 4 x i8> @llvm.vp.fshr.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %8 = call <vscale x 8 x i8> @llvm.vp.fshr.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %9 = call <vscale x 16 x i8> @llvm.vp.fshr.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %10 = call <vscale x 32 x i8> @llvm.vp.fshr.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i8> undef, <vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %11 = call <vscale x 64 x i8> @llvm.vp.fshr.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i8> undef, <vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %12 = call <2 x i16> @llvm.vp.fshr.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %13 = call <4 x i16> @llvm.vp.fshr.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %14 = call <8 x i16> @llvm.vp.fshr.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %15 = call <16 x i16> @llvm.vp.fshr.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %16 = call <vscale x 1 x i16> @llvm.vp.fshr.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i16> undef, <vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %17 = call <vscale x 2 x i16> @llvm.vp.fshr.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %18 = call <vscale x 4 x i16> @llvm.vp.fshr.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %19 = call <vscale x 8 x i16> @llvm.vp.fshr.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %20 = call <vscale x 16 x i16> @llvm.vp.fshr.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %21 = call <vscale x 32 x i16> @llvm.vp.fshr.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i16> undef, <vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %22 = call <2 x i32> @llvm.vp.fshr.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %23 = call <4 x i32> @llvm.vp.fshr.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %24 = call <8 x i32> @llvm.vp.fshr.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %25 = call <16 x i32> @llvm.vp.fshr.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %26 = call <vscale x 1 x i32> @llvm.vp.fshr.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, <vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %27 = call <vscale x 2 x i32> @llvm.vp.fshr.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %28 = call <vscale x 4 x i32> @llvm.vp.fshr.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %29 = call <vscale x 8 x i32> @llvm.vp.fshr.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %30 = call <vscale x 16 x i32> @llvm.vp.fshr.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %31 = call <2 x i64> @llvm.vp.fshr.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %32 = call <4 x i64> @llvm.vp.fshr.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %33 = call <8 x i64> @llvm.vp.fshr.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %34 = call <16 x i64> @llvm.vp.fshr.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %35 = call <vscale x 1 x i64> @llvm.vp.fshr.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> undef, <vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %36 = call <vscale x 2 x i64> @llvm.vp.fshr.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %37 = call <vscale x 4 x i64> @llvm.vp.fshr.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %38 = call <vscale x 8 x i64> @llvm.vp.fshr.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
call <2 x i8> @llvm.vp.fshr.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
call <4 x i8> @llvm.vp.fshr.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
call <8 x i8> @llvm.vp.fshr.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
@@ -165,6 +224,47 @@ define void @vp_fshl() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %38 = call <vscale x 8 x i64> @llvm.vp.fshl.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'vp_fshl'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <2 x i8> @llvm.vp.fshl.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %2 = call <4 x i8> @llvm.vp.fshl.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %3 = call <8 x i8> @llvm.vp.fshl.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %4 = call <16 x i8> @llvm.vp.fshl.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %5 = call <vscale x 1 x i8> @llvm.vp.fshl.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> undef, <vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %6 = call <vscale x 2 x i8> @llvm.vp.fshl.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %7 = call <vscale x 4 x i8> @llvm.vp.fshl.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %8 = call <vscale x 8 x i8> @llvm.vp.fshl.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %9 = call <vscale x 16 x i8> @llvm.vp.fshl.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %10 = call <vscale x 32 x i8> @llvm.vp.fshl.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i8> undef, <vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %11 = call <vscale x 64 x i8> @llvm.vp.fshl.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i8> undef, <vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %12 = call <2 x i16> @llvm.vp.fshl.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %13 = call <4 x i16> @llvm.vp.fshl.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %14 = call <8 x i16> @llvm.vp.fshl.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %15 = call <16 x i16> @llvm.vp.fshl.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %16 = call <vscale x 1 x i16> @llvm.vp.fshl.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i16> undef, <vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %17 = call <vscale x 2 x i16> @llvm.vp.fshl.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %18 = call <vscale x 4 x i16> @llvm.vp.fshl.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %19 = call <vscale x 8 x i16> @llvm.vp.fshl.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %20 = call <vscale x 16 x i16> @llvm.vp.fshl.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %21 = call <vscale x 32 x i16> @llvm.vp.fshl.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i16> undef, <vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %22 = call <2 x i32> @llvm.vp.fshl.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %23 = call <4 x i32> @llvm.vp.fshl.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %24 = call <8 x i32> @llvm.vp.fshl.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %25 = call <16 x i32> @llvm.vp.fshl.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %26 = call <vscale x 1 x i32> @llvm.vp.fshl.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, <vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %27 = call <vscale x 2 x i32> @llvm.vp.fshl.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %28 = call <vscale x 4 x i32> @llvm.vp.fshl.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %29 = call <vscale x 8 x i32> @llvm.vp.fshl.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %30 = call <vscale x 16 x i32> @llvm.vp.fshl.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %31 = call <2 x i64> @llvm.vp.fshl.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %32 = call <4 x i64> @llvm.vp.fshl.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %33 = call <8 x i64> @llvm.vp.fshl.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %34 = call <16 x i64> @llvm.vp.fshl.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %35 = call <vscale x 1 x i64> @llvm.vp.fshl.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> undef, <vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %36 = call <vscale x 2 x i64> @llvm.vp.fshl.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %37 = call <vscale x 4 x i64> @llvm.vp.fshl.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %38 = call <vscale x 8 x i64> @llvm.vp.fshl.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
call <2 x i8> @llvm.vp.fshl.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
call <4 x i8> @llvm.vp.fshl.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
call <8 x i8> @llvm.vp.fshl.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
@@ -216,32 +316,131 @@ define void @add() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = add <8 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = add <16 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = add <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t12 = add <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t14 = add <8 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t16 = add <16 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = add <vscale x 2 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t20 = add <vscale x 4 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t22 = add <vscale x 8 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t24 = add <vscale x 16 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = add <vscale x 2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = add <vscale x 4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = add <vscale x 8 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = add <vscale x 16 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.add.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = add <2 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.add.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t11 = add <4 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.add.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t13 = add <8 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.add.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t15 = add <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.add.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = add <2 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = add <4 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t21 = add <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.add.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t23 = add <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t25 = add <2 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t27 = add <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t29 = add <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t31 = add <16 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t33 = add <vscale x 2 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t35 = add <vscale x 4 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t37 = add <vscale x 8 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t39 = add <vscale x 16 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t41 = add <vscale x 2 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t43 = add <vscale x 4 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.add.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t45 = add <vscale x 8 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.add.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t47 = add <vscale x 16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t49 = add <vscale x 2 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t51 = add <vscale x 4 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.add.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t53 = add <vscale x 8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.add.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t55 = add <vscale x 16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t57 = add <vscale x 2 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t59 = add <vscale x 4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t61 = add <vscale x 8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t63 = add <vscale x 16 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'add'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = add <2 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = add <4 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = add <8 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = add <16 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.add.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = add <2 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.add.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t11 = add <4 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.add.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t13 = add <8 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.add.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t15 = add <16 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.add.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = add <2 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = add <4 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t21 = add <8 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.add.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t23 = add <16 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t25 = add <2 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t27 = add <4 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t29 = add <8 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t31 = add <16 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t33 = add <vscale x 2 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t35 = add <vscale x 4 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t37 = add <vscale x 8 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t39 = add <vscale x 16 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t41 = add <vscale x 2 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t43 = add <vscale x 4 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.add.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t45 = add <vscale x 8 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.add.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t47 = add <vscale x 16 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t49 = add <vscale x 2 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t51 = add <vscale x 4 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.add.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t53 = add <vscale x 8 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.add.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t55 = add <vscale x 16 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t57 = add <vscale x 2 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t59 = add <vscale x 4 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t61 = add <vscale x 8 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t63 = add <vscale x 16 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
%t1 = add <2 x i8> undef, undef
%t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
@@ -250,30 +449,263 @@ define void @add() {
%t5 = add <8 x i8> undef, undef
%t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
%t7 = add <16 x i8> undef, undef
- %t8 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
- %t9 = add <2 x i64> undef, undef
- %t10 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
- %t12 = add <4 x i64> undef, undef
- %t13 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
- %t14 = add <8 x i64> undef, undef
- %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
- %t16 = add <16 x i64> undef, undef
- %t17 = call <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %t18 = add <vscale x 2 x i8> undef, undef
- %t19 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %t20 = add <vscale x 4 x i8> undef, undef
- %t21 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %t22 = add <vscale x 8 x i8> undef, undef
- %t23 = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %t24 = add <vscale x 16 x i8> undef, undef
- %t25 = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
- %t26 = add <vscale x 2 x i64> undef, undef
- %t27 = call <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
- %t28 = add <vscale x 4 x i64> undef, undef
- %t29 = call <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
- %t30 = add <vscale x 8 x i64> undef, undef
- %t31 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
- %t32 = add <vscale x 16 x i64> undef, undef
+ %t8 = call <2 x i16> @llvm.vp.add.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+ %t9 = add <2 x i16> undef, undef
+ %t10 = call <4 x i16> @llvm.vp.add.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+ %t11 = add <4 x i16> undef, undef
+ %t12 = call <8 x i16> @llvm.vp.add.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+ %t13 = add <8 x i16> undef, undef
+ %t14 = call <16 x i16> @llvm.vp.add.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+ %t15 = add <16 x i16> undef, undef
+ %t16 = call <2 x i32> @llvm.vp.add.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+ %t17 = add <2 x i32> undef, undef
+ %t18 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+ %t19 = add <4 x i32> undef, undef
+ %t20 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+ %t21 = add <8 x i32> undef, undef
+ %t22 = call <16 x i32> @llvm.vp.add.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+ %t23 = add <16 x i32> undef, undef
+ %t24 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+ %t25 = add <2 x i64> undef, undef
+ %t26 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+ %t27 = add <4 x i64> undef, undef
+ %t28 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+ %t29 = add <8 x i64> undef, undef
+ %t30 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+ %t31 = add <16 x i64> undef, undef
+ %t32 = call <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t33 = add <vscale x 2 x i8> undef, undef
+ %t34 = call <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t35 = add <vscale x 4 x i8> undef, undef
+ %t36 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t37 = add <vscale x 8 x i8> undef, undef
+ %t38 = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t39 = add <vscale x 16 x i8> undef, undef
+ %t40 = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t41 = add <vscale x 2 x i16> undef, undef
+ %t42 = call <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t43 = add <vscale x 4 x i16> undef, undef
+ %t44 = call <vscale x 8 x i16> @llvm.vp.add.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t45 = add <vscale x 8 x i16> undef, undef
+ %t46 = call <vscale x 16 x i16> @llvm.vp.add.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t47 = add <vscale x 16 x i16> undef, undef
+ %t48 = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t49 = add <vscale x 2 x i32> undef, undef
+ %t50 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t51 = add <vscale x 4 x i32> undef, undef
+ %t52 = call <vscale x 8 x i32> @llvm.vp.add.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t53 = add <vscale x 8 x i32> undef, undef
+ %t54 = call <vscale x 16 x i32> @llvm.vp.add.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t55 = add <vscale x 16 x i32> undef, undef
+ %t56 = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t57 = add <vscale x 2 x i64> undef, undef
+ %t58 = call <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t59 = add <vscale x 4 x i64> undef, undef
+ %t60 = call <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t61 = add <vscale x 8 x i64> undef, undef
+ %t62 = call <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t63 = add <vscale x 16 x i64> undef, undef
+ ret void
+}
+
+define void @and() {
+; CHECK-LABEL: 'and'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.and.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = and <2 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.and.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = and <4 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.and.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = and <8 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.and.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = and <16 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.and.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = and <2 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.and.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t11 = and <4 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.and.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t13 = and <8 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.and.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t15 = and <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.and.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = and <2 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = and <4 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t21 = and <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.and.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t23 = and <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.and.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t25 = and <2 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.and.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t27 = and <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.and.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t29 = and <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.and.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t31 = and <16 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.and.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t33 = and <vscale x 2 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.and.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t35 = and <vscale x 4 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.and.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t37 = and <vscale x 8 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t39 = and <vscale x 16 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.and.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t41 = and <vscale x 2 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.and.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t43 = and <vscale x 4 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.and.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t45 = and <vscale x 8 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.and.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t47 = and <vscale x 16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.and.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t49 = and <vscale x 2 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t51 = and <vscale x 4 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.and.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t53 = and <vscale x 8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.and.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t55 = and <vscale x 16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.and.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t57 = and <vscale x 2 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.and.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t59 = and <vscale x 4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.and.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t61 = and <vscale x 8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.and.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t63 = and <vscale x 16 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPEBASED-LABEL: 'and'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.and.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = and <2 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.and.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = and <4 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.and.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = and <8 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.and.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = and <16 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.and.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = and <2 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.and.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t11 = and <4 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.and.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t13 = and <8 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.and.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t15 = and <16 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.and.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = and <2 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = and <4 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t21 = and <8 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.and.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t23 = and <16 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.and.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t25 = and <2 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.and.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t27 = and <4 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.and.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t29 = and <8 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.and.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t31 = and <16 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.and.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t33 = and <vscale x 2 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.and.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t35 = and <vscale x 4 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.and.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t37 = and <vscale x 8 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t39 = and <vscale x 16 x i8> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.and.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t41 = and <vscale x 2 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.and.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t43 = and <vscale x 4 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.and.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t45 = and <vscale x 8 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.and.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t47 = and <vscale x 16 x i16> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.and.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t49 = and <vscale x 2 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t51 = and <vscale x 4 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.and.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t53 = and <vscale x 8 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.and.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t55 = and <vscale x 16 x i32> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.and.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t57 = and <vscale x 2 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.and.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t59 = and <vscale x 4 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.and.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t61 = and <vscale x 8 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.and.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t63 = and <vscale x 16 x i64> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %t0 = call <2 x i8> @llvm.vp.and.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+ %t1 = and <2 x i8> undef, undef
+ %t2 = call <4 x i8> @llvm.vp.and.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+ %t3 = and <4 x i8> undef, undef
+ %t4 = call <8 x i8> @llvm.vp.and.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+ %t5 = and <8 x i8> undef, undef
+ %t6 = call <16 x i8> @llvm.vp.and.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+ %t7 = and <16 x i8> undef, undef
+ %t8 = call <2 x i16> @llvm.vp.and.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+ %t9 = and <2 x i16> undef, undef
+ %t10 = call <4 x i16> @llvm.vp.and.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+ %t11 = and <4 x i16> undef, undef
+ %t12 = call <8 x i16> @llvm.vp.and.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+ %t13 = and <8 x i16> undef, undef
+ %t14 = call <16 x i16> @llvm.vp.and.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+ %t15 = and <16 x i16> undef, undef
+ %t16 = call <2 x i32> @llvm.vp.and.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+ %t17 = and <2 x i32> undef, undef
+ %t18 = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+ %t19 = and <4 x i32> undef, undef
+ %t20 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+ %t21 = and <8 x i32> undef, undef
+ %t22 = call <16 x i32> @llvm.vp.and.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+ %t23 = and <16 x i32> undef, undef
+ %t24 = call <2 x i64> @llvm.vp.and.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+ %t25 = and <2 x i64> undef, undef
+ %t26 = call <4 x i64> @llvm.vp.and.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+ %t27 = and <4 x i64> undef, undef
+ %t28 = call <8 x i64> @llvm.vp.and.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+ %t29 = and <8 x i64> undef, undef
+ %t30 = call <16 x i64> @llvm.vp.and.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+ %t31 = and <16 x i64> undef, undef
+ %t32 = call <vscale x 2 x i8> @llvm.vp.and.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t33 = and <vscale x 2 x i8> undef, undef
+ %t34 = call <vscale x 4 x i8> @llvm.vp.and.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t35 = and <vscale x 4 x i8> undef, undef
+ %t36 = call <vscale x 8 x i8> @llvm.vp.and.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t37 = and <vscale x 8 x i8> undef, undef
+ %t38 = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t39 = and <vscale x 16 x i8> undef, undef
+ %t40 = call <vscale x 2 x i16> @llvm.vp.and.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t41 = and <vscale x 2 x i16> undef, undef
+ %t42 = call <vscale x 4 x i16> @llvm.vp.and.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t43 = and <vscale x 4 x i16> undef, undef
+ %t44 = call <vscale x 8 x i16> @llvm.vp.and.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t45 = and <vscale x 8 x i16> undef, undef
+ %t46 = call <vscale x 16 x i16> @llvm.vp.and.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t47 = and <vscale x 16 x i16> undef, undef
+ %t48 = call <vscale x 2 x i32> @llvm.vp.and.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t49 = and <vscale x 2 x i32> undef, undef
+ %t50 = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t51 = and <vscale x 4 x i32> undef, undef
+ %t52 = call <vscale x 8 x i32> @llvm.vp.and.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t53 = and <vscale x 8 x i32> undef, undef
+ %t54 = call <vscale x 16 x i32> @llvm.vp.and.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t55 = and <vscale x 16 x i32> undef, undef
+ %t56 = call <vscale x 2 x i64> @llvm.vp.and.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t57 = and <vscale x 2 x i64> undef, undef
+ %t58 = call <vscale x 4 x i64> @llvm.vp.and.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t59 = and <vscale x 4 x i64> undef, undef
+ %t60 = call <vscale x 8 x i64> @llvm.vp.and.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t61 = and <vscale x 8 x i64> undef, undef
+ %t62 = call <vscale x 16 x i64> @llvm.vp.and.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t63 = and <vscale x 16 x i64> undef, undef
ret void
}
@@ -313,6 +745,41 @@ define void @abs() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'abs'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %1 = call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 false, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %3 = call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 false, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %5 = call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 false, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %7 = call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 false, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x i8> @llvm.vp.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 2 x i8> @llvm.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %19 = call <vscale x 4 x i8> @llvm.vp.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 4 x i8> @llvm.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %21 = call <vscale x 8 x i8> @llvm.vp.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <vscale x 8 x i8> @llvm.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %23 = call <vscale x 16 x i8> @llvm.vp.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %25 = call <vscale x 2 x i64> @llvm.vp.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %27 = call <vscale x 4 x i64> @llvm.vp.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x i64> @llvm.vp.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %31 = call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 0, <2 x i1> undef, i32 undef)
call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 0)
call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 0, <4 x i1> undef, i32 undef)
@@ -384,6 +851,41 @@ define void @load() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = load <vscale x 16 x i64>, ptr undef, align 128
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'load'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %t0 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = load <2 x i8>, ptr undef, align 2
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t2 = call <4 x i8> @llvm.vp.load.v4i8.p0(ptr undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = load <4 x i8>, ptr undef, align 4
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t4 = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = load <8 x i8>, ptr undef, align 8
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t6 = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = load <16 x i8>, ptr undef, align 16
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %t8 = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = load <2 x i64>, ptr undef, align 16
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t10 = call <4 x i64> @llvm.vp.load.v4i64.p0(ptr undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t12 = load <4 x i64>, ptr undef, align 32
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t13 = call <8 x i64> @llvm.vp.load.v8i64.p0(ptr undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t14 = load <8 x i64>, ptr undef, align 64
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t15 = call <16 x i64> @llvm.vp.load.v16i64.p0(ptr undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t16 = load <16 x i64>, ptr undef, align 128
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = load <vscale x 2 x i8>, ptr undef, align 2
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t20 = load <vscale x 4 x i8>, ptr undef, align 4
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t22 = load <vscale x 8 x i8>, ptr undef, align 8
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t24 = load <vscale x 16 x i8>, ptr undef, align 16
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = load <vscale x 2 x i64>, ptr undef, align 16
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = load <vscale x 4 x i64>, ptr undef, align 32
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = load <vscale x 8 x i64>, ptr undef, align 64
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = load <vscale x 16 x i64>, ptr undef, align 128
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%t0 = call <2 x i8> @llvm.vp.load.v2i8(ptr undef, <2 x i1> undef, i32 undef)
%t1 = load <2 x i8>, ptr undef
%t2 = call <4 x i8> @llvm.vp.load.v4i8(ptr undef, <4 x i1> undef, i32 undef)
@@ -455,6 +957,41 @@ define void @store() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <vscale x 16 x i64> undef, ptr undef, align 128
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'store'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 2
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.vp.store.v4i8.p0(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, ptr undef, align 4
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.vp.store.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> undef, ptr undef, align 8
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.vp.store.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 16
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 16
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.vp.store.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr undef, align 32
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.vp.store.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> undef, ptr undef, align 64
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.vp.store.v16i64.p0(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <16 x i64> undef, ptr undef, align 128
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv2i8.p0(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i8> undef, ptr undef, align 2
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i8> undef, ptr undef, align 4
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i8> undef, ptr undef, align 8
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 16 x i8> undef, ptr undef, align 16
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 2 x i64> undef, ptr undef, align 16
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 4 x i64> undef, ptr undef, align 32
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <vscale x 8 x i64> undef, ptr undef, align 64
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv16i64.p0(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <vscale x 16 x i64> undef, ptr undef, align 128
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
call void @llvm.vp.store.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef)
store <2 x i8> undef, ptr undef
call void @llvm.vp.store.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef)
@@ -514,6 +1051,29 @@ define void @strided_load() {
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'strided_load'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t6 = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %t8.a = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t10.a = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t13.a = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t15.a = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %t8 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t13 = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i8> @llvm.experimental.vp.strided.load.nxv2i8.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t19 = call <vscale x 4 x i8> @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t21 = call <vscale x 8 x i8> @llvm.experimental.vp.strided.load.nxv8i8.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t23 = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t25 = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t29 = call <vscale x 8 x i64> @llvm.experimental.vp.strided.load.nxv8i64.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
%t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
%t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
@@ -561,6 +1121,29 @@ define void @strided_store() {
; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'strided_store'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.experimental.vp.strided.store.v8i8.p0.i64(<8 x i8> undef, ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.experimental.vp.strided.store.v16i8.p0.i64(<16 x i8> undef, ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i64.p0.i64(<2 x i64> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> undef, ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.experimental.vp.strided.store.v8i64.p0.i64(<8 x i64> undef, ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i64.p0.i64(<2 x i64> undef, ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> undef, ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.experimental.vp.strided.store.v8i64.p0.i64(<8 x i64> undef, ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv2i8.p0.i64(<vscale x 2 x i8> undef, ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv4i8.p0.i64(<vscale x 4 x i8> undef, ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i64(<vscale x 8 x i8> undef, ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i64(<vscale x 16 x i8> undef, ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i64(<vscale x 2 x i64> undef, ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv8i64.p0.i64(<vscale x 8 x i64> undef, ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
call void @llvm.experimental.vp.strided.store.v2i8.i64(<2 x i8> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef)
call void @llvm.experimental.vp.strided.store.v4i8.i64(<4 x i8> undef, ptr undef, i64 undef, <4 x i1> undef, i32 undef)
call void @llvm.experimental.vp.strided.store.v8i8.i64(<8 x i8> undef, ptr undef, i64 undef, <8 x i1> undef, i32 undef)
@@ -622,6 +1205,41 @@ define void @reduce_add() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %32 = call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'reduce_add'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %1 = call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %3 = call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %5 = call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %6 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %7 = call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %9 = call i64 @llvm.vp.reduce.add.v2i64(i64 undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %11 = call i64 @llvm.vp.reduce.add.v4i64(i64 undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %13 = call i64 @llvm.vp.reduce.add.v8i64(i64 undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %14 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %15 = call i64 @llvm.vp.reduce.add.v16i64(i64 undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %17 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %19 = call i8 @llvm.vp.reduce.add.nxv4i8(i8 undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %20 = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %21 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %22 = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %23 = call i8 @llvm.vp.reduce.add.nxv16i8(i8 undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %24 = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %25 = call i64 @llvm.vp.reduce.add.nxv2i64(i64 undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %27 = call i64 @llvm.vp.reduce.add.nxv4i64(i64 undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %28 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %29 = call i64 @llvm.vp.reduce.add.nxv8i64(i64 undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %30 = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %31 = call i64 @llvm.vp.reduce.add.nxv16i64(i64 undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %32 = call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
@@ -693,6 +1311,41 @@ define void @reduce_fadd() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %32 = call double @llvm.vector.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; TYPEBASED-LABEL: 'reduce_fadd'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %1 = call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call float @llvm.vector.reduce.fadd.v2f32(float undef, <2 x float> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %3 = call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %4 = call float @llvm.vector.reduce.fadd.v4f32(float undef, <4 x float> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %5 = call float @llvm.vp.reduce.fadd.v8f32(float undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %6 = call float @llvm.vector.reduce.fadd.v8f32(float undef, <8 x float> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %7 = call float @llvm.vp.reduce.fadd.v16f32(float undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %8 = call float @llvm.vector.reduce.fadd.v16f32(float undef, <16 x float> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %9 = call double @llvm.vp.reduce.fadd.v2f64(double undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call double @llvm.vector.reduce.fadd.v2f64(double undef, <2 x double> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %11 = call double @llvm.vp.reduce.fadd.v4f64(double undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %12 = call double @llvm.vector.reduce.fadd.v4f64(double undef, <4 x double> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %13 = call double @llvm.vp.reduce.fadd.v8f64(double undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %14 = call double @llvm.vector.reduce.fadd.v8f64(double undef, <8 x double> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %15 = call double @llvm.vp.reduce.fadd.v16f64(double undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %16 = call double @llvm.vector.reduce.fadd.v16f64(double undef, <16 x double> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %17 = call float @llvm.vp.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %18 = call float @llvm.vector.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %19 = call float @llvm.vp.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %20 = call float @llvm.vector.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %21 = call float @llvm.vp.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %22 = call float @llvm.vector.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %23 = call float @llvm.vp.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %24 = call float @llvm.vector.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %25 = call double @llvm.vp.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %26 = call double @llvm.vector.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %27 = call double @llvm.vp.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %28 = call double @llvm.vector.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %29 = call double @llvm.vp.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %30 = call double @llvm.vector.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %31 = call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %32 = call double @llvm.vector.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
call float @llvm.vector.reduce.fadd.v2f32(float undef, <2 x float> undef)
call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef)
@@ -728,6 +1381,114 @@ define void @reduce_fadd() {
ret void
}
+define void @vp_fadd(){
+; CHECK-LABEL: 'vp_fadd'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = fadd <2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = fadd <4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t5 = fadd <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t7 = fadd <16 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = fadd <2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t11 = fadd <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = fadd <8 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = fadd <16 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = fadd <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = fadd <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = fadd <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t24 = fadd <vscale x 16 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = fadd <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = fadd <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = fadd <vscale x 8 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPEBASED-LABEL: 'vp_fadd'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = fadd <2 x float> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = fadd <4 x float> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t5 = fadd <8 x float> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t7 = fadd <16 x float> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = fadd <2 x double> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t11 = fadd <4 x double> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = fadd <8 x double> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = fadd <16 x double> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = fadd <vscale x 2 x float> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = fadd <vscale x 4 x float> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = fadd <vscale x 8 x float> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t24 = fadd <vscale x 16 x float> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = fadd <vscale x 2 x double> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = fadd <vscale x 4 x double> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = fadd <vscale x 8 x double> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+ %t1 = fadd <2 x float> undef, undef
+ %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+ %t3 = fadd <4 x float> undef, undef
+ %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+ %t5 = fadd <8 x float> undef, undef
+ %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+ %t7 = fadd <16 x float> undef, undef
+ %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+ %t9 = fadd <2 x double> undef, undef
+ %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+ %t11 = fadd <4 x double> undef, undef
+ %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+ %t13 = fadd <8 x double> undef, undef
+ %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+ %t15 = fadd <16 x double> undef, undef
+ %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t18 = fadd <vscale x 2 x float> undef, undef
+ %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t20 = fadd <vscale x 4 x float> undef, undef
+ %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t22 = fadd <vscale x 8 x float> undef, undef
+ %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t24 = fadd <vscale x 16 x float> undef, undef
+ %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+ %t26 = fadd <vscale x 2 x double> undef, undef
+ %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+ %t28 = fadd <vscale x 4 x double> undef, undef
+ %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %t30 = fadd <vscale x 8 x double> undef, undef
+ %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+ %t32 = fadd <vscale x 16 x double> undef, undef
+
+ ret void
+}
+
+
declare <2 x i8> @llvm.vp.add.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
declare <4 x i8> @llvm.vp.add.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
declare <8 x i8> @llvm.vp.add.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index b1ca889..a76d0b3 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -4,6 +4,226 @@
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+
+; CHECK-GI: warning: Instruction selection used fallback path for f128_fp128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_i128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_double
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_float
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_i32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_half
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2f128_fp128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_fp128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2f128_double
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_double
+
+
+define fp128 @f128_fp128(fp128 %a, fp128 %b, fp128 %d, fp128 %e) {
+; CHECK-LABEL: f128_fp128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: stp q2, q3, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: b.ge .LBB0_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: .LBB0_2: // %entry
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt fp128 %a, %b
+ %s = select i1 %c, fp128 %d, fp128 %e
+ ret fp128 %s
+}
+
+define i128 @f128_i128(fp128 %a, fp128 %b, i128 %d, i128 %e) {
+; CHECK-LABEL: f128_i128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: mov x19, x3
+; CHECK-NEXT: mov x20, x2
+; CHECK-NEXT: mov x21, x1
+; CHECK-NEXT: mov x22, x0
+; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: csel x20, x22, x20, lt
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: csel x1, x21, x19, lt
+; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt fp128 %a, %b
+ %s = select i1 %c, i128 %d, i128 %e
+ ret i128 %s
+}
+
+define double @f128_double(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-LABEL: f128_double:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: fmov d8, d3
+; CHECK-NEXT: fmov d9, d2
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: fcsel d0, d9, d8, lt
+; CHECK-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt fp128 %a, %b
+ %s = select i1 %c, double %d, double %e
+ ret double %s
+}
+
+define float @f128_float(fp128 %a, fp128 %b, float %d, float %e) {
+; CHECK-LABEL: f128_float:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: fmov s8, s3
+; CHECK-NEXT: fmov s9, s2
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: fcsel s0, s9, s8, lt
+; CHECK-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt fp128 %a, %b
+ %s = select i1 %c, float %d, float %e
+ ret float %s
+}
+
+define i32 @f128_i32(fp128 %a, fp128 %b, i32 %d, i32 %e) {
+; CHECK-LABEL: f128_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov w19, w1
+; CHECK-NEXT: mov w20, w0
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: csel w0, w20, w19, lt
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt fp128 %a, %b
+ %s = select i1 %c, i32 %d, i32 %e
+ ret i32 %s
+}
+
+define half @f128_half(fp128 %a, fp128 %b, half %d, half %e) {
+; CHECK-SD-NOFP16-LABEL: f128_half:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w30, -16
+; CHECK-SD-NOFP16-NEXT: .cfi_offset b8, -24
+; CHECK-SD-NOFP16-NEXT: .cfi_offset b9, -32
+; CHECK-SD-NOFP16-NEXT: fmov s8, s3
+; CHECK-SD-NOFP16-NEXT: fmov s9, s2
+; CHECK-SD-NOFP16-NEXT: bl __lttf2
+; CHECK-SD-NOFP16-NEXT: cmp w0, #0
+; CHECK-SD-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: fcsel s0, s9, s8, lt
+; CHECK-SD-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-SD-NOFP16-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: f128_half:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-FP16-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-FP16-NEXT: .cfi_offset w30, -16
+; CHECK-SD-FP16-NEXT: .cfi_offset b8, -24
+; CHECK-SD-FP16-NEXT: .cfi_offset b9, -32
+; CHECK-SD-FP16-NEXT: fmov s8, s3
+; CHECK-SD-FP16-NEXT: fmov s9, s2
+; CHECK-SD-FP16-NEXT: bl __lttf2
+; CHECK-SD-FP16-NEXT: cmp w0, #0
+; CHECK-SD-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-FP16-NEXT: fcsel h0, h9, h8, lt
+; CHECK-SD-FP16-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: f128_half:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 32
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NOFP16-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NOFP16-NEXT: .cfi_offset b9, -32
+; CHECK-GI-NOFP16-NEXT: fmov s8, s3
+; CHECK-GI-NOFP16-NEXT: fmov s9, s2
+; CHECK-GI-NOFP16-NEXT: bl __lttf2
+; CHECK-GI-NOFP16-NEXT: cmp w0, #0
+; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: fcsel s0, s9, s8, lt
+; CHECK-GI-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-NOFP16-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: f128_half:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 32
+; CHECK-GI-FP16-NEXT: .cfi_offset w30, -16
+; CHECK-GI-FP16-NEXT: .cfi_offset b8, -24
+; CHECK-GI-FP16-NEXT: .cfi_offset b9, -32
+; CHECK-GI-FP16-NEXT: fmov s8, s3
+; CHECK-GI-FP16-NEXT: fmov s9, s2
+; CHECK-GI-FP16-NEXT: bl __lttf2
+; CHECK-GI-FP16-NEXT: cmp w0, #0
+; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT: fcsel h0, h9, h8, lt
+; CHECK-GI-FP16-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = fcmp olt fp128 %a, %b
+ %s = select i1 %c, half %d, half %e
+ ret half %s
+}
+
define double @f64_double(double %a, double %b, double %d, double %e) {
; CHECK-LABEL: f64_double:
; CHECK: // %bb.0: // %entry
@@ -135,6 +355,186 @@ entry:
ret i32 %s
}
+define <2 x fp128> @v2f128_fp128(<2 x fp128> %a, <2 x fp128> %b, <2 x fp128> %d, <2 x fp128> %e) {
+; CHECK-LABEL: v2f128_fp128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 112
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: stp q4, q5, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: stp q1, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT: mov v1.16b, v2.16b
+; CHECK-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: b.ge .LBB12_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: .LBB12_2: // %entry
+; CHECK-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: b.ge .LBB12_4
+; CHECK-NEXT: // %bb.3: // %entry
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: .LBB12_4: // %entry
+; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <2 x fp128> %a, %b
+ %s = select <2 x i1> %c, <2 x fp128> %d, <2 x fp128> %e
+ ret <2 x fp128> %s
+}
+
+define <3 x fp128> @v3f128_fp128(<3 x fp128> %a, <3 x fp128> %b, <3 x fp128> %d, <3 x fp128> %e) {
+; CHECK-LABEL: v3f128_fp128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 112
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: stp q1, q4, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: stp q2, q5, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT: stp q6, q7, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: b.lt .LBB13_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: ldr q0, [sp, #128]
+; CHECK-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .LBB13_2: // %entry
+; CHECK-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: b.lt .LBB13_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: ldr q0, [sp, #144]
+; CHECK-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: .LBB13_4: // %entry
+; CHECK-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: add x8, sp, #160
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: add x9, sp, #112
+; CHECK-NEXT: csel x8, x9, x8, lt
+; CHECK-NEXT: ldp q0, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-NEXT: ldr q2, [x8]
+; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <3 x fp128> %a, %b
+ %s = select <3 x i1> %c, <3 x fp128> %d, <3 x fp128> %e
+ ret <3 x fp128> %s
+}
+
+
+define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double> %d, <2 x double> %e) {
+; CHECK-LABEL: v2f128_double:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: stp q4, q5, [sp, #48] // 32-byte Folded Spill
+; CHECK-NEXT: str q2, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: cset w8, lt
+; CHECK-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: cset w8, lt
+; CHECK-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ldp q2, q1, [sp, #48] // 32-byte Folded Reload
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <2 x fp128> %a, %b
+ %s = select <2 x i1> %c, <2 x double> %d, <2 x double> %e
+ ret <2 x double> %s
+}
+
+define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double> %d, <3 x double> %e) {
+; CHECK-LABEL: v3f128_double:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #160
+; CHECK-NEXT: str x30, [sp, #144] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 160
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: stp q2, q5, [sp, #112] // 32-byte Folded Spill
+; CHECK-NEXT: // kill: def $d6 killed $d6 def $q6
+; CHECK-NEXT: // kill: def $d7 killed $d7 def $q7
+; CHECK-NEXT: ldr d5, [sp, #184]
+; CHECK-NEXT: str q3, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: ldp d3, d2, [sp, #168]
+; CHECK-NEXT: mov v6.d[1], v7.d[0]
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: mov v1.16b, v4.16b
+; CHECK-NEXT: str q5, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: ldr d5, [sp, #160]
+; CHECK-NEXT: mov v3.d[1], v2.d[0]
+; CHECK-NEXT: str q5, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp q6, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: cset w8, lt
+; CHECK-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: cset w8, lt
+; CHECK-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: mov v1.d[1], v0.d[0]
+; CHECK-NEXT: str q1, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: ldp q0, q1, [sp, #112] // 32-byte Folded Reload
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldp q2, q4, [sp, #64] // 32-byte Folded Reload
+; CHECK-NEXT: cset w8, lt
+; CHECK-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NEXT: ldr q3, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: fmov d2, x8
+; CHECK-NEXT: bsl v2.16b, v4.16b, v3.16b
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT: add sp, sp, #160
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <3 x fp128> %a, %b
+ %s = select <3 x i1> %c, <3 x double> %d, <3 x double> %e
+ ret <3 x double> %s
+}
+
define <2 x double> @v2f64_double(<2 x double> %a, <2 x double> %b, <2 x double> %d, <2 x double> %e) {
; CHECK-LABEL: v2f64_double:
; CHECK: // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/itofp-bf16.ll b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
new file mode 100644
index 0000000..978fe0b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
@@ -0,0 +1,1832 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+
+define bfloat @stofp_i64_bf16(i64 %a) {
+; CHECK-LABEL: stofp_i64_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp x0, #0
+; CHECK-NEXT: and x11, x0, #0x8000000000000000
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: cneg x9, x0, mi
+; CHECK-NEXT: lsr x10, x9, #53
+; CHECK-NEXT: cmp x10, #0
+; CHECK-NEXT: and x10, x9, #0xfffffffffffff000
+; CHECK-NEXT: csel x10, x10, x9, ne
+; CHECK-NEXT: scvtf d0, x10
+; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: tst x9, #0xfff
+; CHECK-NEXT: csel w10, wzr, w10, eq
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: orr x9, x9, x11
+; CHECK-NEXT: orr x9, x9, x10
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i64 %a to bfloat
+ ret bfloat %c
+}
+
+define bfloat @utofp_i64_bf16(i64 %a) {
+; CHECK-LABEL: utofp_i64_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsr x9, x0, #53
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: cmp x9, #0
+; CHECK-NEXT: and x9, x0, #0xfffffffffffff000
+; CHECK-NEXT: csel x9, x9, x0, ne
+; CHECK-NEXT: ucvtf d0, x9
+; CHECK-NEXT: cset w9, ne
+; CHECK-NEXT: tst x0, #0xfff
+; CHECK-NEXT: csel w9, wzr, w9, eq
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: orr x9, x10, x9
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i64 %a to bfloat
+ ret bfloat %c
+}
+
+define bfloat @stofp_i32_bf16(i32 %a) {
+; CHECK-LABEL: stofp_i32_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf d0, w0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i32 %a to bfloat
+ ret bfloat %c
+}
+
+define bfloat @utofp_i32_bf16(i32 %a) {
+; CHECK-LABEL: utofp_i32_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf d0, w0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i32 %a to bfloat
+ ret bfloat %c
+}
+
+define bfloat @stofp_i16_bf16(i16 %a) {
+; CHECK-LABEL: stofp_i16_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sxth w9, w0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: scvtf s0, w9
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i16 %a to bfloat
+ ret bfloat %c
+}
+
+define bfloat @utofp_i16_bf16(i16 %a) {
+; CHECK-LABEL: utofp_i16_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: ucvtf s0, w9
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i16 %a to bfloat
+ ret bfloat %c
+}
+
+define bfloat @stofp_i8_bf16(i8 %a) {
+; CHECK-LABEL: stofp_i8_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sxtb w9, w0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: scvtf s0, w9
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i8 %a to bfloat
+ ret bfloat %c
+}
+
+define bfloat @utofp_i8_bf16(i8 %a) {
+; CHECK-LABEL: utofp_i8_bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w9, w0, #0xff
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: ucvtf s0, w9
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w10, w9, #16, #1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i8 %a to bfloat
+ ret bfloat %c
+}
+
+define <2 x bfloat> @stofp_v2i64_v2bf16(<2 x i64> %a) {
+; CHECK-LABEL: stofp_v2i64_v2bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: cmp x9, #0
+; CHECK-NEXT: cneg x10, x9, mi
+; CHECK-NEXT: and x9, x9, #0x8000000000000000
+; CHECK-NEXT: lsr x11, x10, #53
+; CHECK-NEXT: and x12, x10, #0xfffffffffffff000
+; CHECK-NEXT: cmp x11, #0
+; CHECK-NEXT: csel x11, x12, x10, ne
+; CHECK-NEXT: cset w12, ne
+; CHECK-NEXT: tst x10, #0xfff
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: csel w12, wzr, w12, eq
+; CHECK-NEXT: scvtf d0, x11
+; CHECK-NEXT: cmp x10, #0
+; CHECK-NEXT: cneg x13, x10, mi
+; CHECK-NEXT: and x10, x10, #0x8000000000000000
+; CHECK-NEXT: lsr x14, x13, #53
+; CHECK-NEXT: cmp x14, #0
+; CHECK-NEXT: and x14, x13, #0xfffffffffffff000
+; CHECK-NEXT: csel x11, x14, x13, ne
+; CHECK-NEXT: cset w14, ne
+; CHECK-NEXT: tst x13, #0xfff
+; CHECK-NEXT: scvtf d1, x11
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: orr x9, x11, x9
+; CHECK-NEXT: csel w11, wzr, w14, eq
+; CHECK-NEXT: fmov x13, d1
+; CHECK-NEXT: orr x9, x9, x12
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: orr x10, x13, x10
+; CHECK-NEXT: orr x10, x10, x11
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fmov d1, x10
+; CHECK-NEXT: fcvtxn s1, d1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w11, w9, #16, #1
+; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: add w9, w11, w9
+; CHECK-NEXT: lsr w9, w9, #16
+; CHECK-NEXT: ubfx w12, w10, #16, #1
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: add w8, w12, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i64> %a to <2 x bfloat>
+ ret <2 x bfloat> %c
+}
+
+define <2 x bfloat> @utofp_v2i64_v2bf16(<2 x i64> %a) {
+; CHECK-LABEL: utofp_v2i64_v2bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: lsr x10, x9, #53
+; CHECK-NEXT: and x12, x9, #0xfffffffffffff000
+; CHECK-NEXT: cmp x10, #0
+; CHECK-NEXT: lsr x10, x11, #53
+; CHECK-NEXT: csel x12, x12, x9, ne
+; CHECK-NEXT: cset w13, ne
+; CHECK-NEXT: tst x9, #0xfff
+; CHECK-NEXT: csel w9, wzr, w13, eq
+; CHECK-NEXT: cmp x10, #0
+; CHECK-NEXT: and x10, x11, #0xfffffffffffff000
+; CHECK-NEXT: csel x10, x10, x11, ne
+; CHECK-NEXT: ucvtf d0, x12
+; CHECK-NEXT: ucvtf d1, x10
+; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: tst x11, #0xfff
+; CHECK-NEXT: csel w10, wzr, w10, eq
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: fmov x12, d1
+; CHECK-NEXT: orr x9, x11, x9
+; CHECK-NEXT: orr x10, x12, x10
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: fmov d1, x10
+; CHECK-NEXT: fcvtxn s0, d0
+; CHECK-NEXT: fcvtxn s1, d1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: ubfx w11, w9, #16, #1
+; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: ubfx w12, w10, #16, #1
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: add w9, w11, w9
+; CHECK-NEXT: add w8, w12, w8
+; CHECK-NEXT: lsr w9, w9, #16
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i64> %a to <2 x bfloat>
+ ret <2 x bfloat> %c
+}
+
+define <3 x bfloat> @stofp_v3i64_v3bf16(<3 x i64> %a) {
+; CHECK-LABEL: stofp_v3i64_v3bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: scvtf v1.2d, v2.2d
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: scvtf v0.2d, v0.2d
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i64> %a to <3 x bfloat>
+ ret <3 x bfloat> %c
+}
+
+define <3 x bfloat> @utofp_v3i64_v3bf16(<3 x i64> %a) {
+; CHECK-LABEL: utofp_v3i64_v3bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ucvtf v1.2d, v2.2d
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i64> %a to <3 x bfloat>
+ ret <3 x bfloat> %c
+}
+
+define <4 x bfloat> @stofp_v4i64_v4bf16(<4 x i64> %a) {
+; CHECK-LABEL: stofp_v4i64_v4bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf v0.2d, v0.2d
+; CHECK-NEXT: scvtf v1.2d, v1.2d
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <4 x i64> %a to <4 x bfloat>
+ ret <4 x bfloat> %c
+}
+
+define <4 x bfloat> @utofp_v4i64_v4bf16(<4 x i64> %a) {
+; CHECK-LABEL: utofp_v4i64_v4bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <4 x i64> %a to <4 x bfloat>
+ ret <4 x bfloat> %c
+}
+
+define <8 x bfloat> @stofp_v8i64_v8bf16(<8 x i64> %a) {
+; CHECK-LABEL: stofp_v8i64_v8bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf v2.2d, v2.2d
+; CHECK-NEXT: scvtf v0.2d, v0.2d
+; CHECK-NEXT: scvtf v3.2d, v3.2d
+; CHECK-NEXT: scvtf v1.2d, v1.2d
+; CHECK-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v3.4s, #127, msl #8
+; CHECK-NEXT: ushr v4.4s, v2.4s, #16
+; CHECK-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-NEXT: add v6.4s, v2.4s, v3.4s
+; CHECK-NEXT: add v3.4s, v0.4s, v3.4s
+; CHECK-NEXT: and v4.16b, v4.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b
+; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <8 x i64> %a to <8 x bfloat>
+ ret <8 x bfloat> %c
+}
+
+define <8 x bfloat> @utofp_v8i64_v8bf16(<8 x i64> %a) {
+; CHECK-LABEL: utofp_v8i64_v8bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v3.4s, #127, msl #8
+; CHECK-NEXT: ushr v4.4s, v2.4s, #16
+; CHECK-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-NEXT: add v6.4s, v2.4s, v3.4s
+; CHECK-NEXT: add v3.4s, v0.4s, v3.4s
+; CHECK-NEXT: and v4.16b, v4.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b
+; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <8 x i64> %a to <8 x bfloat>
+ ret <8 x bfloat> %c
+}
+
+define <16 x bfloat> @stofp_v16i64_v16bf16(<16 x i64> %a) {
+; CHECK-LABEL: stofp_v16i64_v16bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf v0.2d, v0.2d
+; CHECK-NEXT: scvtf v2.2d, v2.2d
+; CHECK-NEXT: scvtf v6.2d, v6.2d
+; CHECK-NEXT: scvtf v4.2d, v4.2d
+; CHECK-NEXT: scvtf v1.2d, v1.2d
+; CHECK-NEXT: scvtf v3.2d, v3.2d
+; CHECK-NEXT: scvtf v7.2d, v7.2d
+; CHECK-NEXT: scvtf v5.2d, v5.2d
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-NEXT: fcvtn v6.2s, v6.2d
+; CHECK-NEXT: fcvtn v4.2s, v4.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-NEXT: fcvtn2 v6.4s, v7.2d
+; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v3.4s, #127, msl #8
+; CHECK-NEXT: ushr v7.4s, v0.4s, #16
+; CHECK-NEXT: ushr v5.4s, v2.4s, #16
+; CHECK-NEXT: ushr v16.4s, v6.4s, #16
+; CHECK-NEXT: ushr v17.4s, v4.4s, #16
+; CHECK-NEXT: add v19.4s, v0.4s, v3.4s
+; CHECK-NEXT: add v18.4s, v2.4s, v3.4s
+; CHECK-NEXT: add v20.4s, v6.4s, v3.4s
+; CHECK-NEXT: add v3.4s, v4.4s, v3.4s
+; CHECK-NEXT: and v7.16b, v7.16b, v1.16b
+; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v17.16b, v1.16b
+; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s
+; CHECK-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-NEXT: add v7.4s, v7.4s, v19.4s
+; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s
+; CHECK-NEXT: add v5.4s, v5.4s, v18.4s
+; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s
+; CHECK-NEXT: add v16.4s, v16.4s, v20.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: orr v6.4s, #64, lsl #16
+; CHECK-NEXT: orr v4.4s, #64, lsl #16
+; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b
+; CHECK-NEXT: mov v5.16b, v19.16b
+; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b
+; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b
+; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: uzp2 v1.8h, v1.8h, v5.8h
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <16 x i64> %a to <16 x bfloat>
+ ret <16 x bfloat> %c
+}
+
+define <16 x bfloat> @utofp_v16i64_v16bf16(<16 x i64> %a) {
+; CHECK-LABEL: utofp_v16i64_v16bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-NEXT: ucvtf v6.2d, v6.2d
+; CHECK-NEXT: ucvtf v4.2d, v4.2d
+; CHECK-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-NEXT: ucvtf v7.2d, v7.2d
+; CHECK-NEXT: ucvtf v5.2d, v5.2d
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-NEXT: fcvtn v6.2s, v6.2d
+; CHECK-NEXT: fcvtn v4.2s, v4.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-NEXT: fcvtn2 v6.4s, v7.2d
+; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v3.4s, #127, msl #8
+; CHECK-NEXT: ushr v7.4s, v0.4s, #16
+; CHECK-NEXT: ushr v5.4s, v2.4s, #16
+; CHECK-NEXT: ushr v16.4s, v6.4s, #16
+; CHECK-NEXT: ushr v17.4s, v4.4s, #16
+; CHECK-NEXT: add v19.4s, v0.4s, v3.4s
+; CHECK-NEXT: add v18.4s, v2.4s, v3.4s
+; CHECK-NEXT: add v20.4s, v6.4s, v3.4s
+; CHECK-NEXT: add v3.4s, v4.4s, v3.4s
+; CHECK-NEXT: and v7.16b, v7.16b, v1.16b
+; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v17.16b, v1.16b
+; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s
+; CHECK-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-NEXT: add v7.4s, v7.4s, v19.4s
+; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s
+; CHECK-NEXT: add v5.4s, v5.4s, v18.4s
+; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s
+; CHECK-NEXT: add v16.4s, v16.4s, v20.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: orr v6.4s, #64, lsl #16
+; CHECK-NEXT: orr v4.4s, #64, lsl #16
+; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b
+; CHECK-NEXT: mov v5.16b, v19.16b
+; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b
+; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b
+; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: uzp2 v1.8h, v1.8h, v5.8h
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <16 x i64> %a to <16 x bfloat>
+ ret <16 x bfloat> %c
+}
+
+define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) {
+; CHECK-LABEL: stofp_v32i64_v32bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf v17.2d, v2.2d
+; CHECK-NEXT: scvtf v18.2d, v0.2d
+; CHECK-NEXT: scvtf v19.2d, v3.2d
+; CHECK-NEXT: scvtf v3.2d, v6.2d
+; CHECK-NEXT: ldp q21, q20, [sp, #32]
+; CHECK-NEXT: scvtf v4.2d, v4.2d
+; CHECK-NEXT: scvtf v6.2d, v7.2d
+; CHECK-NEXT: scvtf v5.2d, v5.2d
+; CHECK-NEXT: ldp q24, q23, [sp, #64]
+; CHECK-NEXT: movi v16.4s, #1
+; CHECK-NEXT: fcvtn v0.2s, v17.2d
+; CHECK-NEXT: scvtf v17.2d, v1.2d
+; CHECK-NEXT: fcvtn v1.2s, v18.2d
+; CHECK-NEXT: fcvtn v3.2s, v3.2d
+; CHECK-NEXT: ldp q18, q7, [sp]
+; CHECK-NEXT: scvtf v21.2d, v21.2d
+; CHECK-NEXT: fcvtn v4.2s, v4.2d
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: scvtf v20.2d, v20.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v19.2d
+; CHECK-NEXT: ldp q22, q19, [sp, #96]
+; CHECK-NEXT: fcvtn2 v1.4s, v17.2d
+; CHECK-NEXT: fcvtn2 v3.4s, v6.2d
+; CHECK-NEXT: scvtf v18.2d, v18.2d
+; CHECK-NEXT: scvtf v17.2d, v24.2d
+; CHECK-NEXT: fcvtn v6.2s, v21.2d
+; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
+; CHECK-NEXT: scvtf v22.2d, v22.2d
+; CHECK-NEXT: scvtf v21.2d, v23.2d
+; CHECK-NEXT: scvtf v7.2d, v7.2d
+; CHECK-NEXT: ushr v24.4s, v0.4s, #16
+; CHECK-NEXT: add v5.4s, v0.4s, v2.4s
+; CHECK-NEXT: scvtf v19.2d, v19.2d
+; CHECK-NEXT: ushr v23.4s, v1.4s, #16
+; CHECK-NEXT: ushr v25.4s, v3.4s, #16
+; CHECK-NEXT: fcvtn v18.2s, v18.2d
+; CHECK-NEXT: fcvtn2 v6.4s, v20.2d
+; CHECK-NEXT: add v26.4s, v1.4s, v2.4s
+; CHECK-NEXT: fcvtn v17.2s, v17.2d
+; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
+; CHECK-NEXT: fcvtn v22.2s, v22.2d
+; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s
+; CHECK-NEXT: and v23.16b, v23.16b, v16.16b
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s
+; CHECK-NEXT: fcvtn2 v18.4s, v7.2d
+; CHECK-NEXT: add v7.4s, v3.4s, v2.4s
+; CHECK-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-NEXT: add v5.4s, v24.4s, v5.4s
+; CHECK-NEXT: and v24.16b, v25.16b, v16.16b
+; CHECK-NEXT: ushr v25.4s, v4.4s, #16
+; CHECK-NEXT: fcvtn2 v22.4s, v19.2d
+; CHECK-NEXT: add v19.4s, v23.4s, v26.4s
+; CHECK-NEXT: ushr v26.4s, v6.4s, #16
+; CHECK-NEXT: fcvtn2 v17.4s, v21.2d
+; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s
+; CHECK-NEXT: orr v1.4s, #64, lsl #16
+; CHECK-NEXT: and v23.16b, v25.16b, v16.16b
+; CHECK-NEXT: add v25.4s, v4.4s, v2.4s
+; CHECK-NEXT: add v7.4s, v24.4s, v7.4s
+; CHECK-NEXT: ushr v24.4s, v18.4s, #16
+; CHECK-NEXT: add v30.4s, v18.4s, v2.4s
+; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b
+; CHECK-NEXT: ushr v28.4s, v22.4s, #16
+; CHECK-NEXT: add v31.4s, v22.4s, v2.4s
+; CHECK-NEXT: add v23.4s, v23.4s, v25.4s
+; CHECK-NEXT: and v25.16b, v26.16b, v16.16b
+; CHECK-NEXT: add v26.4s, v6.4s, v2.4s
+; CHECK-NEXT: ushr v29.4s, v17.4s, #16
+; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
+; CHECK-NEXT: add v2.4s, v17.4s, v2.4s
+; CHECK-NEXT: and v28.16b, v28.16b, v16.16b
+; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b
+; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b
+; CHECK-NEXT: add v25.4s, v25.4s, v26.4s
+; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s
+; CHECK-NEXT: orr v6.4s, #64, lsl #16
+; CHECK-NEXT: and v16.16b, v29.16b, v16.16b
+; CHECK-NEXT: add v24.4s, v24.4s, v30.4s
+; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s
+; CHECK-NEXT: add v28.4s, v28.4s, v31.4s
+; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s
+; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s
+; CHECK-NEXT: orr v4.4s, #64, lsl #16
+; CHECK-NEXT: orr v18.4s, #64, lsl #16
+; CHECK-NEXT: orr v22.4s, #64, lsl #16
+; CHECK-NEXT: mov v5.16b, v26.16b
+; CHECK-NEXT: add v2.4s, v16.4s, v2.4s
+; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s
+; CHECK-NEXT: orr v17.4s, #64, lsl #16
+; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: mov v7.16b, v31.16b
+; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b
+; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b
+; CHECK-NEXT: mov v6.16b, v30.16b
+; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b
+; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b
+; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b
+; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h
+; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h
+; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <32 x i64> %a to <32 x bfloat>
+ ret <32 x bfloat> %c
+}
+
+define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) {
+; CHECK-LABEL: utofp_v32i64_v32bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf v17.2d, v2.2d
+; CHECK-NEXT: ucvtf v18.2d, v0.2d
+; CHECK-NEXT: ucvtf v19.2d, v3.2d
+; CHECK-NEXT: ucvtf v3.2d, v6.2d
+; CHECK-NEXT: ldp q21, q20, [sp, #32]
+; CHECK-NEXT: ucvtf v4.2d, v4.2d
+; CHECK-NEXT: ucvtf v6.2d, v7.2d
+; CHECK-NEXT: ucvtf v5.2d, v5.2d
+; CHECK-NEXT: ldp q24, q23, [sp, #64]
+; CHECK-NEXT: movi v16.4s, #1
+; CHECK-NEXT: fcvtn v0.2s, v17.2d
+; CHECK-NEXT: ucvtf v17.2d, v1.2d
+; CHECK-NEXT: fcvtn v1.2s, v18.2d
+; CHECK-NEXT: fcvtn v3.2s, v3.2d
+; CHECK-NEXT: ldp q18, q7, [sp]
+; CHECK-NEXT: ucvtf v21.2d, v21.2d
+; CHECK-NEXT: fcvtn v4.2s, v4.2d
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: ucvtf v20.2d, v20.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v19.2d
+; CHECK-NEXT: ldp q22, q19, [sp, #96]
+; CHECK-NEXT: fcvtn2 v1.4s, v17.2d
+; CHECK-NEXT: fcvtn2 v3.4s, v6.2d
+; CHECK-NEXT: ucvtf v18.2d, v18.2d
+; CHECK-NEXT: ucvtf v17.2d, v24.2d
+; CHECK-NEXT: fcvtn v6.2s, v21.2d
+; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
+; CHECK-NEXT: ucvtf v22.2d, v22.2d
+; CHECK-NEXT: ucvtf v21.2d, v23.2d
+; CHECK-NEXT: ucvtf v7.2d, v7.2d
+; CHECK-NEXT: ushr v24.4s, v0.4s, #16
+; CHECK-NEXT: add v5.4s, v0.4s, v2.4s
+; CHECK-NEXT: ucvtf v19.2d, v19.2d
+; CHECK-NEXT: ushr v23.4s, v1.4s, #16
+; CHECK-NEXT: ushr v25.4s, v3.4s, #16
+; CHECK-NEXT: fcvtn v18.2s, v18.2d
+; CHECK-NEXT: fcvtn2 v6.4s, v20.2d
+; CHECK-NEXT: add v26.4s, v1.4s, v2.4s
+; CHECK-NEXT: fcvtn v17.2s, v17.2d
+; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
+; CHECK-NEXT: fcvtn v22.2s, v22.2d
+; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s
+; CHECK-NEXT: and v23.16b, v23.16b, v16.16b
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s
+; CHECK-NEXT: fcvtn2 v18.4s, v7.2d
+; CHECK-NEXT: add v7.4s, v3.4s, v2.4s
+; CHECK-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-NEXT: add v5.4s, v24.4s, v5.4s
+; CHECK-NEXT: and v24.16b, v25.16b, v16.16b
+; CHECK-NEXT: ushr v25.4s, v4.4s, #16
+; CHECK-NEXT: fcvtn2 v22.4s, v19.2d
+; CHECK-NEXT: add v19.4s, v23.4s, v26.4s
+; CHECK-NEXT: ushr v26.4s, v6.4s, #16
+; CHECK-NEXT: fcvtn2 v17.4s, v21.2d
+; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s
+; CHECK-NEXT: orr v1.4s, #64, lsl #16
+; CHECK-NEXT: and v23.16b, v25.16b, v16.16b
+; CHECK-NEXT: add v25.4s, v4.4s, v2.4s
+; CHECK-NEXT: add v7.4s, v24.4s, v7.4s
+; CHECK-NEXT: ushr v24.4s, v18.4s, #16
+; CHECK-NEXT: add v30.4s, v18.4s, v2.4s
+; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b
+; CHECK-NEXT: ushr v28.4s, v22.4s, #16
+; CHECK-NEXT: add v31.4s, v22.4s, v2.4s
+; CHECK-NEXT: add v23.4s, v23.4s, v25.4s
+; CHECK-NEXT: and v25.16b, v26.16b, v16.16b
+; CHECK-NEXT: add v26.4s, v6.4s, v2.4s
+; CHECK-NEXT: ushr v29.4s, v17.4s, #16
+; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
+; CHECK-NEXT: add v2.4s, v17.4s, v2.4s
+; CHECK-NEXT: and v28.16b, v28.16b, v16.16b
+; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b
+; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b
+; CHECK-NEXT: add v25.4s, v25.4s, v26.4s
+; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s
+; CHECK-NEXT: orr v6.4s, #64, lsl #16
+; CHECK-NEXT: and v16.16b, v29.16b, v16.16b
+; CHECK-NEXT: add v24.4s, v24.4s, v30.4s
+; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s
+; CHECK-NEXT: add v28.4s, v28.4s, v31.4s
+; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s
+; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s
+; CHECK-NEXT: orr v4.4s, #64, lsl #16
+; CHECK-NEXT: orr v18.4s, #64, lsl #16
+; CHECK-NEXT: orr v22.4s, #64, lsl #16
+; CHECK-NEXT: mov v5.16b, v26.16b
+; CHECK-NEXT: add v2.4s, v16.4s, v2.4s
+; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s
+; CHECK-NEXT: orr v17.4s, #64, lsl #16
+; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: mov v7.16b, v31.16b
+; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b
+; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b
+; CHECK-NEXT: mov v6.16b, v30.16b
+; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b
+; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b
+; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b
+; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h
+; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h
+; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <32 x i64> %a to <32 x bfloat>
+ ret <32 x bfloat> %c
+}
+
+define <2 x bfloat> @stofp_v2i32_v2bf16(<2 x i32> %a) {
+; CHECK-LABEL: stofp_v2i32_v2bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i32> %a to <2 x bfloat>
+ ret <2 x bfloat> %c
+}
+
+define <2 x bfloat> @utofp_v2i32_v2bf16(<2 x i32> %a) {
+; CHECK-LABEL: utofp_v2i32_v2bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i32> %a to <2 x bfloat>
+ ret <2 x bfloat> %c
+}
+
+define <3 x bfloat> @stofp_v3i32_v3bf16(<3 x i32> %a) {
+; CHECK-LABEL: stofp_v3i32_v3bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i32> %a to <3 x bfloat>
+ ret <3 x bfloat> %c
+}
+
+define <3 x bfloat> @utofp_v3i32_v3bf16(<3 x i32> %a) {
+; CHECK-LABEL: utofp_v3i32_v3bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i32> %a to <3 x bfloat>
+ ret <3 x bfloat> %c
+}
+
+define <4 x bfloat> @stofp_v4i32_v4bf16(<4 x i32> %a) {
+; CHECK-LABEL: stofp_v4i32_v4bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <4 x i32> %a to <4 x bfloat>
+ ret <4 x bfloat> %c
+}
+
+define <4 x bfloat> @utofp_v4i32_v4bf16(<4 x i32> %a) {
+; CHECK-LABEL: utofp_v4i32_v4bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <4 x i32> %a to <4 x bfloat>
+ ret <4 x bfloat> %c
+}
+
+define <8 x bfloat> @stofp_v8i32_v8bf16(<8 x i32> %a) {
+; CHECK-LABEL: stofp_v8i32_v8bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: scvtf v1.4s, v1.4s
+; CHECK-NEXT: movi v5.4s, #127, msl #8
+; CHECK-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-NEXT: ushr v4.4s, v1.4s, #16
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v5.4s
+; CHECK-NEXT: addhn2 v0.8h, v1.4s, v5.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <8 x i32> %a to <8 x bfloat>
+ ret <8 x bfloat> %c
+}
+
+define <8 x bfloat> @utofp_v8i32_v8bf16(<8 x i32> %a) {
+; CHECK-LABEL: utofp_v8i32_v8bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-NEXT: movi v5.4s, #127, msl #8
+; CHECK-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-NEXT: ushr v4.4s, v1.4s, #16
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v5.4s
+; CHECK-NEXT: addhn2 v0.8h, v1.4s, v5.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <8 x i32> %a to <8 x bfloat>
+ ret <8 x bfloat> %c
+}
+
+define <16 x bfloat> @stofp_v16i32_v16bf16(<16 x i32> %a) {
+; CHECK-LABEL: stofp_v16i32_v16bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf v2.4s, v2.4s
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: scvtf v4.4s, v1.4s
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: scvtf v3.4s, v3.4s
+; CHECK-NEXT: movi v17.4s, #127, msl #8
+; CHECK-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-NEXT: ushr v6.4s, v2.4s, #16
+; CHECK-NEXT: ushr v7.4s, v4.4s, #16
+; CHECK-NEXT: ushr v16.4s, v3.4s, #16
+; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-NEXT: and v6.16b, v6.16b, v1.16b
+; CHECK-NEXT: add v0.4s, v5.4s, v0.4s
+; CHECK-NEXT: add v2.4s, v6.4s, v2.4s
+; CHECK-NEXT: and v5.16b, v7.16b, v1.16b
+; CHECK-NEXT: and v6.16b, v16.16b, v1.16b
+; CHECK-NEXT: addhn v0.4h, v0.4s, v17.4s
+; CHECK-NEXT: addhn v1.4h, v2.4s, v17.4s
+; CHECK-NEXT: add v2.4s, v5.4s, v4.4s
+; CHECK-NEXT: add v3.4s, v6.4s, v3.4s
+; CHECK-NEXT: addhn2 v0.8h, v2.4s, v17.4s
+; CHECK-NEXT: addhn2 v1.8h, v3.4s, v17.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <16 x i32> %a to <16 x bfloat>
+ ret <16 x bfloat> %c
+}
+
+define <16 x bfloat> @utofp_v16i32_v16bf16(<16 x i32> %a) {
+; CHECK-LABEL: utofp_v16i32_v16bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: ucvtf v4.4s, v1.4s
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-NEXT: movi v17.4s, #127, msl #8
+; CHECK-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-NEXT: ushr v6.4s, v2.4s, #16
+; CHECK-NEXT: ushr v7.4s, v4.4s, #16
+; CHECK-NEXT: ushr v16.4s, v3.4s, #16
+; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-NEXT: and v6.16b, v6.16b, v1.16b
+; CHECK-NEXT: add v0.4s, v5.4s, v0.4s
+; CHECK-NEXT: add v2.4s, v6.4s, v2.4s
+; CHECK-NEXT: and v5.16b, v7.16b, v1.16b
+; CHECK-NEXT: and v6.16b, v16.16b, v1.16b
+; CHECK-NEXT: addhn v0.4h, v0.4s, v17.4s
+; CHECK-NEXT: addhn v1.4h, v2.4s, v17.4s
+; CHECK-NEXT: add v2.4s, v5.4s, v4.4s
+; CHECK-NEXT: add v3.4s, v6.4s, v3.4s
+; CHECK-NEXT: addhn2 v0.8h, v2.4s, v17.4s
+; CHECK-NEXT: addhn2 v1.8h, v3.4s, v17.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <16 x i32> %a to <16 x bfloat>
+ ret <16 x bfloat> %c
+}
+
+define <32 x bfloat> @stofp_v32i32_v32bf16(<32 x i32> %a) {
+; CHECK-LABEL: stofp_v32i32_v32bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: scvtf v2.4s, v2.4s
+; CHECK-NEXT: scvtf v4.4s, v4.4s
+; CHECK-NEXT: scvtf v6.4s, v6.4s
+; CHECK-NEXT: movi v16.4s, #1
+; CHECK-NEXT: scvtf v1.4s, v1.4s
+; CHECK-NEXT: scvtf v17.4s, v3.4s
+; CHECK-NEXT: scvtf v5.4s, v5.4s
+; CHECK-NEXT: scvtf v7.4s, v7.4s
+; CHECK-NEXT: movi v21.4s, #127, msl #8
+; CHECK-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-NEXT: ushr v18.4s, v2.4s, #16
+; CHECK-NEXT: ushr v19.4s, v4.4s, #16
+; CHECK-NEXT: ushr v20.4s, v6.4s, #16
+; CHECK-NEXT: ushr v22.4s, v1.4s, #16
+; CHECK-NEXT: ushr v23.4s, v17.4s, #16
+; CHECK-NEXT: ushr v24.4s, v5.4s, #16
+; CHECK-NEXT: ushr v25.4s, v7.4s, #16
+; CHECK-NEXT: and v3.16b, v3.16b, v16.16b
+; CHECK-NEXT: and v18.16b, v18.16b, v16.16b
+; CHECK-NEXT: and v19.16b, v19.16b, v16.16b
+; CHECK-NEXT: and v20.16b, v20.16b, v16.16b
+; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: and v3.16b, v22.16b, v16.16b
+; CHECK-NEXT: add v2.4s, v18.4s, v2.4s
+; CHECK-NEXT: add v4.4s, v19.4s, v4.4s
+; CHECK-NEXT: add v6.4s, v20.4s, v6.4s
+; CHECK-NEXT: and v18.16b, v23.16b, v16.16b
+; CHECK-NEXT: and v19.16b, v24.16b, v16.16b
+; CHECK-NEXT: and v16.16b, v25.16b, v16.16b
+; CHECK-NEXT: add v20.4s, v3.4s, v1.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v21.4s
+; CHECK-NEXT: addhn v1.4h, v2.4s, v21.4s
+; CHECK-NEXT: addhn v2.4h, v4.4s, v21.4s
+; CHECK-NEXT: addhn v3.4h, v6.4s, v21.4s
+; CHECK-NEXT: add v4.4s, v18.4s, v17.4s
+; CHECK-NEXT: add v5.4s, v19.4s, v5.4s
+; CHECK-NEXT: add v6.4s, v16.4s, v7.4s
+; CHECK-NEXT: addhn2 v0.8h, v20.4s, v21.4s
+; CHECK-NEXT: addhn2 v1.8h, v4.4s, v21.4s
+; CHECK-NEXT: addhn2 v2.8h, v5.4s, v21.4s
+; CHECK-NEXT: addhn2 v3.8h, v6.4s, v21.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <32 x i32> %a to <32 x bfloat>
+ ret <32 x bfloat> %c
+}
+
+define <32 x bfloat> @utofp_v32i32_v32bf16(<32 x i32> %a) {
+; CHECK-LABEL: utofp_v32i32_v32bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-NEXT: ucvtf v4.4s, v4.4s
+; CHECK-NEXT: ucvtf v6.4s, v6.4s
+; CHECK-NEXT: movi v16.4s, #1
+; CHECK-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-NEXT: ucvtf v17.4s, v3.4s
+; CHECK-NEXT: ucvtf v5.4s, v5.4s
+; CHECK-NEXT: ucvtf v7.4s, v7.4s
+; CHECK-NEXT: movi v21.4s, #127, msl #8
+; CHECK-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-NEXT: ushr v18.4s, v2.4s, #16
+; CHECK-NEXT: ushr v19.4s, v4.4s, #16
+; CHECK-NEXT: ushr v20.4s, v6.4s, #16
+; CHECK-NEXT: ushr v22.4s, v1.4s, #16
+; CHECK-NEXT: ushr v23.4s, v17.4s, #16
+; CHECK-NEXT: ushr v24.4s, v5.4s, #16
+; CHECK-NEXT: ushr v25.4s, v7.4s, #16
+; CHECK-NEXT: and v3.16b, v3.16b, v16.16b
+; CHECK-NEXT: and v18.16b, v18.16b, v16.16b
+; CHECK-NEXT: and v19.16b, v19.16b, v16.16b
+; CHECK-NEXT: and v20.16b, v20.16b, v16.16b
+; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: and v3.16b, v22.16b, v16.16b
+; CHECK-NEXT: add v2.4s, v18.4s, v2.4s
+; CHECK-NEXT: add v4.4s, v19.4s, v4.4s
+; CHECK-NEXT: add v6.4s, v20.4s, v6.4s
+; CHECK-NEXT: and v18.16b, v23.16b, v16.16b
+; CHECK-NEXT: and v19.16b, v24.16b, v16.16b
+; CHECK-NEXT: and v16.16b, v25.16b, v16.16b
+; CHECK-NEXT: add v20.4s, v3.4s, v1.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v21.4s
+; CHECK-NEXT: addhn v1.4h, v2.4s, v21.4s
+; CHECK-NEXT: addhn v2.4h, v4.4s, v21.4s
+; CHECK-NEXT: addhn v3.4h, v6.4s, v21.4s
+; CHECK-NEXT: add v4.4s, v18.4s, v17.4s
+; CHECK-NEXT: add v5.4s, v19.4s, v5.4s
+; CHECK-NEXT: add v6.4s, v16.4s, v7.4s
+; CHECK-NEXT: addhn2 v0.8h, v20.4s, v21.4s
+; CHECK-NEXT: addhn2 v1.8h, v4.4s, v21.4s
+; CHECK-NEXT: addhn2 v2.8h, v5.4s, v21.4s
+; CHECK-NEXT: addhn2 v3.8h, v6.4s, v21.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <32 x i32> %a to <32 x bfloat>
+ ret <32 x bfloat> %c
+}
+
+define <2 x bfloat> @stofp_v2i16_v2bf16(<2 x i16> %a) {
+; CHECK-LABEL: stofp_v2i16_v2bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i16> %a to <2 x bfloat>
+ ret <2 x bfloat> %c
+}
+
+define <2 x bfloat> @utofp_v2i16_v2bf16(<2 x i16> %a) {
+; CHECK-LABEL: utofp_v2i16_v2bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i16> %a to <2 x bfloat>
+ ret <2 x bfloat> %c
+}
+
+define <3 x bfloat> @stofp_v3i16_v3bf16(<3 x i16> %a) {
+; CHECK-LABEL: stofp_v3i16_v3bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i16> %a to <3 x bfloat>
+ ret <3 x bfloat> %c
+}
+
+define <3 x bfloat> @utofp_v3i16_v3bf16(<3 x i16> %a) {
+; CHECK-LABEL: utofp_v3i16_v3bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i16> %a to <3 x bfloat>
+ ret <3 x bfloat> %c
+}
+
+define <4 x bfloat> @stofp_v4i16_v4bf16(<4 x i16> %a) {
+; CHECK-LABEL: stofp_v4i16_v4bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <4 x i16> %a to <4 x bfloat>
+ ret <4 x bfloat> %c
+}
+
+define <4 x bfloat> @utofp_v4i16_v4bf16(<4 x i16> %a) {
+; CHECK-LABEL: utofp_v4i16_v4bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <4 x i16> %a to <4 x bfloat>
+ ret <4 x bfloat> %c
+}
+
+define <8 x bfloat> @stofp_v8i16_v8bf16(<8 x i16> %a) {
+; CHECK-LABEL: stofp_v8i16_v8bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sshll v2.4s, v0.4h, #0
+; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v4.4s, #127, msl #8
+; CHECK-NEXT: scvtf v2.4s, v2.4s
+; CHECK-NEXT: scvtf v3.4s, v0.4s
+; CHECK-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <8 x i16> %a to <8 x bfloat>
+ ret <8 x bfloat> %c
+}
+
+define <8 x bfloat> @utofp_v8i16_v8bf16(<8 x i16> %a) {
+; CHECK-LABEL: utofp_v8i16_v8bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v4.4s, #127, msl #8
+; CHECK-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-NEXT: ucvtf v3.4s, v0.4s
+; CHECK-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <8 x i16> %a to <8 x bfloat>
+ ret <8 x bfloat> %c
+}
+
+define <16 x bfloat> @stofp_v16i16_v16bf16(<16 x i16> %a) {
+; CHECK-LABEL: stofp_v16i16_v16bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sshll v3.4s, v0.4h, #0
+; CHECK-NEXT: sshll v4.4s, v1.4h, #0
+; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: movi v7.4s, #127, msl #8
+; CHECK-NEXT: scvtf v3.4s, v3.4s
+; CHECK-NEXT: scvtf v4.4s, v4.4s
+; CHECK-NEXT: scvtf v5.4s, v0.4s
+; CHECK-NEXT: scvtf v6.4s, v1.4s
+; CHECK-NEXT: ushr v0.4s, v3.4s, #16
+; CHECK-NEXT: ushr v1.4s, v4.4s, #16
+; CHECK-NEXT: ushr v16.4s, v5.4s, #16
+; CHECK-NEXT: ushr v17.4s, v6.4s, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v16.16b, v16.16b, v2.16b
+; CHECK-NEXT: and v2.16b, v17.16b, v2.16b
+; CHECK-NEXT: add v0.4s, v0.4s, v7.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v7.4s
+; CHECK-NEXT: add v2.4s, v2.4s, v7.4s
+; CHECK-NEXT: addhn v0.4h, v3.4s, v0.4s
+; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s
+; CHECK-NEXT: add v3.4s, v16.4s, v7.4s
+; CHECK-NEXT: addhn2 v0.8h, v5.4s, v3.4s
+; CHECK-NEXT: addhn2 v1.8h, v6.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <16 x i16> %a to <16 x bfloat>
+ ret <16 x bfloat> %c
+}
+
+define <16 x bfloat> @utofp_v16i16_v16bf16(<16 x i16> %a) {
+; CHECK-LABEL: utofp_v16i16_v16bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushll v3.4s, v0.4h, #0
+; CHECK-NEXT: ushll v4.4s, v1.4h, #0
+; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: movi v7.4s, #127, msl #8
+; CHECK-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-NEXT: ucvtf v4.4s, v4.4s
+; CHECK-NEXT: ucvtf v5.4s, v0.4s
+; CHECK-NEXT: ucvtf v6.4s, v1.4s
+; CHECK-NEXT: ushr v0.4s, v3.4s, #16
+; CHECK-NEXT: ushr v1.4s, v4.4s, #16
+; CHECK-NEXT: ushr v16.4s, v5.4s, #16
+; CHECK-NEXT: ushr v17.4s, v6.4s, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v16.16b, v16.16b, v2.16b
+; CHECK-NEXT: and v2.16b, v17.16b, v2.16b
+; CHECK-NEXT: add v0.4s, v0.4s, v7.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v7.4s
+; CHECK-NEXT: add v2.4s, v2.4s, v7.4s
+; CHECK-NEXT: addhn v0.4h, v3.4s, v0.4s
+; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s
+; CHECK-NEXT: add v3.4s, v16.4s, v7.4s
+; CHECK-NEXT: addhn2 v0.8h, v5.4s, v3.4s
+; CHECK-NEXT: addhn2 v1.8h, v6.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <16 x i16> %a to <16 x bfloat>
+ ret <16 x bfloat> %c
+}
+
+define <32 x bfloat> @stofp_v32i16_v32bf16(<32 x i16> %a) {
+; CHECK-LABEL: stofp_v32i16_v32bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sshll v4.4s, v1.4h, #0
+; CHECK-NEXT: sshll v5.4s, v0.4h, #0
+; CHECK-NEXT: sshll v6.4s, v2.4h, #0
+; CHECK-NEXT: sshll v7.4s, v3.4h, #0
+; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
+; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0
+; CHECK-NEXT: movi v16.4s, #1
+; CHECK-NEXT: scvtf v5.4s, v5.4s
+; CHECK-NEXT: scvtf v4.4s, v4.4s
+; CHECK-NEXT: scvtf v6.4s, v6.4s
+; CHECK-NEXT: scvtf v7.4s, v7.4s
+; CHECK-NEXT: scvtf v17.4s, v0.4s
+; CHECK-NEXT: scvtf v18.4s, v1.4s
+; CHECK-NEXT: scvtf v19.4s, v2.4s
+; CHECK-NEXT: scvtf v20.4s, v3.4s
+; CHECK-NEXT: movi v21.4s, #127, msl #8
+; CHECK-NEXT: ushr v0.4s, v5.4s, #16
+; CHECK-NEXT: ushr v1.4s, v4.4s, #16
+; CHECK-NEXT: ushr v2.4s, v6.4s, #16
+; CHECK-NEXT: ushr v3.4s, v7.4s, #16
+; CHECK-NEXT: ushr v22.4s, v17.4s, #16
+; CHECK-NEXT: ushr v23.4s, v18.4s, #16
+; CHECK-NEXT: ushr v24.4s, v19.4s, #16
+; CHECK-NEXT: ushr v25.4s, v20.4s, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v16.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v16.16b
+; CHECK-NEXT: and v2.16b, v2.16b, v16.16b
+; CHECK-NEXT: and v3.16b, v3.16b, v16.16b
+; CHECK-NEXT: and v22.16b, v22.16b, v16.16b
+; CHECK-NEXT: and v23.16b, v23.16b, v16.16b
+; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
+; CHECK-NEXT: and v16.16b, v25.16b, v16.16b
+; CHECK-NEXT: add v0.4s, v0.4s, v21.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v21.4s
+; CHECK-NEXT: add v2.4s, v2.4s, v21.4s
+; CHECK-NEXT: add v3.4s, v3.4s, v21.4s
+; CHECK-NEXT: addhn v0.4h, v5.4s, v0.4s
+; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s
+; CHECK-NEXT: addhn v2.4h, v6.4s, v2.4s
+; CHECK-NEXT: addhn v3.4h, v7.4s, v3.4s
+; CHECK-NEXT: add v4.4s, v22.4s, v21.4s
+; CHECK-NEXT: add v5.4s, v23.4s, v21.4s
+; CHECK-NEXT: add v6.4s, v24.4s, v21.4s
+; CHECK-NEXT: add v7.4s, v16.4s, v21.4s
+; CHECK-NEXT: addhn2 v0.8h, v17.4s, v4.4s
+; CHECK-NEXT: addhn2 v1.8h, v18.4s, v5.4s
+; CHECK-NEXT: addhn2 v2.8h, v19.4s, v6.4s
+; CHECK-NEXT: addhn2 v3.8h, v20.4s, v7.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <32 x i16> %a to <32 x bfloat>
+ ret <32 x bfloat> %c
+}
+
+define <32 x bfloat> @utofp_v32i16_v32bf16(<32 x i16> %a) {
+; CHECK-LABEL: utofp_v32i16_v32bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushll v4.4s, v1.4h, #0
+; CHECK-NEXT: ushll v5.4s, v0.4h, #0
+; CHECK-NEXT: ushll v6.4s, v2.4h, #0
+; CHECK-NEXT: ushll v7.4s, v3.4h, #0
+; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
+; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0
+; CHECK-NEXT: movi v16.4s, #1
+; CHECK-NEXT: ucvtf v5.4s, v5.4s
+; CHECK-NEXT: ucvtf v4.4s, v4.4s
+; CHECK-NEXT: ucvtf v6.4s, v6.4s
+; CHECK-NEXT: ucvtf v7.4s, v7.4s
+; CHECK-NEXT: ucvtf v17.4s, v0.4s
+; CHECK-NEXT: ucvtf v18.4s, v1.4s
+; CHECK-NEXT: ucvtf v19.4s, v2.4s
+; CHECK-NEXT: ucvtf v20.4s, v3.4s
+; CHECK-NEXT: movi v21.4s, #127, msl #8
+; CHECK-NEXT: ushr v0.4s, v5.4s, #16
+; CHECK-NEXT: ushr v1.4s, v4.4s, #16
+; CHECK-NEXT: ushr v2.4s, v6.4s, #16
+; CHECK-NEXT: ushr v3.4s, v7.4s, #16
+; CHECK-NEXT: ushr v22.4s, v17.4s, #16
+; CHECK-NEXT: ushr v23.4s, v18.4s, #16
+; CHECK-NEXT: ushr v24.4s, v19.4s, #16
+; CHECK-NEXT: ushr v25.4s, v20.4s, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v16.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v16.16b
+; CHECK-NEXT: and v2.16b, v2.16b, v16.16b
+; CHECK-NEXT: and v3.16b, v3.16b, v16.16b
+; CHECK-NEXT: and v22.16b, v22.16b, v16.16b
+; CHECK-NEXT: and v23.16b, v23.16b, v16.16b
+; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
+; CHECK-NEXT: and v16.16b, v25.16b, v16.16b
+; CHECK-NEXT: add v0.4s, v0.4s, v21.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v21.4s
+; CHECK-NEXT: add v2.4s, v2.4s, v21.4s
+; CHECK-NEXT: add v3.4s, v3.4s, v21.4s
+; CHECK-NEXT: addhn v0.4h, v5.4s, v0.4s
+; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s
+; CHECK-NEXT: addhn v2.4h, v6.4s, v2.4s
+; CHECK-NEXT: addhn v3.4h, v7.4s, v3.4s
+; CHECK-NEXT: add v4.4s, v22.4s, v21.4s
+; CHECK-NEXT: add v5.4s, v23.4s, v21.4s
+; CHECK-NEXT: add v6.4s, v24.4s, v21.4s
+; CHECK-NEXT: add v7.4s, v16.4s, v21.4s
+; CHECK-NEXT: addhn2 v0.8h, v17.4s, v4.4s
+; CHECK-NEXT: addhn2 v1.8h, v18.4s, v5.4s
+; CHECK-NEXT: addhn2 v2.8h, v19.4s, v6.4s
+; CHECK-NEXT: addhn2 v3.8h, v20.4s, v7.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <32 x i16> %a to <32 x bfloat>
+ ret <32 x bfloat> %c
+}
+
+define <2 x bfloat> @stofp_v2i8_v2bf16(<2 x i8> %a) {
+; CHECK-LABEL: stofp_v2i8_v2bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: sxtb w10, w10
+; CHECK-NEXT: sxtb w9, w9
+; CHECK-NEXT: scvtf s1, w10
+; CHECK-NEXT: scvtf s0, w9
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w12, w10, #16, #1
+; CHECK-NEXT: ubfx w11, w9, #16, #1
+; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: add w8, w12, w8
+; CHECK-NEXT: add w9, w11, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: lsr w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i8> %a to <2 x bfloat>
+ ret <2 x bfloat> %c
+}
+
+define <2 x bfloat> @utofp_v2i8_v2bf16(<2 x i8> %a) {
+; CHECK-LABEL: utofp_v2i8_v2bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov w9, v0.s[1]
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: and w10, w10, #0xff
+; CHECK-NEXT: and w9, w9, #0xff
+; CHECK-NEXT: ucvtf s1, w10
+; CHECK-NEXT: ucvtf s0, w9
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: ubfx w12, w10, #16, #1
+; CHECK-NEXT: ubfx w11, w9, #16, #1
+; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: add w8, w12, w8
+; CHECK-NEXT: add w9, w11, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: lsr w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i8> %a to <2 x bfloat>
+ ret <2 x bfloat> %c
+}
+
+define <3 x bfloat> @stofp_v3i8_v3bf16(<3 x i8> %a) {
+; CHECK-LABEL: stofp_v3i8_v3bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: mov v0.h[1], w1
+; CHECK-NEXT: mov v0.h[2], w2
+; CHECK-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i8> %a to <3 x bfloat>
+ ret <3 x bfloat> %c
+}
+
+define <3 x bfloat> @utofp_v3i8_v3bf16(<3 x i8> %a) {
+; CHECK-LABEL: utofp_v3i8_v3bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: mov v0.h[1], w1
+; CHECK-NEXT: mov v0.h[2], w2
+; CHECK-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i8> %a to <3 x bfloat>
+ ret <3 x bfloat> %c
+}
+
+define <4 x bfloat> @stofp_v4i8_v4bf16(<4 x i8> %a) {
+; CHECK-LABEL: stofp_v4i8_v4bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <4 x i8> %a to <4 x bfloat>
+ ret <4 x bfloat> %c
+}
+
+define <4 x bfloat> @utofp_v4i8_v4bf16(<4 x i8> %a) {
+; CHECK-LABEL: utofp_v4i8_v4bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <4 x i8> %a to <4 x bfloat>
+ ret <4 x bfloat> %c
+}
+
+define <8 x bfloat> @stofp_v8i8_v8bf16(<8 x i8> %a) {
+; CHECK-LABEL: stofp_v8i8_v8bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v4.4s, #127, msl #8
+; CHECK-NEXT: sshll v2.4s, v0.4h, #0
+; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: scvtf v2.4s, v2.4s
+; CHECK-NEXT: scvtf v3.4s, v0.4s
+; CHECK-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <8 x i8> %a to <8 x bfloat>
+ ret <8 x bfloat> %c
+}
+
+define <8 x bfloat> @utofp_v8i8_v8bf16(<8 x i8> %a) {
+; CHECK-LABEL: utofp_v8i8_v8bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v4.4s, #127, msl #8
+; CHECK-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-NEXT: ucvtf v3.4s, v0.4s
+; CHECK-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <8 x i8> %a to <8 x bfloat>
+ ret <8 x bfloat> %c
+}
+
+define <16 x bfloat> @stofp_v16i8_v16bf16(<16 x i8> %a) {
+; CHECK-LABEL: stofp_v16i8_v16bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sshll2 v2.8h, v0.16b, #0
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v7.4s, #127, msl #8
+; CHECK-NEXT: sshll v3.4s, v2.4h, #0
+; CHECK-NEXT: sshll v4.4s, v0.4h, #0
+; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
+; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: scvtf v3.4s, v3.4s
+; CHECK-NEXT: scvtf v4.4s, v4.4s
+; CHECK-NEXT: scvtf v2.4s, v2.4s
+; CHECK-NEXT: scvtf v6.4s, v0.4s
+; CHECK-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-NEXT: ushr v0.4s, v4.4s, #16
+; CHECK-NEXT: ushr v16.4s, v2.4s, #16
+; CHECK-NEXT: ushr v17.4s, v6.4s, #16
+; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
+; CHECK-NEXT: and v17.16b, v17.16b, v1.16b
+; CHECK-NEXT: add v5.4s, v5.4s, v7.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v7.4s
+; CHECK-NEXT: addhn v1.4h, v3.4s, v5.4s
+; CHECK-NEXT: addhn v0.4h, v4.4s, v0.4s
+; CHECK-NEXT: add v3.4s, v16.4s, v7.4s
+; CHECK-NEXT: add v4.4s, v17.4s, v7.4s
+; CHECK-NEXT: addhn2 v1.8h, v2.4s, v3.4s
+; CHECK-NEXT: addhn2 v0.8h, v6.4s, v4.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <16 x i8> %a to <16 x bfloat>
+ ret <16 x bfloat> %c
+}
+
+define <16 x bfloat> @utofp_v16i8_v16bf16(<16 x i8> %a) {
+; CHECK-LABEL: utofp_v16i8_v16bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v7.4s, #127, msl #8
+; CHECK-NEXT: ushll v3.4s, v2.4h, #0
+; CHECK-NEXT: ushll v4.4s, v0.4h, #0
+; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
+; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-NEXT: ucvtf v4.4s, v4.4s
+; CHECK-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-NEXT: ucvtf v6.4s, v0.4s
+; CHECK-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-NEXT: ushr v0.4s, v4.4s, #16
+; CHECK-NEXT: ushr v16.4s, v2.4s, #16
+; CHECK-NEXT: ushr v17.4s, v6.4s, #16
+; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
+; CHECK-NEXT: and v17.16b, v17.16b, v1.16b
+; CHECK-NEXT: add v5.4s, v5.4s, v7.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v7.4s
+; CHECK-NEXT: addhn v1.4h, v3.4s, v5.4s
+; CHECK-NEXT: addhn v0.4h, v4.4s, v0.4s
+; CHECK-NEXT: add v3.4s, v16.4s, v7.4s
+; CHECK-NEXT: add v4.4s, v17.4s, v7.4s
+; CHECK-NEXT: addhn2 v1.8h, v2.4s, v3.4s
+; CHECK-NEXT: addhn2 v0.8h, v6.4s, v4.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <16 x i8> %a to <16 x bfloat>
+ ret <16 x bfloat> %c
+}
+
+define <32 x bfloat> @stofp_v32i8_v32bf16(<32 x i8> %a) {
+; CHECK-LABEL: stofp_v32i8_v32bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: movi v21.4s, #127, msl #8
+; CHECK-NEXT: sshll v5.4s, v3.4h, #0
+; CHECK-NEXT: sshll v6.4s, v0.4h, #0
+; CHECK-NEXT: sshll v7.4s, v4.4h, #0
+; CHECK-NEXT: sshll v16.4s, v1.4h, #0
+; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0
+; CHECK-NEXT: sshll2 v4.4s, v4.8h, #0
+; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-NEXT: scvtf v5.4s, v5.4s
+; CHECK-NEXT: scvtf v6.4s, v6.4s
+; CHECK-NEXT: scvtf v7.4s, v7.4s
+; CHECK-NEXT: scvtf v16.4s, v16.4s
+; CHECK-NEXT: scvtf v17.4s, v3.4s
+; CHECK-NEXT: scvtf v4.4s, v4.4s
+; CHECK-NEXT: scvtf v18.4s, v0.4s
+; CHECK-NEXT: scvtf v19.4s, v1.4s
+; CHECK-NEXT: ushr v0.4s, v5.4s, #16
+; CHECK-NEXT: ushr v3.4s, v6.4s, #16
+; CHECK-NEXT: ushr v1.4s, v7.4s, #16
+; CHECK-NEXT: ushr v20.4s, v16.4s, #16
+; CHECK-NEXT: ushr v23.4s, v17.4s, #16
+; CHECK-NEXT: ushr v24.4s, v4.4s, #16
+; CHECK-NEXT: ushr v22.4s, v18.4s, #16
+; CHECK-NEXT: ushr v25.4s, v19.4s, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v20.16b, v20.16b, v2.16b
+; CHECK-NEXT: and v23.16b, v23.16b, v2.16b
+; CHECK-NEXT: and v24.16b, v24.16b, v2.16b
+; CHECK-NEXT: and v22.16b, v22.16b, v2.16b
+; CHECK-NEXT: and v25.16b, v25.16b, v2.16b
+; CHECK-NEXT: add v0.4s, v0.4s, v21.4s
+; CHECK-NEXT: add v3.4s, v3.4s, v21.4s
+; CHECK-NEXT: add v26.4s, v1.4s, v21.4s
+; CHECK-NEXT: add v20.4s, v20.4s, v21.4s
+; CHECK-NEXT: addhn v1.4h, v5.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v6.4s, v3.4s
+; CHECK-NEXT: addhn v3.4h, v7.4s, v26.4s
+; CHECK-NEXT: addhn v2.4h, v16.4s, v20.4s
+; CHECK-NEXT: add v5.4s, v22.4s, v21.4s
+; CHECK-NEXT: add v6.4s, v23.4s, v21.4s
+; CHECK-NEXT: add v7.4s, v24.4s, v21.4s
+; CHECK-NEXT: add v16.4s, v25.4s, v21.4s
+; CHECK-NEXT: addhn2 v0.8h, v18.4s, v5.4s
+; CHECK-NEXT: addhn2 v1.8h, v17.4s, v6.4s
+; CHECK-NEXT: addhn2 v3.8h, v4.4s, v7.4s
+; CHECK-NEXT: addhn2 v2.8h, v19.4s, v16.4s
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <32 x i8> %a to <32 x bfloat>
+ ret <32 x bfloat> %c
+}
+
+define <32 x bfloat> @utofp_v32i8_v32bf16(<32 x i8> %a) {
+; CHECK-LABEL: utofp_v32i8_v32bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: movi v21.4s, #127, msl #8
+; CHECK-NEXT: ushll v5.4s, v3.4h, #0
+; CHECK-NEXT: ushll v6.4s, v0.4h, #0
+; CHECK-NEXT: ushll v7.4s, v4.4h, #0
+; CHECK-NEXT: ushll v16.4s, v1.4h, #0
+; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0
+; CHECK-NEXT: ushll2 v4.4s, v4.8h, #0
+; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-NEXT: ucvtf v5.4s, v5.4s
+; CHECK-NEXT: ucvtf v6.4s, v6.4s
+; CHECK-NEXT: ucvtf v7.4s, v7.4s
+; CHECK-NEXT: ucvtf v16.4s, v16.4s
+; CHECK-NEXT: ucvtf v17.4s, v3.4s
+; CHECK-NEXT: ucvtf v4.4s, v4.4s
+; CHECK-NEXT: ucvtf v18.4s, v0.4s
+; CHECK-NEXT: ucvtf v19.4s, v1.4s
+; CHECK-NEXT: ushr v0.4s, v5.4s, #16
+; CHECK-NEXT: ushr v3.4s, v6.4s, #16
+; CHECK-NEXT: ushr v1.4s, v7.4s, #16
+; CHECK-NEXT: ushr v20.4s, v16.4s, #16
+; CHECK-NEXT: ushr v23.4s, v17.4s, #16
+; CHECK-NEXT: ushr v24.4s, v4.4s, #16
+; CHECK-NEXT: ushr v22.4s, v18.4s, #16
+; CHECK-NEXT: ushr v25.4s, v19.4s, #16
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v20.16b, v20.16b, v2.16b
+; CHECK-NEXT: and v23.16b, v23.16b, v2.16b
+; CHECK-NEXT: and v24.16b, v24.16b, v2.16b
+; CHECK-NEXT: and v22.16b, v22.16b, v2.16b
+; CHECK-NEXT: and v25.16b, v25.16b, v2.16b
+; CHECK-NEXT: add v0.4s, v0.4s, v21.4s
+; CHECK-NEXT: add v3.4s, v3.4s, v21.4s
+; CHECK-NEXT: add v26.4s, v1.4s, v21.4s
+; CHECK-NEXT: add v20.4s, v20.4s, v21.4s
+; CHECK-NEXT: addhn v1.4h, v5.4s, v0.4s
+; CHECK-NEXT: addhn v0.4h, v6.4s, v3.4s
+; CHECK-NEXT: addhn v3.4h, v7.4s, v26.4s
+; CHECK-NEXT: addhn v2.4h, v16.4s, v20.4s
+; CHECK-NEXT: add v5.4s, v22.4s, v21.4s
+; CHECK-NEXT: add v6.4s, v23.4s, v21.4s
+; CHECK-NEXT: add v7.4s, v24.4s, v21.4s
+; CHECK-NEXT: add v16.4s, v25.4s, v21.4s
+; CHECK-NEXT: addhn2 v0.8h, v18.4s, v5.4s
+; CHECK-NEXT: addhn2 v1.8h, v17.4s, v6.4s
+; CHECK-NEXT: addhn2 v3.8h, v4.4s, v7.4s
+; CHECK-NEXT: addhn2 v2.8h, v19.4s, v16.4s
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <32 x i8> %a to <32 x bfloat>
+ ret <32 x bfloat> %c
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-GI: {{.*}}
+; CHECK-GI-FP16: {{.*}}
+; CHECK-GI-NOFP16: {{.*}}
+; CHECK-SD: {{.*}}
+; CHECK-SD-FP16: {{.*}}
+; CHECK-SD-NOFP16: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index ae4ced2..ac26ccc 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4,6 +4,227 @@
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; CHECK-GI: warning: Instruction selection used fallback path for stofp_i128_f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i128_f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i64_f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i64_f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i32_f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i32_f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i16_f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i16_f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i8_f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i8_f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i128_f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i128_f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i128_f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i128_f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i128_f16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i128_f16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i128_v2f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i128_v2f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i128_v3f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i128_v3f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i64_v2f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i64_v2f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i64_v3f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i64_v3f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i128_v2f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i128_v2f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i128_v3f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i128_v3f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i32_v2f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i32_v2f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i32_v3f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i32_v3f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i16_v2f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i16_v2f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i16_v3f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i16_v3f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i8_v2f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i8_v2f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i8_v3f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i8_v3f128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i128_v2f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i128_v2f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i128_v3f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i128_v3f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i128_v2f16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i128_v2f16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i128_v3f16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i128_v3f16
+
+define fp128 @stofp_i128_f128(i128 %a) {
+; CHECK-LABEL: stofp_i128_f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __floattitf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i128 %a to fp128
+ ret fp128 %c
+}
+
+define fp128 @utofp_i128_f128(i128 %a) {
+; CHECK-LABEL: utofp_i128_f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __floatuntitf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i128 %a to fp128
+ ret fp128 %c
+}
+
+define fp128 @stofp_i64_f128(i64 %a) {
+; CHECK-LABEL: stofp_i64_f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __floatditf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i64 %a to fp128
+ ret fp128 %c
+}
+
+define fp128 @utofp_i64_f128(i64 %a) {
+; CHECK-LABEL: utofp_i64_f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __floatunditf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i64 %a to fp128
+ ret fp128 %c
+}
+
+define fp128 @stofp_i32_f128(i32 %a) {
+; CHECK-LABEL: stofp_i32_f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i32 %a to fp128
+ ret fp128 %c
+}
+
+define fp128 @utofp_i32_f128(i32 %a) {
+; CHECK-LABEL: utofp_i32_f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i32 %a to fp128
+ ret fp128 %c
+}
+
+define fp128 @stofp_i16_f128(i16 %a) {
+; CHECK-LABEL: stofp_i16_f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: sxth w0, w0
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i16 %a to fp128
+ ret fp128 %c
+}
+
+define fp128 @utofp_i16_f128(i16 %a) {
+; CHECK-LABEL: utofp_i16_f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: and w0, w0, #0xffff
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i16 %a to fp128
+ ret fp128 %c
+}
+
+define fp128 @stofp_i8_f128(i8 %a) {
+; CHECK-LABEL: stofp_i8_f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: sxtb w0, w0
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i8 %a to fp128
+ ret fp128 %c
+}
+
+define fp128 @utofp_i8_f128(i8 %a) {
+; CHECK-LABEL: utofp_i8_f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: and w0, w0, #0xff
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i8 %a to fp128
+ ret fp128 %c
+}
+
+define double @stofp_i128_f64(i128 %a) {
+; CHECK-LABEL: stofp_i128_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __floattidf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i128 %a to double
+ ret double %c
+}
+
+define double @utofp_i128_f64(i128 %a) {
+; CHECK-LABEL: utofp_i128_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __floatuntidf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i128 %a to double
+ ret double %c
+}
+
define double @stofp_i64_f64(i64 %a) {
; CHECK-LABEL: stofp_i64_f64:
; CHECK: // %bb.0: // %entry
@@ -88,6 +309,34 @@ entry:
ret double %c
}
+define float @stofp_i128_f32(i128 %a) {
+; CHECK-LABEL: stofp_i128_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __floattisf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp i128 %a to float
+ ret float %c
+}
+
+define float @utofp_i128_f32(i128 %a) {
+; CHECK-LABEL: utofp_i128_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __floatuntisf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp i128 %a to float
+ ret float %c
+}
+
define float @stofp_i64_f32(i64 %a) {
; CHECK-LABEL: stofp_i64_f32:
; CHECK: // %bb.0: // %entry
@@ -172,6 +421,92 @@ entry:
ret float %c
}
+define half @stofp_i128_f16(i128 %a) {
+; CHECK-SD-NOFP16-LABEL: stofp_i128_f16:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w30, -16
+; CHECK-SD-NOFP16-NEXT: bl __floattisf
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: stofp_i128_f16:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-FP16-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-FP16-NEXT: .cfi_offset w30, -16
+; CHECK-SD-FP16-NEXT: bl __floattihf
+; CHECK-SD-FP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: stofp_i128_f16:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NOFP16-NEXT: bl __floattisf
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: stofp_i128_f16:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-FP16-NEXT: .cfi_offset w30, -16
+; CHECK-GI-FP16-NEXT: bl __floattihf
+; CHECK-GI-FP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = sitofp i128 %a to half
+ ret half %c
+}
+
+define half @utofp_i128_f16(i128 %a) {
+; CHECK-SD-NOFP16-LABEL: utofp_i128_f16:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w30, -16
+; CHECK-SD-NOFP16-NEXT: bl __floatuntisf
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: utofp_i128_f16:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-FP16-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-FP16-NEXT: .cfi_offset w30, -16
+; CHECK-SD-FP16-NEXT: bl __floatuntihf
+; CHECK-SD-FP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: utofp_i128_f16:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NOFP16-NEXT: bl __floatuntisf
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: utofp_i128_f16:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-FP16-NEXT: .cfi_offset w30, -16
+; CHECK-GI-FP16-NEXT: bl __floatuntihf
+; CHECK-GI-FP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = uitofp i128 %a to half
+ ret half %c
+}
+
define half @stofp_i64_f16(i64 %a) {
; CHECK-SD-NOFP16-LABEL: stofp_i64_f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
@@ -404,6 +739,396 @@ entry:
ret half %c
}
+define <2 x fp128> @stofp_v2i128_v2f128(<2 x i128> %a) {
+; CHECK-LABEL: stofp_v2i128_v2f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x3
+; CHECK-NEXT: mov x20, x2
+; CHECK-NEXT: bl __floattitf
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floattitf
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i128> %a to <2 x fp128>
+ ret <2 x fp128> %c
+}
+
+define <2 x fp128> @utofp_v2i128_v2f128(<2 x i128> %a) {
+; CHECK-LABEL: utofp_v2i128_v2f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x3
+; CHECK-NEXT: mov x20, x2
+; CHECK-NEXT: bl __floatuntitf
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatuntitf
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i128> %a to <2 x fp128>
+ ret <2 x fp128> %c
+}
+
+define <3 x fp128> @stofp_v3i128_v3f128(<3 x i128> %a) {
+; CHECK-LABEL: stofp_v3i128_v3f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: mov x19, x5
+; CHECK-NEXT: mov x20, x4
+; CHECK-NEXT: mov x21, x3
+; CHECK-NEXT: mov x22, x2
+; CHECK-NEXT: bl __floattitf
+; CHECK-NEXT: mov x0, x22
+; CHECK-NEXT: mov x1, x21
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floattitf
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floattitf
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i128> %a to <3 x fp128>
+ ret <3 x fp128> %c
+}
+
+define <3 x fp128> @utofp_v3i128_v3f128(<3 x i128> %a) {
+; CHECK-LABEL: utofp_v3i128_v3f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: mov x19, x5
+; CHECK-NEXT: mov x20, x4
+; CHECK-NEXT: mov x21, x3
+; CHECK-NEXT: mov x22, x2
+; CHECK-NEXT: bl __floatuntitf
+; CHECK-NEXT: mov x0, x22
+; CHECK-NEXT: mov x1, x21
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatuntitf
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatuntitf
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i128> %a to <3 x fp128>
+ ret <3 x fp128> %c
+}
+
+define <2 x fp128> @stofp_v2i64_v2f128(<2 x i64> %a) {
+; CHECK-LABEL: stofp_v2i64_v2f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatditf
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: mov x0, v0.d[1]
+; CHECK-NEXT: bl __floatditf
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i64> %a to <2 x fp128>
+ ret <2 x fp128> %c
+}
+
+define <2 x fp128> @utofp_v2i64_v2f128(<2 x i64> %a) {
+; CHECK-LABEL: utofp_v2i64_v2f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatunditf
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: mov x0, v0.d[1]
+; CHECK-NEXT: bl __floatunditf
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i64> %a to <2 x fp128>
+ ret <2 x fp128> %c
+}
+
+define <3 x fp128> @stofp_v3i64_v3f128(<3 x i64> %a) {
+; CHECK-LABEL: stofp_v3i64_v3f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: str q2, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatditf
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: bl __floatditf
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: bl __floatditf
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i64> %a to <3 x fp128>
+ ret <3 x fp128> %c
+}
+
+define <3 x fp128> @utofp_v3i64_v3f128(<3 x i64> %a) {
+; CHECK-LABEL: utofp_v3i64_v3f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: str q2, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatunditf
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: bl __floatunditf
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: bl __floatunditf
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i64> %a to <3 x fp128>
+ ret <3 x fp128> %c
+}
+
+define <2 x double> @stofp_v2i128_v2f64(<2 x i128> %a) {
+; CHECK-LABEL: stofp_v2i128_v2f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: mov x0, x2
+; CHECK-NEXT: mov x1, x3
+; CHECK-NEXT: bl __floattidf
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floattidf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i128> %a to <2 x double>
+ ret <2 x double> %c
+}
+
+define <2 x double> @utofp_v2i128_v2f64(<2 x i128> %a) {
+; CHECK-LABEL: utofp_v2i128_v2f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: mov x0, x2
+; CHECK-NEXT: mov x1, x3
+; CHECK-NEXT: bl __floatuntidf
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatuntidf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i128> %a to <2 x double>
+ ret <2 x double> %c
+}
+
+define <3 x double> @stofp_v3i128_v3f64(<3 x i128> %a) {
+; CHECK-LABEL: stofp_v3i128_v3f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: .cfi_offset b8, -56
+; CHECK-NEXT: .cfi_offset b9, -64
+; CHECK-NEXT: mov x19, x5
+; CHECK-NEXT: mov x20, x4
+; CHECK-NEXT: mov x21, x3
+; CHECK-NEXT: mov x22, x2
+; CHECK-NEXT: bl __floattidf
+; CHECK-NEXT: mov x0, x22
+; CHECK-NEXT: mov x1, x21
+; CHECK-NEXT: fmov d8, d0
+; CHECK-NEXT: bl __floattidf
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: fmov d9, d0
+; CHECK-NEXT: bl __floattidf
+; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: fmov d2, d0
+; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: fmov d0, d8
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: fmov d1, d9
+; CHECK-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i128> %a to <3 x double>
+ ret <3 x double> %c
+}
+
+define <3 x double> @utofp_v3i128_v3f64(<3 x i128> %a) {
+; CHECK-LABEL: utofp_v3i128_v3f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: .cfi_offset b8, -56
+; CHECK-NEXT: .cfi_offset b9, -64
+; CHECK-NEXT: mov x19, x5
+; CHECK-NEXT: mov x20, x4
+; CHECK-NEXT: mov x21, x3
+; CHECK-NEXT: mov x22, x2
+; CHECK-NEXT: bl __floatuntidf
+; CHECK-NEXT: mov x0, x22
+; CHECK-NEXT: mov x1, x21
+; CHECK-NEXT: fmov d8, d0
+; CHECK-NEXT: bl __floatuntidf
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: fmov d9, d0
+; CHECK-NEXT: bl __floatuntidf
+; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: fmov d2, d0
+; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: fmov d0, d8
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: fmov d1, d9
+; CHECK-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i128> %a to <3 x double>
+ ret <3 x double> %c
+}
+
define <2 x double> @stofp_v2i64_v2f64(<2 x i64> %a) {
; CHECK-LABEL: stofp_v2i64_v2f64:
; CHECK: // %bb.0: // %entry
@@ -712,6 +1437,114 @@ entry:
ret <32 x double> %c
}
+define <2 x fp128> @stofp_v2i32_v2f128(<2 x i32> %a) {
+; CHECK-LABEL: stofp_v2i32_v2f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov w0, v1.s[1]
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i32> %a to <2 x fp128>
+ ret <2 x fp128> %c
+}
+
+define <2 x fp128> @utofp_v2i32_v2f128(<2 x i32> %a) {
+; CHECK-LABEL: utofp_v2i32_v2f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov w0, v1.s[1]
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i32> %a to <2 x fp128>
+ ret <2 x fp128> %c
+}
+
+define <3 x fp128> @stofp_v3i32_v3f128(<3 x i32> %a) {
+; CHECK-LABEL: stofp_v3i32_v3f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: mov w0, v0.s[1]
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldp q0, q2, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i32> %a to <3 x fp128>
+ ret <3 x fp128> %c
+}
+
+define <3 x fp128> @utofp_v3i32_v3f128(<3 x i32> %a) {
+; CHECK-LABEL: utofp_v3i32_v3f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: mov w0, v0.s[1]
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldp q0, q2, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i32> %a to <3 x fp128>
+ ret <3 x fp128> %c
+}
+
define <2 x double> @stofp_v2i32_v2f64(<2 x i32> %a) {
; CHECK-LABEL: stofp_v2i32_v2f64:
; CHECK: // %bb.0: // %entry
@@ -1166,6 +1999,118 @@ entry:
ret <32 x double> %c
}
+define <2 x fp128> @stofp_v2i16_v2f128(<2 x i16> %a) {
+; CHECK-LABEL: stofp_v2i16_v2f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: sxth w0, w8
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: sxth w0, w8
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i16> %a to <2 x fp128>
+ ret <2 x fp128> %c
+}
+
+define <2 x fp128> @utofp_v2i16_v2f128(<2 x i16> %a) {
+; CHECK-LABEL: utofp_v2i16_v2f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i16> %a to <2 x fp128>
+ ret <2 x fp128> %c
+}
+
+define <3 x fp128> @stofp_v3i16_v3f128(<3 x i16> %a) {
+; CHECK-LABEL: stofp_v3i16_v3f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: smov w0, v0.h[0]
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smov w0, v1.h[1]
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: smov w0, v1.h[2]
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i16> %a to <3 x fp128>
+ ret <3 x fp128> %c
+}
+
+define <3 x fp128> @utofp_v3i16_v3f128(<3 x i16> %a) {
+; CHECK-LABEL: utofp_v3i16_v3f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: umov w0, v0.h[0]
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: umov w0, v1.h[1]
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: umov w0, v1.h[2]
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i16> %a to <3 x fp128>
+ ret <3 x fp128> %c
+}
+
define <2 x double> @stofp_v2i16_v2f64(<2 x i16> %a) {
; CHECK-SD-LABEL: stofp_v2i16_v2f64:
; CHECK-SD: // %bb.0: // %entry
@@ -1704,6 +2649,122 @@ entry:
ret <32 x double> %c
}
+define <2 x fp128> @stofp_v2i8_v2f128(<2 x i8> %a) {
+; CHECK-LABEL: stofp_v2i8_v2f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: sxtb w0, w8
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: sxtb w0, w8
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i8> %a to <2 x fp128>
+ ret <2 x fp128> %c
+}
+
+define <2 x fp128> @utofp_v2i8_v2f128(<2 x i8> %a) {
+; CHECK-LABEL: utofp_v2i8_v2f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i8> %a to <2 x fp128>
+ ret <2 x fp128> %c
+}
+
+define <3 x fp128> @stofp_v3i8_v3f128(<3 x i8> %a) {
+; CHECK-LABEL: stofp_v3i8_v3f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: sxtb w0, w0
+; CHECK-NEXT: mov w19, w2
+; CHECK-NEXT: mov w20, w1
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: sxtb w0, w20
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: sxtb w0, w19
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatsitf
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i8> %a to <3 x fp128>
+ ret <3 x fp128> %c
+}
+
+define <3 x fp128> @utofp_v3i8_v3f128(<3 x i8> %a) {
+; CHECK-LABEL: utofp_v3i8_v3f128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: and w0, w0, #0xff
+; CHECK-NEXT: mov w19, w2
+; CHECK-NEXT: mov w20, w1
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: and w0, w20, #0xff
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: and w0, w19, #0xff
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatunsitf
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i8> %a to <3 x fp128>
+ ret <3 x fp128> %c
+}
+
define <2 x double> @stofp_v2i8_v2f64(<2 x i8> %a) {
; CHECK-SD-LABEL: stofp_v2i8_v2f64:
; CHECK-SD: // %bb.0: // %entry
@@ -2612,6 +3673,164 @@ entry:
ret <32 x double> %c
}
+define <2 x float> @stofp_v2i128_v2f32(<2 x i128> %a) {
+; CHECK-LABEL: stofp_v2i128_v2f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: mov x0, x2
+; CHECK-NEXT: mov x1, x3
+; CHECK-NEXT: bl __floattisf
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floattisf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <2 x i128> %a to <2 x float>
+ ret <2 x float> %c
+}
+
+define <2 x float> @utofp_v2i128_v2f32(<2 x i128> %a) {
+; CHECK-LABEL: utofp_v2i128_v2f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: mov x0, x2
+; CHECK-NEXT: mov x1, x3
+; CHECK-NEXT: bl __floatuntisf
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatuntisf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <2 x i128> %a to <2 x float>
+ ret <2 x float> %c
+}
+
+define <3 x float> @stofp_v3i128_v3f32(<3 x i128> %a) {
+; CHECK-LABEL: stofp_v3i128_v3f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: mov x21, x1
+; CHECK-NEXT: mov x22, x0
+; CHECK-NEXT: mov x0, x2
+; CHECK-NEXT: mov x1, x3
+; CHECK-NEXT: mov x19, x5
+; CHECK-NEXT: mov x20, x4
+; CHECK-NEXT: bl __floattisf
+; CHECK-NEXT: mov x0, x22
+; CHECK-NEXT: mov x1, x21
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floattisf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floattisf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mov v1.s[2], v0.s[0]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ret
+entry:
+ %c = sitofp <3 x i128> %a to <3 x float>
+ ret <3 x float> %c
+}
+
+define <3 x float> @utofp_v3i128_v3f32(<3 x i128> %a) {
+; CHECK-LABEL: utofp_v3i128_v3f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: mov x21, x1
+; CHECK-NEXT: mov x22, x0
+; CHECK-NEXT: mov x0, x2
+; CHECK-NEXT: mov x1, x3
+; CHECK-NEXT: mov x19, x5
+; CHECK-NEXT: mov x20, x4
+; CHECK-NEXT: bl __floatuntisf
+; CHECK-NEXT: mov x0, x22
+; CHECK-NEXT: mov x1, x21
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatuntisf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: bl __floatuntisf
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mov v1.s[2], v0.s[0]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ret
+entry:
+ %c = uitofp <3 x i128> %a to <3 x float>
+ ret <3 x float> %c
+}
+
define <2 x float> @stofp_v2i64_v2f32(<2 x i64> %a) {
; CHECK-LABEL: stofp_v2i64_v2f32:
; CHECK: // %bb.0: // %entry
@@ -3818,6 +5037,578 @@ entry:
ret <32 x float> %c
}
+define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) {
+; CHECK-SD-NOFP16-LABEL: stofp_v2i128_v2f16:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: sub sp, sp, #48
+; CHECK-SD-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: .cfi_def_cfa_offset 48
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w20, -16
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w30, -32
+; CHECK-SD-NOFP16-NEXT: mov x19, x1
+; CHECK-SD-NOFP16-NEXT: mov x20, x0
+; CHECK-SD-NOFP16-NEXT: mov x0, x2
+; CHECK-SD-NOFP16-NEXT: mov x1, x3
+; CHECK-SD-NOFP16-NEXT: bl __floattisf
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: mov x0, x20
+; CHECK-SD-NOFP16-NEXT: mov x1, x19
+; CHECK-SD-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: bl __floattisf
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT: add sp, sp, #48
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: stofp_v2i128_v2f16:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: sub sp, sp, #48
+; CHECK-SD-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-FP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: .cfi_def_cfa_offset 48
+; CHECK-SD-FP16-NEXT: .cfi_offset w19, -8
+; CHECK-SD-FP16-NEXT: .cfi_offset w20, -16
+; CHECK-SD-FP16-NEXT: .cfi_offset w30, -32
+; CHECK-SD-FP16-NEXT: mov x19, x1
+; CHECK-SD-FP16-NEXT: mov x20, x0
+; CHECK-SD-FP16-NEXT: mov x0, x2
+; CHECK-SD-FP16-NEXT: mov x1, x3
+; CHECK-SD-FP16-NEXT: bl __floattihf
+; CHECK-SD-FP16-NEXT: mov x0, x20
+; CHECK-SD-FP16-NEXT: mov x1, x19
+; CHECK-SD-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-SD-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: bl __floattihf
+; CHECK-SD-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-SD-FP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-FP16-NEXT: add sp, sp, #48
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: stofp_v2i128_v2f16:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: sub sp, sp, #48
+; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 48
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -32
+; CHECK-GI-NOFP16-NEXT: mov x19, x1
+; CHECK-GI-NOFP16-NEXT: mov x20, x0
+; CHECK-GI-NOFP16-NEXT: mov x0, x2
+; CHECK-GI-NOFP16-NEXT: mov x1, x3
+; CHECK-GI-NOFP16-NEXT: bl __floattisf
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: mov x0, x20
+; CHECK-GI-NOFP16-NEXT: mov x1, x19
+; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: bl __floattisf
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NOFP16-NEXT: add sp, sp, #48
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: stofp_v2i128_v2f16:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: sub sp, sp, #48
+; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 48
+; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8
+; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16
+; CHECK-GI-FP16-NEXT: .cfi_offset w30, -32
+; CHECK-GI-FP16-NEXT: mov x19, x1
+; CHECK-GI-FP16-NEXT: mov x20, x0
+; CHECK-GI-FP16-NEXT: mov x0, x2
+; CHECK-GI-FP16-NEXT: mov x1, x3
+; CHECK-GI-FP16-NEXT: bl __floattihf
+; CHECK-GI-FP16-NEXT: mov x0, x20
+; CHECK-GI-FP16-NEXT: mov x1, x19
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: bl __floattihf
+; CHECK-GI-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-FP16-NEXT: add sp, sp, #48
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = sitofp <2 x i128> %a to <2 x half>
+ ret <2 x half> %c
+}
+
+define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) {
+; CHECK-SD-NOFP16-LABEL: utofp_v2i128_v2f16:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: sub sp, sp, #48
+; CHECK-SD-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: .cfi_def_cfa_offset 48
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w20, -16
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w30, -32
+; CHECK-SD-NOFP16-NEXT: mov x19, x1
+; CHECK-SD-NOFP16-NEXT: mov x20, x0
+; CHECK-SD-NOFP16-NEXT: mov x0, x2
+; CHECK-SD-NOFP16-NEXT: mov x1, x3
+; CHECK-SD-NOFP16-NEXT: bl __floatuntisf
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: mov x0, x20
+; CHECK-SD-NOFP16-NEXT: mov x1, x19
+; CHECK-SD-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: bl __floatuntisf
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT: add sp, sp, #48
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: utofp_v2i128_v2f16:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: sub sp, sp, #48
+; CHECK-SD-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-FP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: .cfi_def_cfa_offset 48
+; CHECK-SD-FP16-NEXT: .cfi_offset w19, -8
+; CHECK-SD-FP16-NEXT: .cfi_offset w20, -16
+; CHECK-SD-FP16-NEXT: .cfi_offset w30, -32
+; CHECK-SD-FP16-NEXT: mov x19, x1
+; CHECK-SD-FP16-NEXT: mov x20, x0
+; CHECK-SD-FP16-NEXT: mov x0, x2
+; CHECK-SD-FP16-NEXT: mov x1, x3
+; CHECK-SD-FP16-NEXT: bl __floatuntihf
+; CHECK-SD-FP16-NEXT: mov x0, x20
+; CHECK-SD-FP16-NEXT: mov x1, x19
+; CHECK-SD-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-SD-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: bl __floatuntihf
+; CHECK-SD-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-SD-FP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-FP16-NEXT: add sp, sp, #48
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: utofp_v2i128_v2f16:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: sub sp, sp, #48
+; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 48
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -32
+; CHECK-GI-NOFP16-NEXT: mov x19, x1
+; CHECK-GI-NOFP16-NEXT: mov x20, x0
+; CHECK-GI-NOFP16-NEXT: mov x0, x2
+; CHECK-GI-NOFP16-NEXT: mov x1, x3
+; CHECK-GI-NOFP16-NEXT: bl __floatuntisf
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: mov x0, x20
+; CHECK-GI-NOFP16-NEXT: mov x1, x19
+; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: bl __floatuntisf
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NOFP16-NEXT: add sp, sp, #48
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: utofp_v2i128_v2f16:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: sub sp, sp, #48
+; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 48
+; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8
+; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16
+; CHECK-GI-FP16-NEXT: .cfi_offset w30, -32
+; CHECK-GI-FP16-NEXT: mov x19, x1
+; CHECK-GI-FP16-NEXT: mov x20, x0
+; CHECK-GI-FP16-NEXT: mov x0, x2
+; CHECK-GI-FP16-NEXT: mov x1, x3
+; CHECK-GI-FP16-NEXT: bl __floatuntihf
+; CHECK-GI-FP16-NEXT: mov x0, x20
+; CHECK-GI-FP16-NEXT: mov x1, x19
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: bl __floatuntihf
+; CHECK-GI-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-FP16-NEXT: add sp, sp, #48
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = uitofp <2 x i128> %a to <2 x half>
+ ret <2 x half> %c
+}
+
+define <3 x half> @stofp_v3i128_v3f16(<3 x i128> %a) {
+; CHECK-SD-NOFP16-LABEL: stofp_v3i128_v3f16:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: sub sp, sp, #64
+; CHECK-SD-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w20, -16
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w21, -24
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w22, -32
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w30, -48
+; CHECK-SD-NOFP16-NEXT: mov x21, x1
+; CHECK-SD-NOFP16-NEXT: mov x22, x0
+; CHECK-SD-NOFP16-NEXT: mov x0, x2
+; CHECK-SD-NOFP16-NEXT: mov x1, x3
+; CHECK-SD-NOFP16-NEXT: mov x19, x5
+; CHECK-SD-NOFP16-NEXT: mov x20, x4
+; CHECK-SD-NOFP16-NEXT: bl __floattisf
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: mov x0, x22
+; CHECK-SD-NOFP16-NEXT: mov x1, x21
+; CHECK-SD-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: bl __floattisf
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: mov x0, x20
+; CHECK-SD-NOFP16-NEXT: mov x1, x19
+; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-SD-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: bl __floattisf
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s0
+; CHECK-SD-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT: add sp, sp, #64
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: stofp_v3i128_v3f16:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: sub sp, sp, #64
+; CHECK-SD-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-FP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-FP16-NEXT: .cfi_offset w19, -8
+; CHECK-SD-FP16-NEXT: .cfi_offset w20, -16
+; CHECK-SD-FP16-NEXT: .cfi_offset w21, -24
+; CHECK-SD-FP16-NEXT: .cfi_offset w22, -32
+; CHECK-SD-FP16-NEXT: .cfi_offset w30, -48
+; CHECK-SD-FP16-NEXT: mov x21, x1
+; CHECK-SD-FP16-NEXT: mov x22, x0
+; CHECK-SD-FP16-NEXT: mov x0, x2
+; CHECK-SD-FP16-NEXT: mov x1, x3
+; CHECK-SD-FP16-NEXT: mov x19, x5
+; CHECK-SD-FP16-NEXT: mov x20, x4
+; CHECK-SD-FP16-NEXT: bl __floattihf
+; CHECK-SD-FP16-NEXT: mov x0, x22
+; CHECK-SD-FP16-NEXT: mov x1, x21
+; CHECK-SD-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-SD-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: bl __floattihf
+; CHECK-SD-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-SD-FP16-NEXT: mov x0, x20
+; CHECK-SD-FP16-NEXT: mov x1, x19
+; CHECK-SD-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-SD-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: bl __floattihf
+; CHECK-SD-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-SD-FP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-FP16-NEXT: mov v1.h[2], v0.h[0]
+; CHECK-SD-FP16-NEXT: fmov d0, d1
+; CHECK-SD-FP16-NEXT: add sp, sp, #64
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: stofp_v3i128_v3f16:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: sub sp, sp, #64
+; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w21, -24
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w22, -32
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -48
+; CHECK-GI-NOFP16-NEXT: mov x21, x1
+; CHECK-GI-NOFP16-NEXT: mov x22, x0
+; CHECK-GI-NOFP16-NEXT: mov x0, x2
+; CHECK-GI-NOFP16-NEXT: mov x1, x3
+; CHECK-GI-NOFP16-NEXT: mov x19, x5
+; CHECK-GI-NOFP16-NEXT: mov x20, x4
+; CHECK-GI-NOFP16-NEXT: bl __floattisf
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: mov x0, x22
+; CHECK-GI-NOFP16-NEXT: mov x1, x21
+; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: bl __floattisf
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: mov x0, x20
+; CHECK-GI-NOFP16-NEXT: mov x1, x19
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: bl __floattisf
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s0
+; CHECK-GI-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NOFP16-NEXT: add sp, sp, #64
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: stofp_v3i128_v3f16:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: sub sp, sp, #64
+; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8
+; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16
+; CHECK-GI-FP16-NEXT: .cfi_offset w21, -24
+; CHECK-GI-FP16-NEXT: .cfi_offset w22, -32
+; CHECK-GI-FP16-NEXT: .cfi_offset w30, -48
+; CHECK-GI-FP16-NEXT: mov x21, x1
+; CHECK-GI-FP16-NEXT: mov x22, x0
+; CHECK-GI-FP16-NEXT: mov x0, x2
+; CHECK-GI-FP16-NEXT: mov x1, x3
+; CHECK-GI-FP16-NEXT: mov x19, x5
+; CHECK-GI-FP16-NEXT: mov x20, x4
+; CHECK-GI-FP16-NEXT: bl __floattihf
+; CHECK-GI-FP16-NEXT: mov x0, x22
+; CHECK-GI-FP16-NEXT: mov x1, x21
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: bl __floattihf
+; CHECK-GI-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-FP16-NEXT: mov x0, x20
+; CHECK-GI-FP16-NEXT: mov x1, x19
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: bl __floattihf
+; CHECK-GI-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[0]
+; CHECK-GI-FP16-NEXT: fmov d0, d1
+; CHECK-GI-FP16-NEXT: add sp, sp, #64
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = sitofp <3 x i128> %a to <3 x half>
+ ret <3 x half> %c
+}
+
+define <3 x half> @utofp_v3i128_v3f16(<3 x i128> %a) {
+; CHECK-SD-NOFP16-LABEL: utofp_v3i128_v3f16:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: sub sp, sp, #64
+; CHECK-SD-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w20, -16
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w21, -24
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w22, -32
+; CHECK-SD-NOFP16-NEXT: .cfi_offset w30, -48
+; CHECK-SD-NOFP16-NEXT: mov x21, x1
+; CHECK-SD-NOFP16-NEXT: mov x22, x0
+; CHECK-SD-NOFP16-NEXT: mov x0, x2
+; CHECK-SD-NOFP16-NEXT: mov x1, x3
+; CHECK-SD-NOFP16-NEXT: mov x19, x5
+; CHECK-SD-NOFP16-NEXT: mov x20, x4
+; CHECK-SD-NOFP16-NEXT: bl __floatuntisf
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: mov x0, x22
+; CHECK-SD-NOFP16-NEXT: mov x1, x21
+; CHECK-SD-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: bl __floatuntisf
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: mov x0, x20
+; CHECK-SD-NOFP16-NEXT: mov x1, x19
+; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-SD-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NOFP16-NEXT: bl __floatuntisf
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s0
+; CHECK-SD-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT: add sp, sp, #64
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: utofp_v3i128_v3f16:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: sub sp, sp, #64
+; CHECK-SD-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-FP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-FP16-NEXT: .cfi_offset w19, -8
+; CHECK-SD-FP16-NEXT: .cfi_offset w20, -16
+; CHECK-SD-FP16-NEXT: .cfi_offset w21, -24
+; CHECK-SD-FP16-NEXT: .cfi_offset w22, -32
+; CHECK-SD-FP16-NEXT: .cfi_offset w30, -48
+; CHECK-SD-FP16-NEXT: mov x21, x1
+; CHECK-SD-FP16-NEXT: mov x22, x0
+; CHECK-SD-FP16-NEXT: mov x0, x2
+; CHECK-SD-FP16-NEXT: mov x1, x3
+; CHECK-SD-FP16-NEXT: mov x19, x5
+; CHECK-SD-FP16-NEXT: mov x20, x4
+; CHECK-SD-FP16-NEXT: bl __floatuntihf
+; CHECK-SD-FP16-NEXT: mov x0, x22
+; CHECK-SD-FP16-NEXT: mov x1, x21
+; CHECK-SD-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-SD-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: bl __floatuntihf
+; CHECK-SD-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-SD-FP16-NEXT: mov x0, x20
+; CHECK-SD-FP16-NEXT: mov x1, x19
+; CHECK-SD-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-SD-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-FP16-NEXT: bl __floatuntihf
+; CHECK-SD-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-SD-FP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-FP16-NEXT: mov v1.h[2], v0.h[0]
+; CHECK-SD-FP16-NEXT: fmov d0, d1
+; CHECK-SD-FP16-NEXT: add sp, sp, #64
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: utofp_v3i128_v3f16:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: sub sp, sp, #64
+; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w21, -24
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w22, -32
+; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -48
+; CHECK-GI-NOFP16-NEXT: mov x21, x1
+; CHECK-GI-NOFP16-NEXT: mov x22, x0
+; CHECK-GI-NOFP16-NEXT: mov x0, x2
+; CHECK-GI-NOFP16-NEXT: mov x1, x3
+; CHECK-GI-NOFP16-NEXT: mov x19, x5
+; CHECK-GI-NOFP16-NEXT: mov x20, x4
+; CHECK-GI-NOFP16-NEXT: bl __floatuntisf
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: mov x0, x22
+; CHECK-GI-NOFP16-NEXT: mov x1, x21
+; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: bl __floatuntisf
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: mov x0, x20
+; CHECK-GI-NOFP16-NEXT: mov x1, x19
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT: bl __floatuntisf
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s0
+; CHECK-GI-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NOFP16-NEXT: add sp, sp, #64
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: utofp_v3i128_v3f16:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: sub sp, sp, #64
+; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8
+; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16
+; CHECK-GI-FP16-NEXT: .cfi_offset w21, -24
+; CHECK-GI-FP16-NEXT: .cfi_offset w22, -32
+; CHECK-GI-FP16-NEXT: .cfi_offset w30, -48
+; CHECK-GI-FP16-NEXT: mov x21, x1
+; CHECK-GI-FP16-NEXT: mov x22, x0
+; CHECK-GI-FP16-NEXT: mov x0, x2
+; CHECK-GI-FP16-NEXT: mov x1, x3
+; CHECK-GI-FP16-NEXT: mov x19, x5
+; CHECK-GI-FP16-NEXT: mov x20, x4
+; CHECK-GI-FP16-NEXT: bl __floatuntihf
+; CHECK-GI-FP16-NEXT: mov x0, x22
+; CHECK-GI-FP16-NEXT: mov x1, x21
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: bl __floatuntihf
+; CHECK-GI-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-FP16-NEXT: mov x0, x20
+; CHECK-GI-FP16-NEXT: mov x1, x19
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT: bl __floatuntihf
+; CHECK-GI-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[0]
+; CHECK-GI-FP16-NEXT: fmov d0, d1
+; CHECK-GI-FP16-NEXT: add sp, sp, #64
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = uitofp <3 x i128> %a to <3 x half>
+ ret <3 x half> %c
+}
+
define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-SD-NOFP16-LABEL: stofp_v2i64_v2f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
@@ -6262,1823 +8053,3 @@ entry:
%c = uitofp <32 x i8> %a to <32 x half>
ret <32 x half> %c
}
-
-define bfloat @stofp_i64_bf16(i64 %a) {
-; CHECK-LABEL: stofp_i64_bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: and x11, x0, #0x8000000000000000
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: cneg x9, x0, mi
-; CHECK-NEXT: lsr x10, x9, #53
-; CHECK-NEXT: cmp x10, #0
-; CHECK-NEXT: and x10, x9, #0xfffffffffffff000
-; CHECK-NEXT: csel x10, x10, x9, ne
-; CHECK-NEXT: scvtf d0, x10
-; CHECK-NEXT: cset w10, ne
-; CHECK-NEXT: tst x9, #0xfff
-; CHECK-NEXT: csel w10, wzr, w10, eq
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: orr x9, x9, x11
-; CHECK-NEXT: orr x9, x9, x10
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: fcvtxn s0, d0
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w10, w9, #16, #1
-; CHECK-NEXT: add w8, w9, w8
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
-; CHECK-NEXT: ret
-entry:
- %c = sitofp i64 %a to bfloat
- ret bfloat %c
-}
-
-define bfloat @utofp_i64_bf16(i64 %a) {
-; CHECK-LABEL: utofp_i64_bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: lsr x9, x0, #53
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: cmp x9, #0
-; CHECK-NEXT: and x9, x0, #0xfffffffffffff000
-; CHECK-NEXT: csel x9, x9, x0, ne
-; CHECK-NEXT: ucvtf d0, x9
-; CHECK-NEXT: cset w9, ne
-; CHECK-NEXT: tst x0, #0xfff
-; CHECK-NEXT: csel w9, wzr, w9, eq
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: orr x9, x10, x9
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: fcvtxn s0, d0
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w10, w9, #16, #1
-; CHECK-NEXT: add w8, w9, w8
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
-; CHECK-NEXT: ret
-entry:
- %c = uitofp i64 %a to bfloat
- ret bfloat %c
-}
-
-define bfloat @stofp_i32_bf16(i32 %a) {
-; CHECK-LABEL: stofp_i32_bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf d0, w0
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: fcvtxn s0, d0
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w10, w9, #16, #1
-; CHECK-NEXT: add w8, w9, w8
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
-; CHECK-NEXT: ret
-entry:
- %c = sitofp i32 %a to bfloat
- ret bfloat %c
-}
-
-define bfloat @utofp_i32_bf16(i32 %a) {
-; CHECK-LABEL: utofp_i32_bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf d0, w0
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: fcvtxn s0, d0
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w10, w9, #16, #1
-; CHECK-NEXT: add w8, w9, w8
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
-; CHECK-NEXT: ret
-entry:
- %c = uitofp i32 %a to bfloat
- ret bfloat %c
-}
-
-define bfloat @stofp_i16_bf16(i16 %a) {
-; CHECK-LABEL: stofp_i16_bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sxth w9, w0
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: scvtf s0, w9
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w10, w9, #16, #1
-; CHECK-NEXT: add w8, w9, w8
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
-; CHECK-NEXT: ret
-entry:
- %c = sitofp i16 %a to bfloat
- ret bfloat %c
-}
-
-define bfloat @utofp_i16_bf16(i16 %a) {
-; CHECK-LABEL: utofp_i16_bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: and w9, w0, #0xffff
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: ucvtf s0, w9
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w10, w9, #16, #1
-; CHECK-NEXT: add w8, w9, w8
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
-; CHECK-NEXT: ret
-entry:
- %c = uitofp i16 %a to bfloat
- ret bfloat %c
-}
-
-define bfloat @stofp_i8_bf16(i8 %a) {
-; CHECK-LABEL: stofp_i8_bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sxtb w9, w0
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: scvtf s0, w9
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w10, w9, #16, #1
-; CHECK-NEXT: add w8, w9, w8
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
-; CHECK-NEXT: ret
-entry:
- %c = sitofp i8 %a to bfloat
- ret bfloat %c
-}
-
-define bfloat @utofp_i8_bf16(i8 %a) {
-; CHECK-LABEL: utofp_i8_bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: and w9, w0, #0xff
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: ucvtf s0, w9
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w10, w9, #16, #1
-; CHECK-NEXT: add w8, w9, w8
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
-; CHECK-NEXT: ret
-entry:
- %c = uitofp i8 %a to bfloat
- ret bfloat %c
-}
-
-define <2 x bfloat> @stofp_v2i64_v2bf16(<2 x i64> %a) {
-; CHECK-LABEL: stofp_v2i64_v2bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: cmp x9, #0
-; CHECK-NEXT: cneg x10, x9, mi
-; CHECK-NEXT: and x9, x9, #0x8000000000000000
-; CHECK-NEXT: lsr x11, x10, #53
-; CHECK-NEXT: and x12, x10, #0xfffffffffffff000
-; CHECK-NEXT: cmp x11, #0
-; CHECK-NEXT: csel x11, x12, x10, ne
-; CHECK-NEXT: cset w12, ne
-; CHECK-NEXT: tst x10, #0xfff
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: csel w12, wzr, w12, eq
-; CHECK-NEXT: scvtf d0, x11
-; CHECK-NEXT: cmp x10, #0
-; CHECK-NEXT: cneg x13, x10, mi
-; CHECK-NEXT: and x10, x10, #0x8000000000000000
-; CHECK-NEXT: lsr x14, x13, #53
-; CHECK-NEXT: cmp x14, #0
-; CHECK-NEXT: and x14, x13, #0xfffffffffffff000
-; CHECK-NEXT: csel x11, x14, x13, ne
-; CHECK-NEXT: cset w14, ne
-; CHECK-NEXT: tst x13, #0xfff
-; CHECK-NEXT: scvtf d1, x11
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: orr x9, x11, x9
-; CHECK-NEXT: csel w11, wzr, w14, eq
-; CHECK-NEXT: fmov x13, d1
-; CHECK-NEXT: orr x9, x9, x12
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: orr x10, x13, x10
-; CHECK-NEXT: orr x10, x10, x11
-; CHECK-NEXT: fcvtxn s0, d0
-; CHECK-NEXT: fmov d1, x10
-; CHECK-NEXT: fcvtxn s1, d1
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w11, w9, #16, #1
-; CHECK-NEXT: add w9, w9, w8
-; CHECK-NEXT: fmov w10, s1
-; CHECK-NEXT: add w9, w11, w9
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: ubfx w12, w10, #16, #1
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: add w8, w12, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <2 x i64> %a to <2 x bfloat>
- ret <2 x bfloat> %c
-}
-
-define <2 x bfloat> @utofp_v2i64_v2bf16(<2 x i64> %a) {
-; CHECK-LABEL: utofp_v2i64_v2bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: lsr x10, x9, #53
-; CHECK-NEXT: and x12, x9, #0xfffffffffffff000
-; CHECK-NEXT: cmp x10, #0
-; CHECK-NEXT: lsr x10, x11, #53
-; CHECK-NEXT: csel x12, x12, x9, ne
-; CHECK-NEXT: cset w13, ne
-; CHECK-NEXT: tst x9, #0xfff
-; CHECK-NEXT: csel w9, wzr, w13, eq
-; CHECK-NEXT: cmp x10, #0
-; CHECK-NEXT: and x10, x11, #0xfffffffffffff000
-; CHECK-NEXT: csel x10, x10, x11, ne
-; CHECK-NEXT: ucvtf d0, x12
-; CHECK-NEXT: ucvtf d1, x10
-; CHECK-NEXT: cset w10, ne
-; CHECK-NEXT: tst x11, #0xfff
-; CHECK-NEXT: csel w10, wzr, w10, eq
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: fmov x12, d1
-; CHECK-NEXT: orr x9, x11, x9
-; CHECK-NEXT: orr x10, x12, x10
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: fmov d1, x10
-; CHECK-NEXT: fcvtxn s0, d0
-; CHECK-NEXT: fcvtxn s1, d1
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: fmov w10, s1
-; CHECK-NEXT: ubfx w11, w9, #16, #1
-; CHECK-NEXT: add w9, w9, w8
-; CHECK-NEXT: ubfx w12, w10, #16, #1
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: add w9, w11, w9
-; CHECK-NEXT: add w8, w12, w8
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <2 x i64> %a to <2 x bfloat>
- ret <2 x bfloat> %c
-}
-
-define <3 x bfloat> @stofp_v3i64_v3bf16(<3 x i64> %a) {
-; CHECK-LABEL: stofp_v3i64_v3bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: scvtf v1.2d, v2.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
-; CHECK-NEXT: shrn v0.4h, v0.4s, #16
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <3 x i64> %a to <3 x bfloat>
- ret <3 x bfloat> %c
-}
-
-define <3 x bfloat> @utofp_v3i64_v3bf16(<3 x i64> %a) {
-; CHECK-LABEL: utofp_v3i64_v3bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ucvtf v1.2d, v2.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
-; CHECK-NEXT: shrn v0.4h, v0.4s, #16
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <3 x i64> %a to <3 x bfloat>
- ret <3 x bfloat> %c
-}
-
-define <4 x bfloat> @stofp_v4i64_v4bf16(<4 x i64> %a) {
-; CHECK-LABEL: stofp_v4i64_v4bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
-; CHECK-NEXT: shrn v0.4h, v0.4s, #16
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <4 x i64> %a to <4 x bfloat>
- ret <4 x bfloat> %c
-}
-
-define <4 x bfloat> @utofp_v4i64_v4bf16(<4 x i64> %a) {
-; CHECK-LABEL: utofp_v4i64_v4bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
-; CHECK-NEXT: shrn v0.4h, v0.4s, #16
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <4 x i64> %a to <4 x bfloat>
- ret <4 x bfloat> %c
-}
-
-define <8 x bfloat> @stofp_v8i64_v8bf16(<8 x i64> %a) {
-; CHECK-LABEL: stofp_v8i64_v8bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v2.2d, v2.2d
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v3.2d, v3.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v3.4s, #127, msl #8
-; CHECK-NEXT: ushr v4.4s, v2.4s, #16
-; CHECK-NEXT: ushr v5.4s, v0.4s, #16
-; CHECK-NEXT: add v6.4s, v2.4s, v3.4s
-; CHECK-NEXT: add v3.4s, v0.4s, v3.4s
-; CHECK-NEXT: and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s
-; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: add v4.4s, v4.4s, v6.4s
-; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b
-; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <8 x i64> %a to <8 x bfloat>
- ret <8 x bfloat> %c
-}
-
-define <8 x bfloat> @utofp_v8i64_v8bf16(<8 x i64> %a) {
-; CHECK-LABEL: utofp_v8i64_v8bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v3.4s, #127, msl #8
-; CHECK-NEXT: ushr v4.4s, v2.4s, #16
-; CHECK-NEXT: ushr v5.4s, v0.4s, #16
-; CHECK-NEXT: add v6.4s, v2.4s, v3.4s
-; CHECK-NEXT: add v3.4s, v0.4s, v3.4s
-; CHECK-NEXT: and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s
-; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: add v4.4s, v4.4s, v6.4s
-; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b
-; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <8 x i64> %a to <8 x bfloat>
- ret <8 x bfloat> %c
-}
-
-define <16 x bfloat> @stofp_v16i64_v16bf16(<16 x i64> %a) {
-; CHECK-LABEL: stofp_v16i64_v16bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v2.2d, v2.2d
-; CHECK-NEXT: scvtf v6.2d, v6.2d
-; CHECK-NEXT: scvtf v4.2d, v4.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: scvtf v3.2d, v3.2d
-; CHECK-NEXT: scvtf v7.2d, v7.2d
-; CHECK-NEXT: scvtf v5.2d, v5.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn v6.2s, v6.2d
-; CHECK-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn2 v6.4s, v7.2d
-; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v3.4s, #127, msl #8
-; CHECK-NEXT: ushr v7.4s, v0.4s, #16
-; CHECK-NEXT: ushr v5.4s, v2.4s, #16
-; CHECK-NEXT: ushr v16.4s, v6.4s, #16
-; CHECK-NEXT: ushr v17.4s, v4.4s, #16
-; CHECK-NEXT: add v19.4s, v0.4s, v3.4s
-; CHECK-NEXT: add v18.4s, v2.4s, v3.4s
-; CHECK-NEXT: add v20.4s, v6.4s, v3.4s
-; CHECK-NEXT: add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT: and v7.16b, v7.16b, v1.16b
-; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v17.16b, v1.16b
-; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s
-; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: add v7.4s, v7.4s, v19.4s
-; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s
-; CHECK-NEXT: add v5.4s, v5.4s, v18.4s
-; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s
-; CHECK-NEXT: add v16.4s, v16.4s, v20.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: orr v6.4s, #64, lsl #16
-; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b
-; CHECK-NEXT: mov v5.16b, v19.16b
-; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b
-; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: uzp2 v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <16 x i64> %a to <16 x bfloat>
- ret <16 x bfloat> %c
-}
-
-define <16 x bfloat> @utofp_v16i64_v16bf16(<16 x i64> %a) {
-; CHECK-LABEL: utofp_v16i64_v16bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-NEXT: ucvtf v7.2d, v7.2d
-; CHECK-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn v6.2s, v6.2d
-; CHECK-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn2 v6.4s, v7.2d
-; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v3.4s, #127, msl #8
-; CHECK-NEXT: ushr v7.4s, v0.4s, #16
-; CHECK-NEXT: ushr v5.4s, v2.4s, #16
-; CHECK-NEXT: ushr v16.4s, v6.4s, #16
-; CHECK-NEXT: ushr v17.4s, v4.4s, #16
-; CHECK-NEXT: add v19.4s, v0.4s, v3.4s
-; CHECK-NEXT: add v18.4s, v2.4s, v3.4s
-; CHECK-NEXT: add v20.4s, v6.4s, v3.4s
-; CHECK-NEXT: add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT: and v7.16b, v7.16b, v1.16b
-; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v17.16b, v1.16b
-; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s
-; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: add v7.4s, v7.4s, v19.4s
-; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s
-; CHECK-NEXT: add v5.4s, v5.4s, v18.4s
-; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s
-; CHECK-NEXT: add v16.4s, v16.4s, v20.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: orr v6.4s, #64, lsl #16
-; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b
-; CHECK-NEXT: mov v5.16b, v19.16b
-; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b
-; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: uzp2 v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <16 x i64> %a to <16 x bfloat>
- ret <16 x bfloat> %c
-}
-
-define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) {
-; CHECK-LABEL: stofp_v32i64_v32bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v17.2d, v2.2d
-; CHECK-NEXT: scvtf v18.2d, v0.2d
-; CHECK-NEXT: scvtf v19.2d, v3.2d
-; CHECK-NEXT: scvtf v3.2d, v6.2d
-; CHECK-NEXT: ldp q21, q20, [sp, #32]
-; CHECK-NEXT: scvtf v4.2d, v4.2d
-; CHECK-NEXT: scvtf v6.2d, v7.2d
-; CHECK-NEXT: scvtf v5.2d, v5.2d
-; CHECK-NEXT: ldp q24, q23, [sp, #64]
-; CHECK-NEXT: movi v16.4s, #1
-; CHECK-NEXT: fcvtn v0.2s, v17.2d
-; CHECK-NEXT: scvtf v17.2d, v1.2d
-; CHECK-NEXT: fcvtn v1.2s, v18.2d
-; CHECK-NEXT: fcvtn v3.2s, v3.2d
-; CHECK-NEXT: ldp q18, q7, [sp]
-; CHECK-NEXT: scvtf v21.2d, v21.2d
-; CHECK-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: scvtf v20.2d, v20.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v19.2d
-; CHECK-NEXT: ldp q22, q19, [sp, #96]
-; CHECK-NEXT: fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT: fcvtn2 v3.4s, v6.2d
-; CHECK-NEXT: scvtf v18.2d, v18.2d
-; CHECK-NEXT: scvtf v17.2d, v24.2d
-; CHECK-NEXT: fcvtn v6.2s, v21.2d
-; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: scvtf v22.2d, v22.2d
-; CHECK-NEXT: scvtf v21.2d, v23.2d
-; CHECK-NEXT: scvtf v7.2d, v7.2d
-; CHECK-NEXT: ushr v24.4s, v0.4s, #16
-; CHECK-NEXT: add v5.4s, v0.4s, v2.4s
-; CHECK-NEXT: scvtf v19.2d, v19.2d
-; CHECK-NEXT: ushr v23.4s, v1.4s, #16
-; CHECK-NEXT: ushr v25.4s, v3.4s, #16
-; CHECK-NEXT: fcvtn v18.2s, v18.2d
-; CHECK-NEXT: fcvtn2 v6.4s, v20.2d
-; CHECK-NEXT: add v26.4s, v1.4s, v2.4s
-; CHECK-NEXT: fcvtn v17.2s, v17.2d
-; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT: fcvtn v22.2s, v22.2d
-; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s
-; CHECK-NEXT: and v23.16b, v23.16b, v16.16b
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s
-; CHECK-NEXT: fcvtn2 v18.4s, v7.2d
-; CHECK-NEXT: add v7.4s, v3.4s, v2.4s
-; CHECK-NEXT: orr v3.4s, #64, lsl #16
-; CHECK-NEXT: add v5.4s, v24.4s, v5.4s
-; CHECK-NEXT: and v24.16b, v25.16b, v16.16b
-; CHECK-NEXT: ushr v25.4s, v4.4s, #16
-; CHECK-NEXT: fcvtn2 v22.4s, v19.2d
-; CHECK-NEXT: add v19.4s, v23.4s, v26.4s
-; CHECK-NEXT: ushr v26.4s, v6.4s, #16
-; CHECK-NEXT: fcvtn2 v17.4s, v21.2d
-; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s
-; CHECK-NEXT: orr v1.4s, #64, lsl #16
-; CHECK-NEXT: and v23.16b, v25.16b, v16.16b
-; CHECK-NEXT: add v25.4s, v4.4s, v2.4s
-; CHECK-NEXT: add v7.4s, v24.4s, v7.4s
-; CHECK-NEXT: ushr v24.4s, v18.4s, #16
-; CHECK-NEXT: add v30.4s, v18.4s, v2.4s
-; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b
-; CHECK-NEXT: ushr v28.4s, v22.4s, #16
-; CHECK-NEXT: add v31.4s, v22.4s, v2.4s
-; CHECK-NEXT: add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT: and v25.16b, v26.16b, v16.16b
-; CHECK-NEXT: add v26.4s, v6.4s, v2.4s
-; CHECK-NEXT: ushr v29.4s, v17.4s, #16
-; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT: add v2.4s, v17.4s, v2.4s
-; CHECK-NEXT: and v28.16b, v28.16b, v16.16b
-; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b
-; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b
-; CHECK-NEXT: add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s
-; CHECK-NEXT: orr v6.4s, #64, lsl #16
-; CHECK-NEXT: and v16.16b, v29.16b, v16.16b
-; CHECK-NEXT: add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s
-; CHECK-NEXT: add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s
-; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s
-; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: orr v18.4s, #64, lsl #16
-; CHECK-NEXT: orr v22.4s, #64, lsl #16
-; CHECK-NEXT: mov v5.16b, v26.16b
-; CHECK-NEXT: add v2.4s, v16.4s, v2.4s
-; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s
-; CHECK-NEXT: orr v17.4s, #64, lsl #16
-; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: mov v7.16b, v31.16b
-; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b
-; CHECK-NEXT: mov v6.16b, v30.16b
-; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b
-; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b
-; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b
-; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h
-; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h
-; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <32 x i64> %a to <32 x bfloat>
- ret <32 x bfloat> %c
-}
-
-define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) {
-; CHECK-LABEL: utofp_v32i64_v32bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v17.2d, v2.2d
-; CHECK-NEXT: ucvtf v18.2d, v0.2d
-; CHECK-NEXT: ucvtf v19.2d, v3.2d
-; CHECK-NEXT: ucvtf v3.2d, v6.2d
-; CHECK-NEXT: ldp q21, q20, [sp, #32]
-; CHECK-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-NEXT: ucvtf v6.2d, v7.2d
-; CHECK-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-NEXT: ldp q24, q23, [sp, #64]
-; CHECK-NEXT: movi v16.4s, #1
-; CHECK-NEXT: fcvtn v0.2s, v17.2d
-; CHECK-NEXT: ucvtf v17.2d, v1.2d
-; CHECK-NEXT: fcvtn v1.2s, v18.2d
-; CHECK-NEXT: fcvtn v3.2s, v3.2d
-; CHECK-NEXT: ldp q18, q7, [sp]
-; CHECK-NEXT: ucvtf v21.2d, v21.2d
-; CHECK-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: ucvtf v20.2d, v20.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v19.2d
-; CHECK-NEXT: ldp q22, q19, [sp, #96]
-; CHECK-NEXT: fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT: fcvtn2 v3.4s, v6.2d
-; CHECK-NEXT: ucvtf v18.2d, v18.2d
-; CHECK-NEXT: ucvtf v17.2d, v24.2d
-; CHECK-NEXT: fcvtn v6.2s, v21.2d
-; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: ucvtf v22.2d, v22.2d
-; CHECK-NEXT: ucvtf v21.2d, v23.2d
-; CHECK-NEXT: ucvtf v7.2d, v7.2d
-; CHECK-NEXT: ushr v24.4s, v0.4s, #16
-; CHECK-NEXT: add v5.4s, v0.4s, v2.4s
-; CHECK-NEXT: ucvtf v19.2d, v19.2d
-; CHECK-NEXT: ushr v23.4s, v1.4s, #16
-; CHECK-NEXT: ushr v25.4s, v3.4s, #16
-; CHECK-NEXT: fcvtn v18.2s, v18.2d
-; CHECK-NEXT: fcvtn2 v6.4s, v20.2d
-; CHECK-NEXT: add v26.4s, v1.4s, v2.4s
-; CHECK-NEXT: fcvtn v17.2s, v17.2d
-; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT: fcvtn v22.2s, v22.2d
-; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s
-; CHECK-NEXT: and v23.16b, v23.16b, v16.16b
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s
-; CHECK-NEXT: fcvtn2 v18.4s, v7.2d
-; CHECK-NEXT: add v7.4s, v3.4s, v2.4s
-; CHECK-NEXT: orr v3.4s, #64, lsl #16
-; CHECK-NEXT: add v5.4s, v24.4s, v5.4s
-; CHECK-NEXT: and v24.16b, v25.16b, v16.16b
-; CHECK-NEXT: ushr v25.4s, v4.4s, #16
-; CHECK-NEXT: fcvtn2 v22.4s, v19.2d
-; CHECK-NEXT: add v19.4s, v23.4s, v26.4s
-; CHECK-NEXT: ushr v26.4s, v6.4s, #16
-; CHECK-NEXT: fcvtn2 v17.4s, v21.2d
-; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s
-; CHECK-NEXT: orr v1.4s, #64, lsl #16
-; CHECK-NEXT: and v23.16b, v25.16b, v16.16b
-; CHECK-NEXT: add v25.4s, v4.4s, v2.4s
-; CHECK-NEXT: add v7.4s, v24.4s, v7.4s
-; CHECK-NEXT: ushr v24.4s, v18.4s, #16
-; CHECK-NEXT: add v30.4s, v18.4s, v2.4s
-; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b
-; CHECK-NEXT: ushr v28.4s, v22.4s, #16
-; CHECK-NEXT: add v31.4s, v22.4s, v2.4s
-; CHECK-NEXT: add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT: and v25.16b, v26.16b, v16.16b
-; CHECK-NEXT: add v26.4s, v6.4s, v2.4s
-; CHECK-NEXT: ushr v29.4s, v17.4s, #16
-; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT: add v2.4s, v17.4s, v2.4s
-; CHECK-NEXT: and v28.16b, v28.16b, v16.16b
-; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b
-; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b
-; CHECK-NEXT: add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s
-; CHECK-NEXT: orr v6.4s, #64, lsl #16
-; CHECK-NEXT: and v16.16b, v29.16b, v16.16b
-; CHECK-NEXT: add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s
-; CHECK-NEXT: add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s
-; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s
-; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: orr v18.4s, #64, lsl #16
-; CHECK-NEXT: orr v22.4s, #64, lsl #16
-; CHECK-NEXT: mov v5.16b, v26.16b
-; CHECK-NEXT: add v2.4s, v16.4s, v2.4s
-; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s
-; CHECK-NEXT: orr v17.4s, #64, lsl #16
-; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: mov v7.16b, v31.16b
-; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b
-; CHECK-NEXT: mov v6.16b, v30.16b
-; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b
-; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b
-; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b
-; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h
-; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h
-; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <32 x i64> %a to <32 x bfloat>
- ret <32 x bfloat> %c
-}
-
-define <2 x bfloat> @stofp_v2i32_v2bf16(<2 x i32> %a) {
-; CHECK-LABEL: stofp_v2i32_v2bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <2 x i32> %a to <2 x bfloat>
- ret <2 x bfloat> %c
-}
-
-define <2 x bfloat> @utofp_v2i32_v2bf16(<2 x i32> %a) {
-; CHECK-LABEL: utofp_v2i32_v2bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <2 x i32> %a to <2 x bfloat>
- ret <2 x bfloat> %c
-}
-
-define <3 x bfloat> @stofp_v3i32_v3bf16(<3 x i32> %a) {
-; CHECK-LABEL: stofp_v3i32_v3bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <3 x i32> %a to <3 x bfloat>
- ret <3 x bfloat> %c
-}
-
-define <3 x bfloat> @utofp_v3i32_v3bf16(<3 x i32> %a) {
-; CHECK-LABEL: utofp_v3i32_v3bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <3 x i32> %a to <3 x bfloat>
- ret <3 x bfloat> %c
-}
-
-define <4 x bfloat> @stofp_v4i32_v4bf16(<4 x i32> %a) {
-; CHECK-LABEL: stofp_v4i32_v4bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <4 x i32> %a to <4 x bfloat>
- ret <4 x bfloat> %c
-}
-
-define <4 x bfloat> @utofp_v4i32_v4bf16(<4 x i32> %a) {
-; CHECK-LABEL: utofp_v4i32_v4bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <4 x i32> %a to <4 x bfloat>
- ret <4 x bfloat> %c
-}
-
-define <8 x bfloat> @stofp_v8i32_v8bf16(<8 x i32> %a) {
-; CHECK-LABEL: stofp_v8i32_v8bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: movi v2.4s, #1
-; CHECK-NEXT: scvtf v1.4s, v1.4s
-; CHECK-NEXT: movi v5.4s, #127, msl #8
-; CHECK-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-NEXT: ushr v4.4s, v1.4s, #16
-; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v5.4s
-; CHECK-NEXT: addhn2 v0.8h, v1.4s, v5.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <8 x i32> %a to <8 x bfloat>
- ret <8 x bfloat> %c
-}
-
-define <8 x bfloat> @utofp_v8i32_v8bf16(<8 x i32> %a) {
-; CHECK-LABEL: utofp_v8i32_v8bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: movi v2.4s, #1
-; CHECK-NEXT: ucvtf v1.4s, v1.4s
-; CHECK-NEXT: movi v5.4s, #127, msl #8
-; CHECK-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-NEXT: ushr v4.4s, v1.4s, #16
-; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
-; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v5.4s
-; CHECK-NEXT: addhn2 v0.8h, v1.4s, v5.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <8 x i32> %a to <8 x bfloat>
- ret <8 x bfloat> %c
-}
-
-define <16 x bfloat> @stofp_v16i32_v16bf16(<16 x i32> %a) {
-; CHECK-LABEL: stofp_v16i32_v16bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v2.4s, v2.4s
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: scvtf v4.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: scvtf v3.4s, v3.4s
-; CHECK-NEXT: movi v17.4s, #127, msl #8
-; CHECK-NEXT: ushr v5.4s, v0.4s, #16
-; CHECK-NEXT: ushr v6.4s, v2.4s, #16
-; CHECK-NEXT: ushr v7.4s, v4.4s, #16
-; CHECK-NEXT: ushr v16.4s, v3.4s, #16
-; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v6.16b, v6.16b, v1.16b
-; CHECK-NEXT: add v0.4s, v5.4s, v0.4s
-; CHECK-NEXT: add v2.4s, v6.4s, v2.4s
-; CHECK-NEXT: and v5.16b, v7.16b, v1.16b
-; CHECK-NEXT: and v6.16b, v16.16b, v1.16b
-; CHECK-NEXT: addhn v0.4h, v0.4s, v17.4s
-; CHECK-NEXT: addhn v1.4h, v2.4s, v17.4s
-; CHECK-NEXT: add v2.4s, v5.4s, v4.4s
-; CHECK-NEXT: add v3.4s, v6.4s, v3.4s
-; CHECK-NEXT: addhn2 v0.8h, v2.4s, v17.4s
-; CHECK-NEXT: addhn2 v1.8h, v3.4s, v17.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <16 x i32> %a to <16 x bfloat>
- ret <16 x bfloat> %c
-}
-
-define <16 x bfloat> @utofp_v16i32_v16bf16(<16 x i32> %a) {
-; CHECK-LABEL: utofp_v16i32_v16bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v2.4s, v2.4s
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ucvtf v4.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ucvtf v3.4s, v3.4s
-; CHECK-NEXT: movi v17.4s, #127, msl #8
-; CHECK-NEXT: ushr v5.4s, v0.4s, #16
-; CHECK-NEXT: ushr v6.4s, v2.4s, #16
-; CHECK-NEXT: ushr v7.4s, v4.4s, #16
-; CHECK-NEXT: ushr v16.4s, v3.4s, #16
-; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v6.16b, v6.16b, v1.16b
-; CHECK-NEXT: add v0.4s, v5.4s, v0.4s
-; CHECK-NEXT: add v2.4s, v6.4s, v2.4s
-; CHECK-NEXT: and v5.16b, v7.16b, v1.16b
-; CHECK-NEXT: and v6.16b, v16.16b, v1.16b
-; CHECK-NEXT: addhn v0.4h, v0.4s, v17.4s
-; CHECK-NEXT: addhn v1.4h, v2.4s, v17.4s
-; CHECK-NEXT: add v2.4s, v5.4s, v4.4s
-; CHECK-NEXT: add v3.4s, v6.4s, v3.4s
-; CHECK-NEXT: addhn2 v0.8h, v2.4s, v17.4s
-; CHECK-NEXT: addhn2 v1.8h, v3.4s, v17.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <16 x i32> %a to <16 x bfloat>
- ret <16 x bfloat> %c
-}
-
-define <32 x bfloat> @stofp_v32i32_v32bf16(<32 x i32> %a) {
-; CHECK-LABEL: stofp_v32i32_v32bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: scvtf v2.4s, v2.4s
-; CHECK-NEXT: scvtf v4.4s, v4.4s
-; CHECK-NEXT: scvtf v6.4s, v6.4s
-; CHECK-NEXT: movi v16.4s, #1
-; CHECK-NEXT: scvtf v1.4s, v1.4s
-; CHECK-NEXT: scvtf v17.4s, v3.4s
-; CHECK-NEXT: scvtf v5.4s, v5.4s
-; CHECK-NEXT: scvtf v7.4s, v7.4s
-; CHECK-NEXT: movi v21.4s, #127, msl #8
-; CHECK-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-NEXT: ushr v18.4s, v2.4s, #16
-; CHECK-NEXT: ushr v19.4s, v4.4s, #16
-; CHECK-NEXT: ushr v20.4s, v6.4s, #16
-; CHECK-NEXT: ushr v22.4s, v1.4s, #16
-; CHECK-NEXT: ushr v23.4s, v17.4s, #16
-; CHECK-NEXT: ushr v24.4s, v5.4s, #16
-; CHECK-NEXT: ushr v25.4s, v7.4s, #16
-; CHECK-NEXT: and v3.16b, v3.16b, v16.16b
-; CHECK-NEXT: and v18.16b, v18.16b, v16.16b
-; CHECK-NEXT: and v19.16b, v19.16b, v16.16b
-; CHECK-NEXT: and v20.16b, v20.16b, v16.16b
-; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: and v3.16b, v22.16b, v16.16b
-; CHECK-NEXT: add v2.4s, v18.4s, v2.4s
-; CHECK-NEXT: add v4.4s, v19.4s, v4.4s
-; CHECK-NEXT: add v6.4s, v20.4s, v6.4s
-; CHECK-NEXT: and v18.16b, v23.16b, v16.16b
-; CHECK-NEXT: and v19.16b, v24.16b, v16.16b
-; CHECK-NEXT: and v16.16b, v25.16b, v16.16b
-; CHECK-NEXT: add v20.4s, v3.4s, v1.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v21.4s
-; CHECK-NEXT: addhn v1.4h, v2.4s, v21.4s
-; CHECK-NEXT: addhn v2.4h, v4.4s, v21.4s
-; CHECK-NEXT: addhn v3.4h, v6.4s, v21.4s
-; CHECK-NEXT: add v4.4s, v18.4s, v17.4s
-; CHECK-NEXT: add v5.4s, v19.4s, v5.4s
-; CHECK-NEXT: add v6.4s, v16.4s, v7.4s
-; CHECK-NEXT: addhn2 v0.8h, v20.4s, v21.4s
-; CHECK-NEXT: addhn2 v1.8h, v4.4s, v21.4s
-; CHECK-NEXT: addhn2 v2.8h, v5.4s, v21.4s
-; CHECK-NEXT: addhn2 v3.8h, v6.4s, v21.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <32 x i32> %a to <32 x bfloat>
- ret <32 x bfloat> %c
-}
-
-define <32 x bfloat> @utofp_v32i32_v32bf16(<32 x i32> %a) {
-; CHECK-LABEL: utofp_v32i32_v32bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ucvtf v2.4s, v2.4s
-; CHECK-NEXT: ucvtf v4.4s, v4.4s
-; CHECK-NEXT: ucvtf v6.4s, v6.4s
-; CHECK-NEXT: movi v16.4s, #1
-; CHECK-NEXT: ucvtf v1.4s, v1.4s
-; CHECK-NEXT: ucvtf v17.4s, v3.4s
-; CHECK-NEXT: ucvtf v5.4s, v5.4s
-; CHECK-NEXT: ucvtf v7.4s, v7.4s
-; CHECK-NEXT: movi v21.4s, #127, msl #8
-; CHECK-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-NEXT: ushr v18.4s, v2.4s, #16
-; CHECK-NEXT: ushr v19.4s, v4.4s, #16
-; CHECK-NEXT: ushr v20.4s, v6.4s, #16
-; CHECK-NEXT: ushr v22.4s, v1.4s, #16
-; CHECK-NEXT: ushr v23.4s, v17.4s, #16
-; CHECK-NEXT: ushr v24.4s, v5.4s, #16
-; CHECK-NEXT: ushr v25.4s, v7.4s, #16
-; CHECK-NEXT: and v3.16b, v3.16b, v16.16b
-; CHECK-NEXT: and v18.16b, v18.16b, v16.16b
-; CHECK-NEXT: and v19.16b, v19.16b, v16.16b
-; CHECK-NEXT: and v20.16b, v20.16b, v16.16b
-; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: and v3.16b, v22.16b, v16.16b
-; CHECK-NEXT: add v2.4s, v18.4s, v2.4s
-; CHECK-NEXT: add v4.4s, v19.4s, v4.4s
-; CHECK-NEXT: add v6.4s, v20.4s, v6.4s
-; CHECK-NEXT: and v18.16b, v23.16b, v16.16b
-; CHECK-NEXT: and v19.16b, v24.16b, v16.16b
-; CHECK-NEXT: and v16.16b, v25.16b, v16.16b
-; CHECK-NEXT: add v20.4s, v3.4s, v1.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v21.4s
-; CHECK-NEXT: addhn v1.4h, v2.4s, v21.4s
-; CHECK-NEXT: addhn v2.4h, v4.4s, v21.4s
-; CHECK-NEXT: addhn v3.4h, v6.4s, v21.4s
-; CHECK-NEXT: add v4.4s, v18.4s, v17.4s
-; CHECK-NEXT: add v5.4s, v19.4s, v5.4s
-; CHECK-NEXT: add v6.4s, v16.4s, v7.4s
-; CHECK-NEXT: addhn2 v0.8h, v20.4s, v21.4s
-; CHECK-NEXT: addhn2 v1.8h, v4.4s, v21.4s
-; CHECK-NEXT: addhn2 v2.8h, v5.4s, v21.4s
-; CHECK-NEXT: addhn2 v3.8h, v6.4s, v21.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <32 x i32> %a to <32 x bfloat>
- ret <32 x bfloat> %c
-}
-
-define <2 x bfloat> @stofp_v2i16_v2bf16(<2 x i16> %a) {
-; CHECK-LABEL: stofp_v2i16_v2bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <2 x i16> %a to <2 x bfloat>
- ret <2 x bfloat> %c
-}
-
-define <2 x bfloat> @utofp_v2i16_v2bf16(<2 x i16> %a) {
-; CHECK-LABEL: utofp_v2i16_v2bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <2 x i16> %a to <2 x bfloat>
- ret <2 x bfloat> %c
-}
-
-define <3 x bfloat> @stofp_v3i16_v3bf16(<3 x i16> %a) {
-; CHECK-LABEL: stofp_v3i16_v3bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <3 x i16> %a to <3 x bfloat>
- ret <3 x bfloat> %c
-}
-
-define <3 x bfloat> @utofp_v3i16_v3bf16(<3 x i16> %a) {
-; CHECK-LABEL: utofp_v3i16_v3bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <3 x i16> %a to <3 x bfloat>
- ret <3 x bfloat> %c
-}
-
-define <4 x bfloat> @stofp_v4i16_v4bf16(<4 x i16> %a) {
-; CHECK-LABEL: stofp_v4i16_v4bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <4 x i16> %a to <4 x bfloat>
- ret <4 x bfloat> %c
-}
-
-define <4 x bfloat> @utofp_v4i16_v4bf16(<4 x i16> %a) {
-; CHECK-LABEL: utofp_v4i16_v4bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <4 x i16> %a to <4 x bfloat>
- ret <4 x bfloat> %c
-}
-
-define <8 x bfloat> @stofp_v8i16_v8bf16(<8 x i16> %a) {
-; CHECK-LABEL: stofp_v8i16_v8bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v2.4s, v0.4h, #0
-; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v4.4s, #127, msl #8
-; CHECK-NEXT: scvtf v2.4s, v2.4s
-; CHECK-NEXT: scvtf v3.4s, v0.4s
-; CHECK-NEXT: ushr v0.4s, v2.4s, #16
-; CHECK-NEXT: ushr v5.4s, v3.4s, #16
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: addhn v0.4h, v2.4s, v0.4s
-; CHECK-NEXT: addhn2 v0.8h, v3.4s, v1.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <8 x i16> %a to <8 x bfloat>
- ret <8 x bfloat> %c
-}
-
-define <8 x bfloat> @utofp_v8i16_v8bf16(<8 x i16> %a) {
-; CHECK-LABEL: utofp_v8i16_v8bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v2.4s, v0.4h, #0
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v4.4s, #127, msl #8
-; CHECK-NEXT: ucvtf v2.4s, v2.4s
-; CHECK-NEXT: ucvtf v3.4s, v0.4s
-; CHECK-NEXT: ushr v0.4s, v2.4s, #16
-; CHECK-NEXT: ushr v5.4s, v3.4s, #16
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: addhn v0.4h, v2.4s, v0.4s
-; CHECK-NEXT: addhn2 v0.8h, v3.4s, v1.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <8 x i16> %a to <8 x bfloat>
- ret <8 x bfloat> %c
-}
-
-define <16 x bfloat> @stofp_v16i16_v16bf16(<16 x i16> %a) {
-; CHECK-LABEL: stofp_v16i16_v16bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v3.4s, v0.4h, #0
-; CHECK-NEXT: sshll v4.4s, v1.4h, #0
-; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: movi v2.4s, #1
-; CHECK-NEXT: movi v7.4s, #127, msl #8
-; CHECK-NEXT: scvtf v3.4s, v3.4s
-; CHECK-NEXT: scvtf v4.4s, v4.4s
-; CHECK-NEXT: scvtf v5.4s, v0.4s
-; CHECK-NEXT: scvtf v6.4s, v1.4s
-; CHECK-NEXT: ushr v0.4s, v3.4s, #16
-; CHECK-NEXT: ushr v1.4s, v4.4s, #16
-; CHECK-NEXT: ushr v16.4s, v5.4s, #16
-; CHECK-NEXT: ushr v17.4s, v6.4s, #16
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v16.16b, v16.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v17.16b, v2.16b
-; CHECK-NEXT: add v0.4s, v0.4s, v7.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v7.4s
-; CHECK-NEXT: add v2.4s, v2.4s, v7.4s
-; CHECK-NEXT: addhn v0.4h, v3.4s, v0.4s
-; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s
-; CHECK-NEXT: add v3.4s, v16.4s, v7.4s
-; CHECK-NEXT: addhn2 v0.8h, v5.4s, v3.4s
-; CHECK-NEXT: addhn2 v1.8h, v6.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <16 x i16> %a to <16 x bfloat>
- ret <16 x bfloat> %c
-}
-
-define <16 x bfloat> @utofp_v16i16_v16bf16(<16 x i16> %a) {
-; CHECK-LABEL: utofp_v16i16_v16bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v3.4s, v0.4h, #0
-; CHECK-NEXT: ushll v4.4s, v1.4h, #0
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: movi v2.4s, #1
-; CHECK-NEXT: movi v7.4s, #127, msl #8
-; CHECK-NEXT: ucvtf v3.4s, v3.4s
-; CHECK-NEXT: ucvtf v4.4s, v4.4s
-; CHECK-NEXT: ucvtf v5.4s, v0.4s
-; CHECK-NEXT: ucvtf v6.4s, v1.4s
-; CHECK-NEXT: ushr v0.4s, v3.4s, #16
-; CHECK-NEXT: ushr v1.4s, v4.4s, #16
-; CHECK-NEXT: ushr v16.4s, v5.4s, #16
-; CHECK-NEXT: ushr v17.4s, v6.4s, #16
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v16.16b, v16.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v17.16b, v2.16b
-; CHECK-NEXT: add v0.4s, v0.4s, v7.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v7.4s
-; CHECK-NEXT: add v2.4s, v2.4s, v7.4s
-; CHECK-NEXT: addhn v0.4h, v3.4s, v0.4s
-; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s
-; CHECK-NEXT: add v3.4s, v16.4s, v7.4s
-; CHECK-NEXT: addhn2 v0.8h, v5.4s, v3.4s
-; CHECK-NEXT: addhn2 v1.8h, v6.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <16 x i16> %a to <16 x bfloat>
- ret <16 x bfloat> %c
-}
-
-define <32 x bfloat> @stofp_v32i16_v32bf16(<32 x i16> %a) {
-; CHECK-LABEL: stofp_v32i16_v32bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v4.4s, v1.4h, #0
-; CHECK-NEXT: sshll v5.4s, v0.4h, #0
-; CHECK-NEXT: sshll v6.4s, v2.4h, #0
-; CHECK-NEXT: sshll v7.4s, v3.4h, #0
-; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0
-; CHECK-NEXT: movi v16.4s, #1
-; CHECK-NEXT: scvtf v5.4s, v5.4s
-; CHECK-NEXT: scvtf v4.4s, v4.4s
-; CHECK-NEXT: scvtf v6.4s, v6.4s
-; CHECK-NEXT: scvtf v7.4s, v7.4s
-; CHECK-NEXT: scvtf v17.4s, v0.4s
-; CHECK-NEXT: scvtf v18.4s, v1.4s
-; CHECK-NEXT: scvtf v19.4s, v2.4s
-; CHECK-NEXT: scvtf v20.4s, v3.4s
-; CHECK-NEXT: movi v21.4s, #127, msl #8
-; CHECK-NEXT: ushr v0.4s, v5.4s, #16
-; CHECK-NEXT: ushr v1.4s, v4.4s, #16
-; CHECK-NEXT: ushr v2.4s, v6.4s, #16
-; CHECK-NEXT: ushr v3.4s, v7.4s, #16
-; CHECK-NEXT: ushr v22.4s, v17.4s, #16
-; CHECK-NEXT: ushr v23.4s, v18.4s, #16
-; CHECK-NEXT: ushr v24.4s, v19.4s, #16
-; CHECK-NEXT: ushr v25.4s, v20.4s, #16
-; CHECK-NEXT: and v0.16b, v0.16b, v16.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v16.16b
-; CHECK-NEXT: and v2.16b, v2.16b, v16.16b
-; CHECK-NEXT: and v3.16b, v3.16b, v16.16b
-; CHECK-NEXT: and v22.16b, v22.16b, v16.16b
-; CHECK-NEXT: and v23.16b, v23.16b, v16.16b
-; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT: and v16.16b, v25.16b, v16.16b
-; CHECK-NEXT: add v0.4s, v0.4s, v21.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v21.4s
-; CHECK-NEXT: add v2.4s, v2.4s, v21.4s
-; CHECK-NEXT: add v3.4s, v3.4s, v21.4s
-; CHECK-NEXT: addhn v0.4h, v5.4s, v0.4s
-; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s
-; CHECK-NEXT: addhn v2.4h, v6.4s, v2.4s
-; CHECK-NEXT: addhn v3.4h, v7.4s, v3.4s
-; CHECK-NEXT: add v4.4s, v22.4s, v21.4s
-; CHECK-NEXT: add v5.4s, v23.4s, v21.4s
-; CHECK-NEXT: add v6.4s, v24.4s, v21.4s
-; CHECK-NEXT: add v7.4s, v16.4s, v21.4s
-; CHECK-NEXT: addhn2 v0.8h, v17.4s, v4.4s
-; CHECK-NEXT: addhn2 v1.8h, v18.4s, v5.4s
-; CHECK-NEXT: addhn2 v2.8h, v19.4s, v6.4s
-; CHECK-NEXT: addhn2 v3.8h, v20.4s, v7.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <32 x i16> %a to <32 x bfloat>
- ret <32 x bfloat> %c
-}
-
-define <32 x bfloat> @utofp_v32i16_v32bf16(<32 x i16> %a) {
-; CHECK-LABEL: utofp_v32i16_v32bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v4.4s, v1.4h, #0
-; CHECK-NEXT: ushll v5.4s, v0.4h, #0
-; CHECK-NEXT: ushll v6.4s, v2.4h, #0
-; CHECK-NEXT: ushll v7.4s, v3.4h, #0
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0
-; CHECK-NEXT: movi v16.4s, #1
-; CHECK-NEXT: ucvtf v5.4s, v5.4s
-; CHECK-NEXT: ucvtf v4.4s, v4.4s
-; CHECK-NEXT: ucvtf v6.4s, v6.4s
-; CHECK-NEXT: ucvtf v7.4s, v7.4s
-; CHECK-NEXT: ucvtf v17.4s, v0.4s
-; CHECK-NEXT: ucvtf v18.4s, v1.4s
-; CHECK-NEXT: ucvtf v19.4s, v2.4s
-; CHECK-NEXT: ucvtf v20.4s, v3.4s
-; CHECK-NEXT: movi v21.4s, #127, msl #8
-; CHECK-NEXT: ushr v0.4s, v5.4s, #16
-; CHECK-NEXT: ushr v1.4s, v4.4s, #16
-; CHECK-NEXT: ushr v2.4s, v6.4s, #16
-; CHECK-NEXT: ushr v3.4s, v7.4s, #16
-; CHECK-NEXT: ushr v22.4s, v17.4s, #16
-; CHECK-NEXT: ushr v23.4s, v18.4s, #16
-; CHECK-NEXT: ushr v24.4s, v19.4s, #16
-; CHECK-NEXT: ushr v25.4s, v20.4s, #16
-; CHECK-NEXT: and v0.16b, v0.16b, v16.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v16.16b
-; CHECK-NEXT: and v2.16b, v2.16b, v16.16b
-; CHECK-NEXT: and v3.16b, v3.16b, v16.16b
-; CHECK-NEXT: and v22.16b, v22.16b, v16.16b
-; CHECK-NEXT: and v23.16b, v23.16b, v16.16b
-; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT: and v16.16b, v25.16b, v16.16b
-; CHECK-NEXT: add v0.4s, v0.4s, v21.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v21.4s
-; CHECK-NEXT: add v2.4s, v2.4s, v21.4s
-; CHECK-NEXT: add v3.4s, v3.4s, v21.4s
-; CHECK-NEXT: addhn v0.4h, v5.4s, v0.4s
-; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s
-; CHECK-NEXT: addhn v2.4h, v6.4s, v2.4s
-; CHECK-NEXT: addhn v3.4h, v7.4s, v3.4s
-; CHECK-NEXT: add v4.4s, v22.4s, v21.4s
-; CHECK-NEXT: add v5.4s, v23.4s, v21.4s
-; CHECK-NEXT: add v6.4s, v24.4s, v21.4s
-; CHECK-NEXT: add v7.4s, v16.4s, v21.4s
-; CHECK-NEXT: addhn2 v0.8h, v17.4s, v4.4s
-; CHECK-NEXT: addhn2 v1.8h, v18.4s, v5.4s
-; CHECK-NEXT: addhn2 v2.8h, v19.4s, v6.4s
-; CHECK-NEXT: addhn2 v3.8h, v20.4s, v7.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <32 x i16> %a to <32 x bfloat>
- ret <32 x bfloat> %c
-}
-
-define <2 x bfloat> @stofp_v2i8_v2bf16(<2 x i8> %a) {
-; CHECK-LABEL: stofp_v2i8_v2bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w9, v0.s[1]
-; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: sxtb w10, w10
-; CHECK-NEXT: sxtb w9, w9
-; CHECK-NEXT: scvtf s1, w10
-; CHECK-NEXT: scvtf s0, w9
-; CHECK-NEXT: fmov w10, s1
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w12, w10, #16, #1
-; CHECK-NEXT: ubfx w11, w9, #16, #1
-; CHECK-NEXT: add w9, w9, w8
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: add w8, w12, w8
-; CHECK-NEXT: add w9, w11, w9
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <2 x i8> %a to <2 x bfloat>
- ret <2 x bfloat> %c
-}
-
-define <2 x bfloat> @utofp_v2i8_v2bf16(<2 x i8> %a) {
-; CHECK-LABEL: utofp_v2i8_v2bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w9, v0.s[1]
-; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-NEXT: and w10, w10, #0xff
-; CHECK-NEXT: and w9, w9, #0xff
-; CHECK-NEXT: ucvtf s1, w10
-; CHECK-NEXT: ucvtf s0, w9
-; CHECK-NEXT: fmov w10, s1
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: ubfx w12, w10, #16, #1
-; CHECK-NEXT: ubfx w11, w9, #16, #1
-; CHECK-NEXT: add w9, w9, w8
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: add w8, w12, w8
-; CHECK-NEXT: add w9, w11, w9
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <2 x i8> %a to <2 x bfloat>
- ret <2 x bfloat> %c
-}
-
-define <3 x bfloat> @stofp_v3i8_v3bf16(<3 x i8> %a) {
-; CHECK-LABEL: stofp_v3i8_v3bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: mov v0.h[1], w1
-; CHECK-NEXT: mov v0.h[2], w2
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <3 x i8> %a to <3 x bfloat>
- ret <3 x bfloat> %c
-}
-
-define <3 x bfloat> @utofp_v3i8_v3bf16(<3 x i8> %a) {
-; CHECK-LABEL: utofp_v3i8_v3bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: mov v0.h[1], w1
-; CHECK-NEXT: mov v0.h[2], w2
-; CHECK-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <3 x i8> %a to <3 x bfloat>
- ret <3 x bfloat> %c
-}
-
-define <4 x bfloat> @stofp_v4i8_v4bf16(<4 x i8> %a) {
-; CHECK-LABEL: stofp_v4i8_v4bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: scvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <4 x i8> %a to <4 x bfloat>
- ret <4 x bfloat> %c
-}
-
-define <4 x bfloat> @utofp_v4i8_v4bf16(<4 x i8> %a) {
-; CHECK-LABEL: utofp_v4i8_v4bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #16
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <4 x i8> %a to <4 x bfloat>
- ret <4 x bfloat> %c
-}
-
-define <8 x bfloat> @stofp_v8i8_v8bf16(<8 x i8> %a) {
-; CHECK-LABEL: stofp_v8i8_v8bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v4.4s, #127, msl #8
-; CHECK-NEXT: sshll v2.4s, v0.4h, #0
-; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: scvtf v2.4s, v2.4s
-; CHECK-NEXT: scvtf v3.4s, v0.4s
-; CHECK-NEXT: ushr v0.4s, v2.4s, #16
-; CHECK-NEXT: ushr v5.4s, v3.4s, #16
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: addhn v0.4h, v2.4s, v0.4s
-; CHECK-NEXT: addhn2 v0.8h, v3.4s, v1.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <8 x i8> %a to <8 x bfloat>
- ret <8 x bfloat> %c
-}
-
-define <8 x bfloat> @utofp_v8i8_v8bf16(<8 x i8> %a) {
-; CHECK-LABEL: utofp_v8i8_v8bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v4.4s, #127, msl #8
-; CHECK-NEXT: ushll v2.4s, v0.4h, #0
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: ucvtf v2.4s, v2.4s
-; CHECK-NEXT: ucvtf v3.4s, v0.4s
-; CHECK-NEXT: ushr v0.4s, v2.4s, #16
-; CHECK-NEXT: ushr v5.4s, v3.4s, #16
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: addhn v0.4h, v2.4s, v0.4s
-; CHECK-NEXT: addhn2 v0.8h, v3.4s, v1.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <8 x i8> %a to <8 x bfloat>
- ret <8 x bfloat> %c
-}
-
-define <16 x bfloat> @stofp_v16i8_v16bf16(<16 x i8> %a) {
-; CHECK-LABEL: stofp_v16i8_v16bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll2 v2.8h, v0.16b, #0
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v7.4s, #127, msl #8
-; CHECK-NEXT: sshll v3.4s, v2.4h, #0
-; CHECK-NEXT: sshll v4.4s, v0.4h, #0
-; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: scvtf v3.4s, v3.4s
-; CHECK-NEXT: scvtf v4.4s, v4.4s
-; CHECK-NEXT: scvtf v2.4s, v2.4s
-; CHECK-NEXT: scvtf v6.4s, v0.4s
-; CHECK-NEXT: ushr v5.4s, v3.4s, #16
-; CHECK-NEXT: ushr v0.4s, v4.4s, #16
-; CHECK-NEXT: ushr v16.4s, v2.4s, #16
-; CHECK-NEXT: ushr v17.4s, v6.4s, #16
-; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT: and v17.16b, v17.16b, v1.16b
-; CHECK-NEXT: add v5.4s, v5.4s, v7.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v7.4s
-; CHECK-NEXT: addhn v1.4h, v3.4s, v5.4s
-; CHECK-NEXT: addhn v0.4h, v4.4s, v0.4s
-; CHECK-NEXT: add v3.4s, v16.4s, v7.4s
-; CHECK-NEXT: add v4.4s, v17.4s, v7.4s
-; CHECK-NEXT: addhn2 v1.8h, v2.4s, v3.4s
-; CHECK-NEXT: addhn2 v0.8h, v6.4s, v4.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <16 x i8> %a to <16 x bfloat>
- ret <16 x bfloat> %c
-}
-
-define <16 x bfloat> @utofp_v16i8_v16bf16(<16 x i8> %a) {
-; CHECK-LABEL: utofp_v16i8_v16bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v7.4s, #127, msl #8
-; CHECK-NEXT: ushll v3.4s, v2.4h, #0
-; CHECK-NEXT: ushll v4.4s, v0.4h, #0
-; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: ucvtf v3.4s, v3.4s
-; CHECK-NEXT: ucvtf v4.4s, v4.4s
-; CHECK-NEXT: ucvtf v2.4s, v2.4s
-; CHECK-NEXT: ucvtf v6.4s, v0.4s
-; CHECK-NEXT: ushr v5.4s, v3.4s, #16
-; CHECK-NEXT: ushr v0.4s, v4.4s, #16
-; CHECK-NEXT: ushr v16.4s, v2.4s, #16
-; CHECK-NEXT: ushr v17.4s, v6.4s, #16
-; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT: and v17.16b, v17.16b, v1.16b
-; CHECK-NEXT: add v5.4s, v5.4s, v7.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v7.4s
-; CHECK-NEXT: addhn v1.4h, v3.4s, v5.4s
-; CHECK-NEXT: addhn v0.4h, v4.4s, v0.4s
-; CHECK-NEXT: add v3.4s, v16.4s, v7.4s
-; CHECK-NEXT: add v4.4s, v17.4s, v7.4s
-; CHECK-NEXT: addhn2 v1.8h, v2.4s, v3.4s
-; CHECK-NEXT: addhn2 v0.8h, v6.4s, v4.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <16 x i8> %a to <16 x bfloat>
- ret <16 x bfloat> %c
-}
-
-define <32 x bfloat> @stofp_v32i8_v32bf16(<32 x i8> %a) {
-; CHECK-LABEL: stofp_v32i8_v32bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: movi v2.4s, #1
-; CHECK-NEXT: movi v21.4s, #127, msl #8
-; CHECK-NEXT: sshll v5.4s, v3.4h, #0
-; CHECK-NEXT: sshll v6.4s, v0.4h, #0
-; CHECK-NEXT: sshll v7.4s, v4.4h, #0
-; CHECK-NEXT: sshll v16.4s, v1.4h, #0
-; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0
-; CHECK-NEXT: sshll2 v4.4s, v4.8h, #0
-; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: scvtf v5.4s, v5.4s
-; CHECK-NEXT: scvtf v6.4s, v6.4s
-; CHECK-NEXT: scvtf v7.4s, v7.4s
-; CHECK-NEXT: scvtf v16.4s, v16.4s
-; CHECK-NEXT: scvtf v17.4s, v3.4s
-; CHECK-NEXT: scvtf v4.4s, v4.4s
-; CHECK-NEXT: scvtf v18.4s, v0.4s
-; CHECK-NEXT: scvtf v19.4s, v1.4s
-; CHECK-NEXT: ushr v0.4s, v5.4s, #16
-; CHECK-NEXT: ushr v3.4s, v6.4s, #16
-; CHECK-NEXT: ushr v1.4s, v7.4s, #16
-; CHECK-NEXT: ushr v20.4s, v16.4s, #16
-; CHECK-NEXT: ushr v23.4s, v17.4s, #16
-; CHECK-NEXT: ushr v24.4s, v4.4s, #16
-; CHECK-NEXT: ushr v22.4s, v18.4s, #16
-; CHECK-NEXT: ushr v25.4s, v19.4s, #16
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v20.16b, v20.16b, v2.16b
-; CHECK-NEXT: and v23.16b, v23.16b, v2.16b
-; CHECK-NEXT: and v24.16b, v24.16b, v2.16b
-; CHECK-NEXT: and v22.16b, v22.16b, v2.16b
-; CHECK-NEXT: and v25.16b, v25.16b, v2.16b
-; CHECK-NEXT: add v0.4s, v0.4s, v21.4s
-; CHECK-NEXT: add v3.4s, v3.4s, v21.4s
-; CHECK-NEXT: add v26.4s, v1.4s, v21.4s
-; CHECK-NEXT: add v20.4s, v20.4s, v21.4s
-; CHECK-NEXT: addhn v1.4h, v5.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v6.4s, v3.4s
-; CHECK-NEXT: addhn v3.4h, v7.4s, v26.4s
-; CHECK-NEXT: addhn v2.4h, v16.4s, v20.4s
-; CHECK-NEXT: add v5.4s, v22.4s, v21.4s
-; CHECK-NEXT: add v6.4s, v23.4s, v21.4s
-; CHECK-NEXT: add v7.4s, v24.4s, v21.4s
-; CHECK-NEXT: add v16.4s, v25.4s, v21.4s
-; CHECK-NEXT: addhn2 v0.8h, v18.4s, v5.4s
-; CHECK-NEXT: addhn2 v1.8h, v17.4s, v6.4s
-; CHECK-NEXT: addhn2 v3.8h, v4.4s, v7.4s
-; CHECK-NEXT: addhn2 v2.8h, v19.4s, v16.4s
-; CHECK-NEXT: ret
-entry:
- %c = sitofp <32 x i8> %a to <32 x bfloat>
- ret <32 x bfloat> %c
-}
-
-define <32 x bfloat> @utofp_v32i8_v32bf16(<32 x i8> %a) {
-; CHECK-LABEL: utofp_v32i8_v32bf16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: movi v2.4s, #1
-; CHECK-NEXT: movi v21.4s, #127, msl #8
-; CHECK-NEXT: ushll v5.4s, v3.4h, #0
-; CHECK-NEXT: ushll v6.4s, v0.4h, #0
-; CHECK-NEXT: ushll v7.4s, v4.4h, #0
-; CHECK-NEXT: ushll v16.4s, v1.4h, #0
-; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0
-; CHECK-NEXT: ushll2 v4.4s, v4.8h, #0
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: ucvtf v5.4s, v5.4s
-; CHECK-NEXT: ucvtf v6.4s, v6.4s
-; CHECK-NEXT: ucvtf v7.4s, v7.4s
-; CHECK-NEXT: ucvtf v16.4s, v16.4s
-; CHECK-NEXT: ucvtf v17.4s, v3.4s
-; CHECK-NEXT: ucvtf v4.4s, v4.4s
-; CHECK-NEXT: ucvtf v18.4s, v0.4s
-; CHECK-NEXT: ucvtf v19.4s, v1.4s
-; CHECK-NEXT: ushr v0.4s, v5.4s, #16
-; CHECK-NEXT: ushr v3.4s, v6.4s, #16
-; CHECK-NEXT: ushr v1.4s, v7.4s, #16
-; CHECK-NEXT: ushr v20.4s, v16.4s, #16
-; CHECK-NEXT: ushr v23.4s, v17.4s, #16
-; CHECK-NEXT: ushr v24.4s, v4.4s, #16
-; CHECK-NEXT: ushr v22.4s, v18.4s, #16
-; CHECK-NEXT: ushr v25.4s, v19.4s, #16
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v20.16b, v20.16b, v2.16b
-; CHECK-NEXT: and v23.16b, v23.16b, v2.16b
-; CHECK-NEXT: and v24.16b, v24.16b, v2.16b
-; CHECK-NEXT: and v22.16b, v22.16b, v2.16b
-; CHECK-NEXT: and v25.16b, v25.16b, v2.16b
-; CHECK-NEXT: add v0.4s, v0.4s, v21.4s
-; CHECK-NEXT: add v3.4s, v3.4s, v21.4s
-; CHECK-NEXT: add v26.4s, v1.4s, v21.4s
-; CHECK-NEXT: add v20.4s, v20.4s, v21.4s
-; CHECK-NEXT: addhn v1.4h, v5.4s, v0.4s
-; CHECK-NEXT: addhn v0.4h, v6.4s, v3.4s
-; CHECK-NEXT: addhn v3.4h, v7.4s, v26.4s
-; CHECK-NEXT: addhn v2.4h, v16.4s, v20.4s
-; CHECK-NEXT: add v5.4s, v22.4s, v21.4s
-; CHECK-NEXT: add v6.4s, v23.4s, v21.4s
-; CHECK-NEXT: add v7.4s, v24.4s, v21.4s
-; CHECK-NEXT: add v16.4s, v25.4s, v21.4s
-; CHECK-NEXT: addhn2 v0.8h, v18.4s, v5.4s
-; CHECK-NEXT: addhn2 v1.8h, v17.4s, v6.4s
-; CHECK-NEXT: addhn2 v3.8h, v4.4s, v7.4s
-; CHECK-NEXT: addhn2 v2.8h, v19.4s, v16.4s
-; CHECK-NEXT: ret
-entry:
- %c = uitofp <32 x i8> %a to <32 x bfloat>
- ret <32 x bfloat> %c
-}
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
new file mode 100644
index 0000000..ae681ee
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-vector-interleave=1 -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0)
+ ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
+ ret <4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: add z0.s, z2.s, z0.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_quart:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: add z2.s, z2.s, z3.s
+; CHECK-NEXT: add z0.s, z4.s, z0.s
+; CHECK-NEXT: add z0.s, z2.s, z0.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add z0.s, z0.s, z2.s
+; CHECK-NEXT: add z1.s, z1.s, z3.s
+; CHECK-NEXT: add z0.s, z4.s, z0.s
+; CHECK-NEXT: add z1.s, z5.s, z1.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)
+ ret <vscale x 8 x i32> %partial.reduce
+}
+
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32>, <vscale x 8 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32>, <vscale x 16 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>)
+
+attributes #0 = { "target-features"="+sve2" }
diff --git a/llvm/test/CodeGen/RISCV/convert-highly-predictable-select-to-branch.ll b/llvm/test/CodeGen/RISCV/convert-highly-predictable-select-to-branch.ll
new file mode 100644
index 0000000..1abd774
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/convert-highly-predictable-select-to-branch.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -mattr=+zicond < %s | FileCheck %s --check-prefixes=CHECK,CHEAP
+; RUN: llc -mtriple=riscv64 -mattr=+zicond,+predictable-select-expensive < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE
+
+; Test has a select that is not predictable, which should not be transformed to a branch
+define i32 @test1(i32 %a) {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sext.w a1, a0
+; CHECK-NEXT: slti a1, a1, 1
+; CHECK-NEXT: addiw a0, a0, -1
+; CHECK-NEXT: czero.nez a0, a0, a1
+; CHECK-NEXT: ret
+entry:
+ %cmp = icmp slt i32 %a, 1
+ %dec = sub i32 %a, 1
+ %res = select i1 %cmp, i32 0, i32 %dec, !prof !0
+ ret i32 %res
+}
+
+; Test has a highly predictable select according to profile data,
+; which should be transformed to a branch on cores with FeaturePredictableSelectIsExpensive enabled
+define i32 @test2(i32 %a) {
+; CHEAP-LABEL: test2:
+; CHEAP: # %bb.0: # %entry
+; CHEAP-NEXT: sext.w a1, a0
+; CHEAP-NEXT: slti a1, a1, 1
+; CHEAP-NEXT: addiw a0, a0, -1
+; CHEAP-NEXT: czero.nez a0, a0, a1
+; CHEAP-NEXT: ret
+;
+; EXPENSIVE-LABEL: test2:
+; EXPENSIVE: # %bb.0: # %entry
+; EXPENSIVE-NEXT: sext.w a1, a0
+; EXPENSIVE-NEXT: blez a1, .LBB1_2
+; EXPENSIVE-NEXT: # %bb.1: # %select.false
+; EXPENSIVE-NEXT: addiw a0, a0, -1
+; EXPENSIVE-NEXT: ret
+; EXPENSIVE-NEXT: .LBB1_2:
+; EXPENSIVE-NEXT: li a0, 0
+; EXPENSIVE-NEXT: ret
+entry:
+ %cmp = icmp slt i32 %a, 1
+ %dec = sub i32 %a, 1
+ %res = select i1 %cmp, i32 0, i32 %dec, !prof !1
+ ret i32 %res
+}
+
+!0 = !{!"branch_weights", i32 1, i32 1}
+!1 = !{!"branch_weights", i32 1, i32 1000}
diff --git a/llvm/test/CodeGen/RISCV/fold-binop-into-select-return-constant.ll b/llvm/test/CodeGen/RISCV/fold-binop-into-select-return-constant.ll
new file mode 100644
index 0000000..000da23
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/fold-binop-into-select-return-constant.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -mattr=+m < %s | FileCheck %s
+
+define i64 @fold_binop_into_select_return_constant(i1 %c) {
+; CHECK-LABEL: fold_binop_into_select_return_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: ret
+entry:
+ %select1 = select i1 %c, i32 4, i32 8
+ %select2 = sext i32 %select1 to i64
+ %div1 = sdiv i64 %select2, -5141143369814759789
+ ret i64 %div1
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index da8db34..e95b0bf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -34,15 +34,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
- %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
%wide.load = load <32 x i8>, ptr %i2, align 1
%i4 = add <32 x i8> %wide.load, %wide.masked.gather
store <32 x i8> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -79,7 +79,7 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
%wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
@@ -87,7 +87,7 @@ vector.body: ; preds = %vector.body, %entry
%i4 = add <32 x i8> %wide.load, %wide.masked.gather
store <32 x i8> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -120,15 +120,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
- %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
%wide.load = load <32 x i8>, ptr %i2, align 1
%i4 = add <32 x i8> %wide.load, %wide.masked.gather
store <32 x i8> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -159,15 +159,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
- %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
%wide.load = load <32 x i8>, ptr %i2, align 1
%i4 = add <32 x i8> %wide.load, %wide.masked.gather
store <32 x i8> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -197,15 +197,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <8 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds i8, ptr %B, <8 x i64> %i
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
%wide.load = load <8 x i32>, ptr %i2, align 4
%i4 = add <8 x i32> %wide.load, %wide.masked.gather
store <8 x i32> %i4, ptr %i2, align 4
%index.next = add nuw i64 %index, 8
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -271,15 +271,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
- %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
%wide.load = load <32 x i8>, ptr %i2, align 1
%i4 = udiv <32 x i8> %wide.masked.gather, %wide.load
store <32 x i8> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -317,13 +317,13 @@ vector.body: ; preds = %vector.body, %entry
%vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
%i = getelementptr inbounds i8, ptr %B, i64 %index
%wide.load = load <32 x i8>, ptr %i, align 1
- %i2 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
- %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i4 = add <32 x i8> %wide.masked.gather, %wide.load
- call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true))
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i5 = icmp eq i64 %index.next, 1024
br i1 %i5, label %for.cond.cleanup, label %vector.body
@@ -362,13 +362,13 @@ vector.body: ; preds = %vector.body, %entry
%vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
%i = getelementptr inbounds i8, ptr %B, i64 %index
%wide.load = load <32 x i8>, ptr %i, align 1
- %i2 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
%wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
%i4 = add <32 x i8> %wide.masked.gather, %wide.load
call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i5 = icmp eq i64 %index.next, 1024
br i1 %i5, label %for.cond.cleanup, label %vector.body
@@ -408,15 +408,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = shl nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ %i = shl nsw <8 x i64> %vec.ind, splat (i64 2)
%i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i2 = getelementptr inbounds i32, ptr %A, i64 %index
%wide.load = load <8 x i32>, ptr %i2, align 1
%i4 = add <8 x i32> %wide.load, %wide.masked.gather
store <8 x i32> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 8
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -456,13 +456,13 @@ vector.body: ; preds = %vector.body, %entry
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
%i = getelementptr inbounds i32, ptr %B, i64 %index
%wide.load = load <8 x i32>, ptr %i, align 1
- %i2 = shl nuw nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ %i2 = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
%i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i4 = add <8 x i32> %wide.masked.gather, %wide.load
- call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true))
%index.next = add nuw i64 %index, 8
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
%i5 = icmp eq i64 %index.next, 1024
br i1 %i5, label %for.cond.cleanup, label %vector.body
@@ -509,11 +509,11 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
- %step.add = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+ %step.add = add <8 x i64> %vec.ind, splat (i64 8)
%i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
%i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
- %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
+ %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i2 = getelementptr inbounds i32, ptr %A, i64 %index
%wide.load = load <8 x i32>, ptr %i2, align 4
%i4 = getelementptr inbounds i32, ptr %i2, i64 8
@@ -523,7 +523,7 @@ vector.body: ; preds = %vector.body, %entry
store <8 x i32> %i6, ptr %i2, align 4
store <8 x i32> %i7, ptr %i4, align 4
%index.next = add nuw i64 %index, 16
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 16)
%i10 = icmp eq i64 %index.next, 1024
br i1 %i10, label %for.cond.cleanup, label %vector.body
@@ -582,39 +582,39 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = shl nuw nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ %i = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
%i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
- %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
- call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
- %i4 = or disjoint <8 x i64> %vec.ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
- %i5 = shl nsw <8 x i64> %i4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true))
+ %i4 = or disjoint <8 x i64> %vec.ind, splat (i64 1)
+ %i5 = shl nsw <8 x i64> %i4, splat (i64 2)
%i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
- %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
- %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
- call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
- %i9 = or disjoint <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
- %i10 = shl nsw <8 x i64> %i9, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true))
+ %i9 = or disjoint <8 x i64> %vec.ind, splat (i64 2)
+ %i10 = shl nsw <8 x i64> %i9, splat (i64 2)
%i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
- %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
- %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
- call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true))
%i14 = or disjoint <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
%i15 = shl nsw <8 x i64> %i14, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
%i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
- %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
- %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
- call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true))
%index.next = add nuw i64 %index, 8
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
%i19 = icmp eq i64 %index.next, 256
br i1 %i19, label %for.cond.cleanup, label %vector.body
@@ -682,13 +682,13 @@ bb:
bb2: ; preds = %bb2, %bb
%i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
%i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
- %i4 = mul nuw nsw <2 x i64> %i3, <i64 5, i64 5>
- %i5 = mul <2 x i64> %i3, <i64 5, i64 5>
+ %i4 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
+ %i5 = mul <2 x i64> %i3, splat (i64 5)
%i6 = add <2 x i64> %i5, <i64 10, i64 10>
%i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
%i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
- %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> <i1 true, i1 true>, <2 x ptr> undef)
- %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> <i1 true, i1 true>, <2 x ptr> undef)
+ %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
+ %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
%i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
store <2 x ptr> %i9, ptr %i11, align 8
%i13 = getelementptr inbounds ptr, ptr %i11, i64 2
@@ -763,13 +763,13 @@ bb2: ; preds = %bb2, %bb
%i6 = load <2 x ptr>, ptr %i4, align 8
%i7 = getelementptr inbounds ptr, ptr %i4, i64 2
%i9 = load <2 x ptr>, ptr %i7, align 8
- %i10 = mul nuw nsw <2 x i64> %i3, <i64 5, i64 5>
- %i11 = mul <2 x i64> %i3, <i64 5, i64 5>
+ %i10 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
+ %i11 = mul <2 x i64> %i3, splat (i64 5)
%i12 = add <2 x i64> %i11, <i64 10, i64 10>
%i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
%i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
- call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> <i1 true, i1 true>)
- call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> <i1 true, i1 true>)
+ call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> splat (i1 true))
+ call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> splat (i1 true))
%i15 = add nuw i64 %i, 4
%i16 = add <2 x i64> %i3, <i64 4, i64 4>
%i17 = icmp eq i64 %i15, 1024
@@ -863,15 +863,15 @@ bb15: ; preds = %bb15, %bb9
%i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
%i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
%i18 = add i64 %i16, %i4
- %i19 = mul nsw <32 x i64> %i17, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i19 = mul nsw <32 x i64> %i17, splat (i64 5)
%i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
- %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
%i24 = load <32 x i8>, ptr %i22, align 1
%i25 = add <32 x i8> %i24, %i21
store <32 x i8> %i25, ptr %i22, align 1
%i27 = add nuw i64 %i16, 32
- %i28 = add <32 x i64> %i17, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %i28 = add <32 x i64> %i17, splat (i64 32)
%i29 = icmp eq i64 %i27, %i10
br i1 %i29, label %bb30, label %bb15
@@ -938,15 +938,15 @@ bb2: ; preds = %bb
bb4: ; preds = %bb4, %bb2
%i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
%i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
- %i7 = mul <16 x i64> %i6, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i7 = mul <16 x i64> %i6, splat (i64 5)
%i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
- %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
+ %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> splat (i1 true), <16 x i8> undef)
%i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
%i11 = load <16 x i8>, ptr %i10, align 1
%i12 = add <16 x i8> %i11, %i9
store <16 x i8> %i12, ptr %i10, align 1
%i13 = add nuw i64 %i5, 16
- %i14 = add <16 x i64> %i6, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+ %i14 = add <16 x i64> %i6, splat (i64 16)
%i15 = icmp eq i64 %i13, %i
br i1 %i15, label %bb16, label %bb4
@@ -977,15 +977,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <8 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds float, ptr %B, <8 x i64> %i
- %wide.masked.gather = call <8 x float> @llvm.masked.gather.v8f32.v32p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
+ %wide.masked.gather = call <8 x float> @llvm.masked.gather.v8f32.v32p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x float> undef)
%i2 = getelementptr inbounds float, ptr %A, i64 %index
%wide.load = load <8 x float>, ptr %i2, align 4
%i4 = fadd <8 x float> %wide.load, %wide.masked.gather
store <8 x float> %i4, ptr %i2, align 4
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
index 77243c0..ab5885a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
@@ -34,15 +34,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
- %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
%wide.load = load <32 x i8>, ptr %i2, align 1
%i4 = add <32 x i8> %wide.load, %wide.masked.gather
store <32 x i8> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -76,7 +76,7 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
%wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
@@ -84,7 +84,7 @@ vector.body: ; preds = %vector.body, %entry
%i4 = add <32 x i8> %wide.load, %wide.masked.gather
store <32 x i8> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -118,15 +118,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
- %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
%wide.load = load <32 x i8>, ptr %i2, align 1
%i4 = add <32 x i8> %wide.load, %wide.masked.gather
store <32 x i8> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -160,15 +160,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
- %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
%wide.load = load <32 x i8>, ptr %i2, align 1
%i4 = add <32 x i8> %wide.load, %wide.masked.gather
store <32 x i8> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -208,13 +208,13 @@ vector.body: ; preds = %vector.body, %entry
%vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
%i = getelementptr inbounds i8, ptr %B, i64 %index
%wide.load = load <32 x i8>, ptr %i, align 1
- %i2 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
- %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i4 = add <32 x i8> %wide.masked.gather, %wide.load
- call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true))
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i5 = icmp eq i64 %index.next, 1024
br i1 %i5, label %for.cond.cleanup, label %vector.body
@@ -250,13 +250,13 @@ vector.body: ; preds = %vector.body, %entry
%vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
%i = getelementptr inbounds i8, ptr %B, i64 %index
%wide.load = load <32 x i8>, ptr %i, align 1
- %i2 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
%i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
%wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
%i4 = add <32 x i8> %wide.masked.gather, %wide.load
call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
%i5 = icmp eq i64 %index.next, 1024
br i1 %i5, label %for.cond.cleanup, label %vector.body
@@ -294,15 +294,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = shl nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ %i = shl nsw <8 x i64> %vec.ind, splat (i64 2)
%i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i2 = getelementptr inbounds i32, ptr %A, i64 %index
%wide.load = load <8 x i32>, ptr %i2, align 1
%i4 = add <8 x i32> %wide.load, %wide.masked.gather
store <8 x i32> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 8
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -343,13 +343,13 @@ vector.body: ; preds = %vector.body, %entry
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
%i = shl nsw <8 x i64> %vec.ind, %.splat
%i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i2 = getelementptr inbounds i32, ptr %A, i64 %index
%wide.load = load <8 x i32>, ptr %i2, align 1
%i4 = add <8 x i32> %wide.load, %wide.masked.gather
store <8 x i32> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 8
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -390,13 +390,13 @@ vector.body: ; preds = %vector.body, %entry
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
%i = shl nsw <8 x i64> %.splat, %vec.ind
%i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i2 = getelementptr inbounds i32, ptr %A, i64 %index
%wide.load = load <8 x i32>, ptr %i2, align 1
%i4 = add <8 x i32> %wide.load, %wide.masked.gather
store <8 x i32> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 8
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
@@ -436,13 +436,13 @@ vector.body: ; preds = %vector.body, %entry
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
%i = getelementptr inbounds i32, ptr %B, i64 %index
%wide.load = load <8 x i32>, ptr %i, align 1
- %i2 = shl nuw nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ %i2 = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
%i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i4 = add <8 x i32> %wide.masked.gather, %wide.load
- call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true))
%index.next = add nuw i64 %index, 8
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
%i5 = icmp eq i64 %index.next, 1024
br i1 %i5, label %for.cond.cleanup, label %vector.body
@@ -492,11 +492,11 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
- %step.add = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+ %step.add = add <8 x i64> %vec.ind, splat (i64 8)
%i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
%i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
- %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
+ %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i2 = getelementptr inbounds i32, ptr %A, i64 %index
%wide.load = load <8 x i32>, ptr %i2, align 4
%i4 = getelementptr inbounds i32, ptr %i2, i64 8
@@ -506,7 +506,7 @@ vector.body: ; preds = %vector.body, %entry
store <8 x i32> %i6, ptr %i2, align 4
store <8 x i32> %i7, ptr %i4, align 4
%index.next = add nuw i64 %index, 16
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 16)
%i10 = icmp eq i64 %index.next, 1024
br i1 %i10, label %for.cond.cleanup, label %vector.body
@@ -580,39 +580,39 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = shl nuw nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ %i = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
%i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
- %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
- %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
- call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
- %i4 = or disjoint <8 x i64> %vec.ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
- %i5 = shl nsw <8 x i64> %i4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true))
+ %i4 = or disjoint <8 x i64> %vec.ind, splat (i64 1)
+ %i5 = shl nsw <8 x i64> %i4, splat (i64 2)
%i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
- %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
- %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
- call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
- %i9 = or disjoint <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
- %i10 = shl nsw <8 x i64> %i9, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true))
+ %i9 = or disjoint <8 x i64> %vec.ind, splat (i64 2)
+ %i10 = shl nsw <8 x i64> %i9, splat (i64 2)
%i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
- %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
- %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
- call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
- %i14 = or disjoint <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
- %i15 = shl nsw <8 x i64> %i14, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true))
+ %i14 = or disjoint <8 x i64> %vec.ind, splat (i64 3)
+ %i15 = shl nsw <8 x i64> %i14, splat (i64 2)
%i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
- %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
- %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
%i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
- call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true))
%index.next = add nuw i64 %index, 8
- %vec.ind.next = add <8 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
%i19 = icmp eq i64 %index.next, 256
br i1 %i19, label %for.cond.cleanup, label %vector.body
@@ -680,13 +680,13 @@ bb:
bb2: ; preds = %bb2, %bb
%i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
%i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
- %i4 = mul nuw nsw <2 x i64> %i3, <i64 5, i64 5>
- %i5 = mul <2 x i64> %i3, <i64 5, i64 5>
+ %i4 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
+ %i5 = mul <2 x i64> %i3, splat (i64 5)
%i6 = add <2 x i64> %i5, <i64 10, i64 10>
%i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
%i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
- %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> <i1 true, i1 true>, <2 x ptr> undef)
- %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> <i1 true, i1 true>, <2 x ptr> undef)
+ %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
+ %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
%i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
store <2 x ptr> %i9, ptr %i11, align 8
%i13 = getelementptr inbounds ptr, ptr %i11, i64 2
@@ -761,13 +761,13 @@ bb2: ; preds = %bb2, %bb
%i6 = load <2 x ptr>, ptr %i4, align 8
%i7 = getelementptr inbounds ptr, ptr %i4, i64 2
%i9 = load <2 x ptr>, ptr %i7, align 8
- %i10 = mul nuw nsw <2 x i64> %i3, <i64 5, i64 5>
- %i11 = mul <2 x i64> %i3, <i64 5, i64 5>
+ %i10 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
+ %i11 = mul <2 x i64> %i3, splat (i64 5)
%i12 = add <2 x i64> %i11, <i64 10, i64 10>
%i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
%i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
- call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> <i1 true, i1 true>)
- call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> <i1 true, i1 true>)
+ call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> splat (i1 true))
+ call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> splat (i1 true))
%i15 = add nuw i64 %i, 4
%i16 = add <2 x i64> %i3, <i64 4, i64 4>
%i17 = icmp eq i64 %i15, 1024
@@ -856,15 +856,15 @@ bb15: ; preds = %bb15, %bb9
%i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
%i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
%i18 = add i64 %i16, %i4
- %i19 = mul nsw <32 x i64> %i17, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i19 = mul nsw <32 x i64> %i17, splat (i64 5)
%i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
- %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
%i24 = load <32 x i8>, ptr %i22, align 1
%i25 = add <32 x i8> %i24, %i21
store <32 x i8> %i25, ptr %i22, align 1
%i27 = add nuw i64 %i16, 32
- %i28 = add <32 x i64> %i17, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %i28 = add <32 x i64> %i17, splat (i64 32)
%i29 = icmp eq i64 %i27, %i10
br i1 %i29, label %bb30, label %bb15
@@ -932,15 +932,15 @@ bb2: ; preds = %bb
bb4: ; preds = %bb4, %bb2
%i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
%i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
- %i7 = mul <16 x i64> %i6, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+ %i7 = mul <16 x i64> %i6, splat (i64 5)
%i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
- %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
+ %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> splat (i1 true), <16 x i8> undef)
%i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
%i11 = load <16 x i8>, ptr %i10, align 1
%i12 = add <16 x i8> %i11, %i9
store <16 x i8> %i12, ptr %i10, align 1
%i13 = add nuw i64 %i5, 16
- %i14 = add <16 x i64> %i6, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+ %i14 = add <16 x i64> %i6, splat (i64 16)
%i15 = icmp eq i64 %i13, %i
br i1 %i15, label %bb16, label %bb4
@@ -958,7 +958,7 @@ entry:
%0 = insertelement <8 x ptr> poison, ptr %a, i64 0
%1 = shufflevector <8 x ptr> %0, <8 x ptr> poison, <8 x i32> zeroinitializer
%2 = getelementptr i8, <8 x ptr> %1, <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
- %3 = tail call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %2, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> poison)
+ %3 = tail call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %2, i32 1, <8 x i1> splat (i1 true), <8 x i8> poison)
ret <8 x i8> %3
}
@@ -991,15 +991,15 @@ entry:
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.ind = phi <32 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>, %entry ], [ %vec.ind.next, %vector.body ]
- %i = mul nuw nsw <32 x i16> %vec.ind, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+ %i = mul nuw nsw <32 x i16> %vec.ind, splat (i16 5)
%i1 = getelementptr inbounds i8, ptr %B, <32 x i16> %i
- %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+ %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
%i2 = getelementptr inbounds i8, ptr %A, i64 %index
%wide.load = load <32 x i8>, ptr %i2, align 1
%i4 = add <32 x i8> %wide.load, %wide.masked.gather
store <32 x i8> %i4, ptr %i2, align 1
%index.next = add nuw i64 %index, 32
- %vec.ind.next = add <32 x i16> %vec.ind, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+ %vec.ind.next = add <32 x i16> %vec.ind, splat (i16 32)
%i6 = icmp eq i64 %index.next, 1024
br i1 %i6, label %for.cond.cleanup, label %vector.body
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
index 7011928..f93022c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -1125,3 +1125,39 @@ exit:
call void @llvm.riscv.vse.nxv8i8(<vscale x 8 x i8> %1, ptr %p, i64 1)
ret void
}
+
+; Check that we don't forward an AVL if we wouldn't be able to extend its
+; LiveInterval without clobbering other value numbers (val nos).
+define <vscale x 4 x i32> @unforwardable_avl(i64 %n, <vscale x 4 x i32> %v, i1 %cmp) {
+; CHECK-LABEL: unforwardable_avl:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a2, a0, e32, m2, ta, ma
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: .LBB27_1: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: bnez a1, .LBB27_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v10, v8, v8
+; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.riscv.vsetvli.i64(i64 %n, i64 2, i64 1)
+ br label %for.body
+
+for.body:
+  ; Use %n in a PHI here so that its virtual register is assigned to a second time.
+ %1 = phi i64 [ %3, %for.body ], [ %n, %entry ]
+ %2 = tail call i64 @llvm.riscv.vsetvli.i64(i64 %1, i64 0, i64 0)
+ %3 = add i64 %1, 1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ %4 = tail call <vscale x 4 x i32> @llvm.riscv.vadd.nxv2f32.nxv2f32.i64(<vscale x 4 x i32> undef, <vscale x 4 x i32> %v, <vscale x 4 x i32> %v, i64 -1)
+  ; VL toggle needed here: If the %n AVL was forwarded here, we wouldn't be able
+  ; to extend its LiveInterval because it would clobber the assignment at %1.
+ %5 = tail call <vscale x 4 x i32> @llvm.riscv.vadd.nxv2f32.nxv2f32.i64(<vscale x 4 x i32> undef, <vscale x 4 x i32> %4, <vscale x 4 x i32> %v, i64 %0)
+ ret <vscale x 4 x i32> %5
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
index 4091d17..ee10c99 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
@@ -134,6 +134,10 @@
ret void
}
+ define void @unforwardable_avl() {
+ ret void
+ }
+
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1
@@ -990,3 +994,43 @@ body: |
%x:gpr = PseudoVMV_X_S undef $noreg, 6
PseudoBR %bb.1
...
+---
+name: unforwardable_avl
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: unforwardable_avl
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %avl:gprnox0 = COPY $x10
+ ; CHECK-NEXT: %outvl:gprnox0 = PseudoVSETVLI %avl, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead %avl:gprnox0 = ADDI %avl, 1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: renamable $v10m2 = PseudoVADD_VV_M2 undef renamable $v10m2, renamable $v8m2, renamable $v8m2, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI %outvl, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: renamable $v8m2 = PseudoVADD_VV_M2 undef renamable $v8m2, killed renamable $v10m2, renamable $v8m2, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ bb.0:
+ liveins: $x10, $v8m2
+ %avl:gprnox0 = COPY $x10
+ %outvl:gprnox0 = PseudoVSETVLI %avl:gprnox0, 209, implicit-def dead $vl, implicit-def dead $vtype
+
+ bb.1:
+ liveins: $v8m2
+ %avl:gprnox0 = ADDI %avl:gprnox0, 1
+
+ bb.2:
+ liveins: $v8m2
+ renamable $v10m2 = PseudoVADD_VV_M2 undef renamable $v10m2, renamable $v8m2, renamable $v8m2, -1, 5, 0
+ renamable $v8m2 = PseudoVADD_VV_M2 undef renamable $v8m2, killed renamable $v10m2, killed renamable $v8m2, %outvl:gprnox0, 5, 0
+ PseudoRET implicit $v8m2
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index 40648ad..ca855cf 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -44,7 +44,6 @@
; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
; CHECK-NEXT: Local Stack Slot Allocation
; CHECK-NEXT: X86 speculative load hardening
-; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: X86 EFLAGS copy lowering
; CHECK-NEXT: X86 DynAlloca Expander
; CHECK-NEXT: Fast Tile Register Preconfigure
diff --git a/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll b/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
index daa521d..0c7275e 100644
--- a/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
+++ b/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
@@ -70,3 +70,31 @@ define i32 @mask_offset_scale_zext_i32_i64(ptr %base, i32 %i) {
%load = load i32, ptr %arrayidx, align 4
ret i32 %load
}
+
+; PR97533 - multiple uses of shl node (add + gep) in the same dependency chain.
+define i64 @add_shl_zext(ptr %ptr, i8 %arg) nounwind {
+; X86-LABEL: add_shl_zext:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl 4(%esi,%ecx,4), %edx
+; X86-NEXT: leal (,%ecx,8), %eax
+; X86-NEXT: addl (%esi,%ecx,4), %eax
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: add_shl_zext:
+; X64: # %bb.0:
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: shll $3, %eax
+; X64-NEXT: addq (%rdi,%rax), %rax
+; X64-NEXT: retq
+ %idx = zext i8 %arg to i64
+ %gep = getelementptr ptr, ptr %ptr, i64 %idx
+ %val = load i64, ptr %gep, align 8
+ %shl = shl i64 %idx, 3
+ %sum = add i64 %val, %shl
+ ret i64 %sum
+}
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 15c496b..9bee9d0 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -125,7 +125,6 @@
; CHECK-NEXT: X86 Optimize Call Frame
; CHECK-NEXT: X86 Avoid Store Forwarding Block
; CHECK-NEXT: X86 speculative load hardening
-; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: X86 EFLAGS copy lowering
; CHECK-NEXT: X86 DynAlloca Expander
; CHECK-NEXT: MachineDominator Tree Construction
diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index 30c3d53d..c9edd3f 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -444,12 +444,10 @@ define i64 @ashr_add_neg_shl_i32(i64 %r) nounwind {
define i64 @ashr_add_neg_shl_i8(i64 %r) nounwind {
; X86-LABEL: ashr_add_neg_shl_i8:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shll $24, %eax
-; X86-NEXT: movl $33554432, %edx # imm = 0x2000000
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: sarl $24, %eax
+; X86-NEXT: movb $2, %al
+; X86-NEXT: subb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: sarl $31, %edx
; X86-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll
index 870912b..39cbee5 100644
--- a/llvm/test/CodeGen/X86/sttni.ll
+++ b/llvm/test/CodeGen/X86/sttni.ll
@@ -341,9 +341,10 @@ define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs,
; X64-NEXT: .LBB8_2: # %compare
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $7, %ecx
-; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx), %eax
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: subw -40(%rsp,%rcx), %ax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
entry:
@@ -481,9 +482,10 @@ define i32 @pcmpestri_mem_diff_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
; X64-NEXT: .LBB11_2: # %compare
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $7, %ecx
-; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx), %eax
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: subw -40(%rsp,%rcx), %ax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
entry:
@@ -795,9 +797,10 @@ define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
; X64-NEXT: .LBB20_2: # %compare
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $7, %ecx
-; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx), %eax
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: subw -40(%rsp,%rcx), %ax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
entry:
@@ -915,9 +918,10 @@ define i32 @pcmpistri_mem_diff_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
; X64-NEXT: .LBB23_2: # %compare
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $7, %ecx
-; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: addl %ecx, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx), %eax
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: subw -40(%rsp,%rcx), %ax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 584a749..6350344 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
@@ -19,9 +19,9 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT: movsd -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
@@ -32,9 +32,9 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT: movsd -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
@@ -73,16 +73,16 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT: movd %xmm1, %esi
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: andl $3, %eax
; SSE3-NEXT: andl $3, %ecx
; SSE3-NEXT: andl $3, %edx
; SSE3-NEXT: andl $3, %esi
-; SSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
@@ -137,7 +137,7 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSE3-NEXT: pextrw $5, %xmm1, %r8d
; SSE3-NEXT: pextrw $6, %xmm1, %r9d
; SSE3-NEXT: pextrw $7, %xmm1, %r10d
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: andl $7, %eax
; SSE3-NEXT: andl $7, %ecx
; SSE3-NEXT: andl $7, %edx
@@ -226,69 +226,69 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8:
; SSE3: # %bb.0:
-; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movaps %xmm1, -40(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSE3-NEXT: movzbl -25(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -26(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -27(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm4
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -28(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -29(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm6
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -30(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm7
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -31(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm8
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -32(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm5
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -33(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm9
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -34(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -35(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -36(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -37(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm13
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -38(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -39(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm15
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -40(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
@@ -382,9 +382,9 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v2f64:
@@ -394,9 +394,9 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v2f64:
@@ -434,16 +434,16 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT: movd %xmm1, %esi
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
; SSE3-NEXT: andl $3, %eax
; SSE3-NEXT: andl $3, %ecx
; SSE3-NEXT: andl $3, %edx
; SSE3-NEXT: andl $3, %esi
-; SSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
@@ -490,69 +490,69 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE3: # %bb.0:
-; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movaps %xmm1, -40(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSE3-NEXT: movzbl -25(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -26(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -27(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm4
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -28(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -29(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm6
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -30(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm7
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -31(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm8
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -32(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm5
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -33(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm9
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -34(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -35(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -36(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -37(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm13
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -38(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -39(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm15
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -40(%rsp), %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
@@ -649,56 +649,56 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: subq $424, %rsp # imm = 0x1A8
-; SSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movaps %xmm2, -128(%rsp)
+; SSE3-NEXT: movaps %xmm1, 400(%rsp)
+; SSE3-NEXT: movaps %xmm0, 384(%rsp)
+; SSE3-NEXT: movzbl -128(%rsp), %eax
; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm1, 368(%rsp)
+; SSE3-NEXT: movaps %xmm0, 352(%rsp)
+; SSE3-NEXT: movzbl -127(%rsp), %ecx
+; SSE3-NEXT: movaps %xmm1, 336(%rsp)
+; SSE3-NEXT: movaps %xmm0, 320(%rsp)
+; SSE3-NEXT: movzbl -126(%rsp), %edx
+; SSE3-NEXT: movaps %xmm1, 304(%rsp)
+; SSE3-NEXT: movaps %xmm0, 288(%rsp)
+; SSE3-NEXT: movzbl -125(%rsp), %esi
+; SSE3-NEXT: movaps %xmm1, 272(%rsp)
+; SSE3-NEXT: movaps %xmm0, 256(%rsp)
+; SSE3-NEXT: movzbl -124(%rsp), %edi
+; SSE3-NEXT: movaps %xmm1, 240(%rsp)
+; SSE3-NEXT: movaps %xmm0, 224(%rsp)
+; SSE3-NEXT: movzbl -123(%rsp), %r8d
+; SSE3-NEXT: movaps %xmm1, 208(%rsp)
+; SSE3-NEXT: movaps %xmm0, 192(%rsp)
+; SSE3-NEXT: movzbl -122(%rsp), %r9d
+; SSE3-NEXT: movaps %xmm1, 176(%rsp)
+; SSE3-NEXT: movaps %xmm0, 160(%rsp)
+; SSE3-NEXT: movzbl -121(%rsp), %r10d
+; SSE3-NEXT: movaps %xmm1, 144(%rsp)
+; SSE3-NEXT: movaps %xmm0, 128(%rsp)
+; SSE3-NEXT: movzbl -120(%rsp), %r11d
+; SSE3-NEXT: movaps %xmm1, 112(%rsp)
+; SSE3-NEXT: movaps %xmm0, 96(%rsp)
+; SSE3-NEXT: movzbl -119(%rsp), %ebx
+; SSE3-NEXT: movaps %xmm1, 80(%rsp)
+; SSE3-NEXT: movaps %xmm0, 64(%rsp)
+; SSE3-NEXT: movzbl -118(%rsp), %r14d
+; SSE3-NEXT: movaps %xmm1, 48(%rsp)
+; SSE3-NEXT: movaps %xmm0, 32(%rsp)
+; SSE3-NEXT: movzbl -117(%rsp), %r15d
+; SSE3-NEXT: movaps %xmm1, 16(%rsp)
; SSE3-NEXT: movaps %xmm0, (%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
-; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
-; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -116(%rsp), %r12d
+; SSE3-NEXT: movaps %xmm1, -16(%rsp)
+; SSE3-NEXT: movaps %xmm0, -32(%rsp)
+; SSE3-NEXT: movzbl -115(%rsp), %r13d
+; SSE3-NEXT: movaps %xmm1, -48(%rsp)
+; SSE3-NEXT: movaps %xmm0, -64(%rsp)
+; SSE3-NEXT: movzbl -114(%rsp), %ebp
+; SSE3-NEXT: movaps %xmm1, -80(%rsp)
+; SSE3-NEXT: movaps %xmm0, -96(%rsp)
+; SSE3-NEXT: movzbl -113(%rsp), %eax
; SSE3-NEXT: andl $31, %eax
; SSE3-NEXT: movzbl -96(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
@@ -781,56 +781,56 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
; SSSE3-NEXT: subq $424, %rsp # imm = 0x1A8
-; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movaps %xmm2, -128(%rsp)
+; SSSE3-NEXT: movaps %xmm1, 400(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 384(%rsp)
+; SSSE3-NEXT: movzbl -128(%rsp), %eax
; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm1, 368(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 352(%rsp)
+; SSSE3-NEXT: movzbl -127(%rsp), %ecx
+; SSSE3-NEXT: movaps %xmm1, 336(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 320(%rsp)
+; SSSE3-NEXT: movzbl -126(%rsp), %edx
+; SSSE3-NEXT: movaps %xmm1, 304(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 288(%rsp)
+; SSSE3-NEXT: movzbl -125(%rsp), %esi
+; SSSE3-NEXT: movaps %xmm1, 272(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 256(%rsp)
+; SSSE3-NEXT: movzbl -124(%rsp), %edi
+; SSSE3-NEXT: movaps %xmm1, 240(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 224(%rsp)
+; SSSE3-NEXT: movzbl -123(%rsp), %r8d
+; SSSE3-NEXT: movaps %xmm1, 208(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 192(%rsp)
+; SSSE3-NEXT: movzbl -122(%rsp), %r9d
+; SSSE3-NEXT: movaps %xmm1, 176(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 160(%rsp)
+; SSSE3-NEXT: movzbl -121(%rsp), %r10d
+; SSSE3-NEXT: movaps %xmm1, 144(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 128(%rsp)
+; SSSE3-NEXT: movzbl -120(%rsp), %r11d
+; SSSE3-NEXT: movaps %xmm1, 112(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 96(%rsp)
+; SSSE3-NEXT: movzbl -119(%rsp), %ebx
+; SSSE3-NEXT: movaps %xmm1, 80(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 64(%rsp)
+; SSSE3-NEXT: movzbl -118(%rsp), %r14d
+; SSSE3-NEXT: movaps %xmm1, 48(%rsp)
+; SSSE3-NEXT: movaps %xmm0, 32(%rsp)
+; SSSE3-NEXT: movzbl -117(%rsp), %r15d
+; SSSE3-NEXT: movaps %xmm1, 16(%rsp)
; SSSE3-NEXT: movaps %xmm0, (%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movzbl -116(%rsp), %r12d
+; SSSE3-NEXT: movaps %xmm1, -16(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -32(%rsp)
+; SSSE3-NEXT: movzbl -115(%rsp), %r13d
+; SSSE3-NEXT: movaps %xmm1, -48(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -64(%rsp)
+; SSSE3-NEXT: movzbl -114(%rsp), %ebp
+; SSSE3-NEXT: movaps %xmm1, -80(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -96(%rsp)
+; SSSE3-NEXT: movzbl -113(%rsp), %eax
; SSSE3-NEXT: andl $31, %eax
; SSSE3-NEXT: movzbl -96(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm1
@@ -908,39 +908,39 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
; SSE41: # %bb.0:
; SSE41-NEXT: subq $392, %rsp # imm = 0x188
; SSE41-NEXT: movd %xmm2, %eax
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, 368(%rsp)
+; SSE41-NEXT: movaps %xmm0, 352(%rsp)
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, 336(%rsp)
+; SSE41-NEXT: movaps %xmm0, 320(%rsp)
+; SSE41-NEXT: movaps %xmm1, 304(%rsp)
+; SSE41-NEXT: movaps %xmm0, 288(%rsp)
+; SSE41-NEXT: movaps %xmm1, 272(%rsp)
+; SSE41-NEXT: movaps %xmm0, 256(%rsp)
+; SSE41-NEXT: movaps %xmm1, 240(%rsp)
+; SSE41-NEXT: movaps %xmm0, 224(%rsp)
+; SSE41-NEXT: movaps %xmm1, 208(%rsp)
+; SSE41-NEXT: movaps %xmm0, 192(%rsp)
+; SSE41-NEXT: movaps %xmm1, 176(%rsp)
+; SSE41-NEXT: movaps %xmm0, 160(%rsp)
+; SSE41-NEXT: movaps %xmm1, 144(%rsp)
+; SSE41-NEXT: movaps %xmm0, 128(%rsp)
+; SSE41-NEXT: movaps %xmm1, 112(%rsp)
+; SSE41-NEXT: movaps %xmm0, 96(%rsp)
+; SSE41-NEXT: movaps %xmm1, 80(%rsp)
+; SSE41-NEXT: movaps %xmm0, 64(%rsp)
+; SSE41-NEXT: movaps %xmm1, 48(%rsp)
+; SSE41-NEXT: movaps %xmm0, 32(%rsp)
+; SSE41-NEXT: movaps %xmm1, 16(%rsp)
; SSE41-NEXT: movaps %xmm0, (%rsp)
-; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, -16(%rsp)
+; SSE41-NEXT: movaps %xmm0, -32(%rsp)
+; SSE41-NEXT: movaps %xmm1, -48(%rsp)
+; SSE41-NEXT: movaps %xmm0, -64(%rsp)
+; SSE41-NEXT: movaps %xmm1, -80(%rsp)
+; SSE41-NEXT: movaps %xmm0, -96(%rsp)
+; SSE41-NEXT: movaps %xmm1, -112(%rsp)
+; SSE41-NEXT: movaps %xmm0, -128(%rsp)
; SSE41-NEXT: movzbl 352(%rsp,%rax), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pextrb $1, %xmm2, %eax
@@ -1102,14 +1102,15 @@ define void @indices_convert() {
; SSE3-LABEL: indices_convert:
; SSE3: # %bb.0: # %bb
; SSE3-NEXT: movaps (%rax), %xmm0
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSE3-NEXT: movaps %xmm0, -40(%rsp)
; SSE3-NEXT: movl (%rax), %eax
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -56(%rsp)
+; SSE3-NEXT: movaps %xmm0, -72(%rsp)
; SSE3-NEXT: andl $3, %eax
-; SSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE3-NEXT: shll $3, %eax
+; SSE3-NEXT: movsd -72(%rsp,%rax), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT: movsd -40(%rsp,%rax), %xmm1 # xmm1 = mem[0],zero
; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT: movups %xmm1, (%rax)
; SSE3-NEXT: retq
@@ -1117,14 +1118,15 @@ define void @indices_convert() {
; SSSE3-LABEL: indices_convert:
; SSSE3: # %bb.0: # %bb
; SSSE3-NEXT: movaps (%rax), %xmm0
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -40(%rsp)
; SSSE3-NEXT: movl (%rax), %eax
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -56(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -72(%rsp)
; SSSE3-NEXT: andl $3, %eax
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT: shll $3, %eax
+; SSSE3-NEXT: movsd -72(%rsp,%rax), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT: movsd -40(%rsp,%rax), %xmm1 # xmm1 = mem[0],zero
; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movups %xmm1, (%rax)
; SSSE3-NEXT: retq
@@ -1133,15 +1135,15 @@ define void @indices_convert() {
; SSE41: # %bb.0: # %bb
; SSE41-NEXT: movaps (%rax), %xmm0
; SSE41-NEXT: extractps $2, %xmm0, %eax
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -24(%rsp)
+; SSE41-NEXT: movaps %xmm0, -40(%rsp)
; SSE41-NEXT: andl $3, %eax
; SSE41-NEXT: extractps $3, %xmm0, %ecx
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -56(%rsp)
+; SSE41-NEXT: movaps %xmm0, -72(%rsp)
; SSE41-NEXT: andl $3, %ecx
-; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE41-NEXT: movsd -72(%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
+; SSE41-NEXT: movsd -40(%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE41-NEXT: movups %xmm1, (%rax)
; SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 5c82b92..0baa5bd 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefix=XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=INT256,AVX2
@@ -1189,8 +1189,8 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
; AVX2-NEXT: andl $3, %eax
; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
; AVX2-NEXT: andl $3, %ecx
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq (%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovapd {{.*#+}} ymm1 = [34,68,102,136]
@@ -1211,8 +1211,8 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
; AVX512-NEXT: andl $3, %eax
; AVX512-NEXT: vpextrd $1, %xmm1, %ecx
; AVX512-NEXT: andl $3, %ecx
-; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT: vmovq (%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
+; AVX512-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [17,51,85,119]
@@ -1234,8 +1234,8 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
; AVX512VL-NEXT: andl $3, %eax
; AVX512VL-NEXT: vpextrd $1, %xmm1, %ecx
; AVX512VL-NEXT: andl $3, %ecx
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512VL-NEXT: vmovq (%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
+; AVX512VL-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm2, %k1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} ymm0 = [34,68,102,136]
diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll
index 032ffb0..8878801 100644
--- a/llvm/test/CodeGen/X86/var-permute-512.ll
+++ b/llvm/test/CodeGen/X86/var-permute-512.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index 8dda271..364390a 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -259,6 +259,50 @@ define void @blendv_split(ptr %p, <8 x i32> %cond, <8 x i32> %a, <8 x i32> %x, <
ret void
}
+; Concatenate 128-bit pblendvb back together on AVX2+ targets (hidden by SSE __m128i bitcasts)
+define <4 x i64> @vselect_concat_split_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
+; AVX1-LABEL: vselect_concat_split_v16i8:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vselect_concat_split_v16i8:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vselect_concat_split_v16i8:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpternlogq $216, %ymm2, %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %a.bc = bitcast <4 x i64> %a to <32 x i8>
+ %b.bc = bitcast <4 x i64> %b to <32 x i8>
+ %c.bc = bitcast <4 x i64> %c to <32 x i8>
+ %d.bc = bitcast <4 x i64> %d to <32 x i8>
+ %cmp = icmp slt <32 x i8> %c.bc, %d.bc
+ %a.lo = shufflevector <32 x i8> %a.bc, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %b.lo = shufflevector <32 x i8> %b.bc, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %cmp.lo = shufflevector <32 x i1> %cmp, <32 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %lo = select <16 x i1> %cmp.lo, <16 x i8> %b.lo, <16 x i8> %a.lo
+ %a.hi = shufflevector <32 x i8> %a.bc, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %b.hi = shufflevector <32 x i8> %b.bc, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp.hi = shufflevector <32 x i1> %cmp, <32 x i1> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %hi = select <16 x i1> %cmp.hi, <16 x i8> %b.hi, <16 x i8> %a.hi
+ %concat = shufflevector <16 x i8> %lo, <16 x i8> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %result = bitcast <32 x i8> %concat to <4 x i64>
+ ret <4 x i64> %result
+}
+
; Regression test for rGea8fb3b60196
define void @vselect_concat() {
; AVX-LABEL: vselect_concat:
diff --git a/llvm/test/Demangle/ms-auto-templates.test b/llvm/test/Demangle/ms-auto-templates.test
index a90ffb6..414885d 100644
--- a/llvm/test/Demangle/ms-auto-templates.test
+++ b/llvm/test/Demangle/ms-auto-templates.test
@@ -55,3 +55,45 @@
??0?$AutoNTTPClass@$MH0A@$M_N0A@$MD0GB@@@QEAA@XZ
; CHECK: public: __cdecl AutoNTTPClass<0, 0, 97>::AutoNTTPClass<0, 0, 97>(void)
+
+??0?$AutoNTTPClass@$M$$T0A@@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<0>::AutoNTTPClass<0>(void)
+
+??0?$AutoNTTPClass@$0A@@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<0>::AutoNTTPClass<0>(void)
+
+??0?$AutoNTTPClass@$MP8S@@EAAXXZ1?f@1@QEAAXXZ@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<&public: void __cdecl S::f(void)>::AutoNTTPClass<&public: void __cdecl S::f(void)>(void)
+
+??0?$AutoNTTPClass@$1?f@S@@QEAAXXZ@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<&public: void __cdecl S::f(void)>::AutoNTTPClass<&public: void __cdecl S::f(void)>(void)
+
+??0?$AutoNTTPClass@$MP8M@@EAAXXZH?f@1@QEAAXXZA@@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<{public: void __cdecl M::f(void), 0}>::AutoNTTPClass<{public: void __cdecl M::f(void), 0}>(void)
+
+??0?$AutoNTTPClass@$H?f@M@@QEAAXXZA@@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<{public: void __cdecl M::f(void), 0}>::AutoNTTPClass<{public: void __cdecl M::f(void), 0}>(void)
+
+??0?$AutoNTTPClass@$MP8V@@EAAXXZI?f@1@QEAAXXZA@A@@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<{public: void __cdecl V::f(void), 0, 0}>::AutoNTTPClass<{public: void __cdecl V::f(void), 0, 0}>(void)
+
+??0?$AutoNTTPClass@$I?f@V@@QEAAXXZA@A@@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<{public: void __cdecl V::f(void), 0, 0}>::AutoNTTPClass<{public: void __cdecl V::f(void), 0, 0}>(void)
+
+??0?$AutoNTTPClass@$MPEQS@@H07@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<8>::AutoNTTPClass<8>(void)
+
+??0?$AutoNTTPClass@$07@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<8>::AutoNTTPClass<8>(void)
+
+??0?$AutoNTTPClass@$MPEQM@@H0M@@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<12>::AutoNTTPClass<12>(void)
+
+??0?$AutoNTTPClass@$0M@@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<12>::AutoNTTPClass<12>(void)
+
+??0?$AutoNTTPClass@$MPEQV@@HFBA@A@@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<{16, 0}>::AutoNTTPClass<{16, 0}>(void)
+
+??0?$AutoNTTPClass@$FBA@A@@@QEAA@XZ
+; CHECK: public: __cdecl AutoNTTPClass<{16, 0}>::AutoNTTPClass<{16, 0}>(void)
diff --git a/llvm/test/MC/X86/AMX/amx-error.s b/llvm/test/MC/X86/AMX/amx-error.s
new file mode 100644
index 0000000..ee2ac83
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/amx-error.s
@@ -0,0 +1,25 @@
+// RUN: not llvm-mc -triple x86_64 %s 2>&1 | FileCheck %s
+
+// CHECK: error: all tmm registers must be distinct
+tcmmimfp16ps %tmm0, %tmm0, %tmm0
+
+// CHECK: error: all tmm registers must be distinct
+tcmmrlfp16ps %tmm1, %tmm0, %tmm1
+
+// CHECK: error: all tmm registers must be distinct
+tdpbf16ps %tmm2, %tmm2, %tmm0
+
+// CHECK: error: all tmm registers must be distinct
+tdpfp16ps %tmm3, %tmm0, %tmm0
+
+// CHECK: error: all tmm registers must be distinct
+tdpbssd %tmm0, %tmm0, %tmm0
+
+// CHECK: error: all tmm registers must be distinct
+tdpbsud %tmm1, %tmm0, %tmm1
+
+// CHECK: error: all tmm registers must be distinct
+tdpbusd %tmm2, %tmm2, %tmm0
+
+// CHECK: error: all tmm registers must be distinct
+tdpbuud %tmm3, %tmm0, %tmm0
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll
index 0024b0a..caaed62 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll
@@ -220,3 +220,24 @@ define <2 x i16> @and_with_poison(<2 x i8> %a) {
%res = and <2 x i16> %zext, <i16 u0xff, i16 poison>
ret <2 x i16> %res
}
+
+
+
+define <4 x i64> @issue_97674_getConstantOnEdge(i1 %cond) {
+entry:
+ br i1 %cond, label %if.then, label %if.end
+
+if.then:
+ %folds = add <4 x i64> zeroinitializer, <i64 1, i64 1, i64 1, i64 1>
+ br label %if.end
+
+if.end:
+ %r = phi <4 x i64> [ %folds, %if.then ], [ zeroinitializer, %entry ]
+ ret <4 x i64> %r
+}
+
+define <4 x i64> @issue_97674_getConstant() {
+entry:
+ %folds = add <4 x i64> zeroinitializer, zeroinitializer
+ ret <4 x i64> %folds
+}
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-pmulh.ll b/llvm/test/Transforms/InstCombine/X86/x86-pmulh.ll
index 185ab46..947c7d3 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-pmulh.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-pmulh.ll
@@ -233,3 +233,40 @@ define <32 x i16> @elts_pmulh_512(<32 x i16> %a0, <32 x i16> %a1) {
%4 = shufflevector <32 x i16> %3, <32 x i16> poison, <32 x i32> zeroinitializer
ret <32 x i16> %4
}
+
+;
+; Known Bits
+;
+
+define <8 x i16> @known_pmulh_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
+; CHECK-LABEL: @known_pmulh_128(
+; CHECK-NEXT: ret <8 x i16> [[A2:%.*]]
+;
+ %x0 = lshr <8 x i16> %a0, <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %x1 = and <8 x i16> %a1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %m = tail call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %x0, <8 x i16> %x1)
+ %r = add <8 x i16> %m, %a2
+ ret <8 x i16> %r
+}
+
+define <16 x i16> @known_pmulh_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2) {
+; CHECK-LABEL: @known_pmulh_256(
+; CHECK-NEXT: ret <16 x i16> [[A2:%.*]]
+;
+ %x0 = lshr <16 x i16> %a0, <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %x1 = and <16 x i16> %a1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %m = tail call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %x0, <16 x i16> %x1)
+ %r = add <16 x i16> %m, %a2
+ ret <16 x i16> %r
+}
+
+define <32 x i16> @known_pmulh_512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> %a2) {
+; CHECK-LABEL: @known_pmulh_512(
+; CHECK-NEXT: ret <32 x i16> [[A2:%.*]]
+;
+ %x0 = lshr <32 x i16> %a0, <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %x1 = and <32 x i16> %a1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %m = tail call <32 x i16> @llvm.x86.avx512.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1)
+ %r = add <32 x i16> %m, %a2
+ ret <32 x i16> %r
+}
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-pmulhu.ll b/llvm/test/Transforms/InstCombine/X86/x86-pmulhu.ll
index b18833f..560969f 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-pmulhu.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-pmulhu.ll
@@ -227,3 +227,40 @@ define <32 x i16> @elts_pmulhu_512(<32 x i16> %a0, <32 x i16> %a1) {
%4 = shufflevector <32 x i16> %3, <32 x i16> poison, <32 x i32> zeroinitializer
ret <32 x i16> %4
}
+
+;
+; Known Bits
+;
+
+define <8 x i16> @known_pmulhu_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
+; CHECK-LABEL: @known_pmulhu_128(
+; CHECK-NEXT: ret <8 x i16> [[A2:%.*]]
+;
+ %x0 = lshr <8 x i16> %a0, <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %x1 = and <8 x i16> %a1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %m = tail call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %x0, <8 x i16> %x1)
+ %r = add <8 x i16> %m, %a2
+ ret <8 x i16> %r
+}
+
+define <16 x i16> @known_pmulhu_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2) {
+; CHECK-LABEL: @known_pmulhu_256(
+; CHECK-NEXT: ret <16 x i16> [[A2:%.*]]
+;
+ %x0 = lshr <16 x i16> %a0, <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %x1 = and <16 x i16> %a1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %m = tail call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %x0, <16 x i16> %x1)
+ %r = add <16 x i16> %m, %a2
+ ret <16 x i16> %r
+}
+
+define <32 x i16> @known_pmulhu_512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> %a2) {
+; CHECK-LABEL: @known_pmulhu_512(
+; CHECK-NEXT: ret <32 x i16> [[A2:%.*]]
+;
+ %x0 = lshr <32 x i16> %a0, <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %x1 = and <32 x i16> %a1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %m = tail call <32 x i16> @llvm.x86.avx512.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1)
+ %r = add <32 x i16> %m, %a2
+ ret <32 x i16> %r
+}
diff --git a/llvm/test/Transforms/LoopFlatten/preserving_debugloc_br.ll b/llvm/test/Transforms/LoopFlatten/preserving_debugloc_br.ll
new file mode 100644
index 0000000..eb15c3e
--- /dev/null
+++ b/llvm/test/Transforms/LoopFlatten/preserving_debugloc_br.ll
@@ -0,0 +1,74 @@
+; RUN: opt -S -passes="loop(loop-flatten)" < %s | FileCheck %s
+
+; Check that LoopFlatten's DoFlattenLoopPair() propagates the debug location of the
+; original terminator to the new branch instruction.
+
+define i32 @test1(i32 %val, ptr nocapture %A) !dbg !5 {
+; CHECK-LABEL: define i32 @test1(
+; CHECK-LABEL: for.body3:
+; CHECK: br label %for.inc6, !dbg [[DBG22:![0-9]+]]
+; CHECK-LABEL: for.inc6:
+;
+entry:
+ br label %for.body, !dbg !8
+
+for.body: ; preds = %for.inc6, %entry
+ %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ], !dbg !9
+ %mul = mul nuw nsw i32 %i.018, 20, !dbg !10
+ br label %for.body3, !dbg !11
+
+for.body3: ; preds = %for.body3, %for.body
+ %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ], !dbg !12
+ %add = add nuw nsw i32 %j.017, %mul, !dbg !13
+ %arrayidx = getelementptr inbounds i16, ptr %A, i32 %add, !dbg !14
+ %0 = load i16, ptr %arrayidx, align 2, !dbg !15
+ %conv16 = zext i16 %0 to i32, !dbg !16
+ %add4 = add i32 %conv16, %val, !dbg !17
+ %conv5 = trunc i32 %add4 to i16, !dbg !18
+ store i16 %conv5, ptr %arrayidx, align 2, !dbg !19
+ %inc = add nuw nsw i32 %j.017, 1, !dbg !20
+ %exitcond = icmp ne i32 %inc, 20, !dbg !21
+ br i1 %exitcond, label %for.body3, label %for.inc6, !dbg !22
+
+for.inc6: ; preds = %for.body3
+ %inc7 = add nuw nsw i32 %i.018, 1, !dbg !23
+ %exitcond19 = icmp ne i32 %inc7, 10, !dbg !24
+ br i1 %exitcond19, label %for.body, label %for.end8, !dbg !25
+
+for.end8: ; preds = %for.inc6
+ ret i32 10, !dbg !26
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+; CHECK: [[DBG22]] = !DILocation(line: 15,
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "temp.ll", directory: "/")
+!2 = !{i32 19}
+!3 = !{i32 0}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test1", linkageName: "test1", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 1, column: 1, scope: !5)
+!9 = !DILocation(line: 2, column: 1, scope: !5)
+!10 = !DILocation(line: 3, column: 1, scope: !5)
+!11 = !DILocation(line: 4, column: 1, scope: !5)
+!12 = !DILocation(line: 5, column: 1, scope: !5)
+!13 = !DILocation(line: 6, column: 1, scope: !5)
+!14 = !DILocation(line: 7, column: 1, scope: !5)
+!15 = !DILocation(line: 8, column: 1, scope: !5)
+!16 = !DILocation(line: 9, column: 1, scope: !5)
+!17 = !DILocation(line: 10, column: 1, scope: !5)
+!18 = !DILocation(line: 11, column: 1, scope: !5)
+!19 = !DILocation(line: 12, column: 1, scope: !5)
+!20 = !DILocation(line: 13, column: 1, scope: !5)
+!21 = !DILocation(line: 14, column: 1, scope: !5)
+!22 = !DILocation(line: 15, column: 1, scope: !5)
+!23 = !DILocation(line: 16, column: 1, scope: !5)
+!24 = !DILocation(line: 17, column: 1, scope: !5)
+!25 = !DILocation(line: 18, column: 1, scope: !5)
+!26 = !DILocation(line: 19, column: 1, scope: !5)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll
index 7353acd..1ab1908 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll
@@ -20,6 +20,7 @@ define ptr @foo(ptr %a0, ptr %a1, i64 %a2) {
; CHECK-NEXT: mv a3, a0
; CHECK-NEXT: .LBB0_3: # %do.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli zero, a4, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a1)
; CHECK-NEXT: vse8.v v8, (a3)
; CHECK-NEXT: add a3, a3, a4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
new file mode 100644
index 0000000..75907e0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-gnu"
+
+; Make sure we do not pick <vscale x 1 x i64> as VF for a loop with a
+; first-order recurrence.
+define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
+; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for(
+; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD1]] = load <4 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 3
+; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[SCALAR_RECUR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[RES]]
+;
+entry:
+ br label %loop
+
+loop:
+ %for = phi i64 [ 0, %entry ], [ %l, %loop ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv.next = add i64 %iv, 1
+ %gep = getelementptr inbounds i64, ptr %src, i64 %iv
+ %l = load i64, ptr %gep, align 8
+ %ec = icmp eq i64 %iv, 22
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ %res = phi i64 [ %for, %loop ]
+ ret i64 %res
+}
+
+attributes #0 = { "target-features"="+64bit,+v,+zvl128b,+zvl256b" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
new file mode 100644
index 0000000..314a133
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -scalable-vectorization=on -force-vector-width=1 -force-target-supports-scalable-vectors=true -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+
+
+define i64 @pr97452_scalable_vf1_for_live_out(ptr %src) {
+; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for_live_out(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ]
+; CHECK-NEXT: ret i64 [[RES]]
+;
+entry:
+ br label %loop
+
+loop:
+ %for = phi i64 [ 0, %entry ], [ %l, %loop ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv.next = add i64 %iv, 1
+ %gep = getelementptr inbounds i64, ptr %src, i64 %iv
+ %l = load i64, ptr %gep, align 8
+ %ec = icmp eq i64 %iv, 22
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ %res = phi i64 [ %for, %loop ]
+ ret i64 %res
+}
+
+
+define void @pr97452_scalable_vf1_for_no_live_out(ptr %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @pr97452_scalable_vf1_for_no_live_out(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[L]], ptr [[GEP_DST]], align 8
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %for = phi i64 [ 0, %entry ], [ %l, %loop ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv.next = add i64 %iv, 1
+ %gep = getelementptr inbounds i64, ptr %src, i64 %iv
+ %l = load i64, ptr %gep, align 8
+ %gep.dst = getelementptr inbounds i64, ptr %dst, i64 %iv
+ store i64 %l, ptr %gep.dst
+ %ec = icmp eq i64 %iv, 22
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LowerConstantIntrinsics/preserving-debugloc-br.ll b/llvm/test/Transforms/LowerConstantIntrinsics/preserving-debugloc-br.ll
new file mode 100644
index 0000000..0302c9d
--- /dev/null
+++ b/llvm/test/Transforms/LowerConstantIntrinsics/preserving-debugloc-br.ll
@@ -0,0 +1,48 @@
+; RUN: opt -S -passes=lower-constant-intrinsics < %s | FileCheck %s
+
+; Check that LowerConstantIntrinsics's replaceConditionalBranchesOnConstant() correctly
+; propagates the debug location from the old br instruction to the new one.
+
+; Function Attrs: nounwind
+define i32 @test_branch(i32 %in) !dbg !5 {
+; CHECK-LABEL: define i32 @test_branch(
+; CHECK: br label %[[FALSE:.*]], !dbg [[DBG8:![0-9]+]]
+; CHECK: [[FALSE]]:
+;
+ %v = call i1 @llvm.is.constant.i32(i32 %in), !dbg !8
+ br i1 %v, label %True, label %False, !dbg !9
+
+True: ; preds = %0
+ %call1 = tail call i32 @subfun_1(), !dbg !10
+ ret i32 %call1, !dbg !11
+
+False: ; preds = %0
+ %call2 = tail call i32 @subfun_2(), !dbg !12
+ ret i32 %call2, !dbg !13
+}
+
+declare i32 @subfun_1()
+declare i32 @subfun_2()
+
+declare i1 @llvm.is.constant.i32(i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+; CHECK: [[DBG8]] = !DILocation(line: 2,
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "main.ll", directory: "/")
+!2 = !{i32 6}
+!3 = !{i32 0}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test_branch", linkageName: "test_branch", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 1, column: 1, scope: !5)
+!9 = !DILocation(line: 2, column: 1, scope: !5)
+!10 = !DILocation(line: 3, column: 1, scope: !5)
+!11 = !DILocation(line: 4, column: 1, scope: !5)
+!12 = !DILocation(line: 5, column: 1, scope: !5)
+!13 = !DILocation(line: 6, column: 1, scope: !5)
diff --git a/llvm/test/Transforms/Mem2Reg/single-store.ll b/llvm/test/Transforms/Mem2Reg/single-store.ll
new file mode 100644
index 0000000..f864227
--- /dev/null
+++ b/llvm/test/Transforms/Mem2Reg/single-store.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=mem2reg < %s | FileCheck %s
+
+define i8 @single_store_literal_poison(i1 %cond) {
+; CHECK-LABEL: define i8 @single_store_literal_poison(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 undef
+;
+ %a = alloca i8, align 1
+ br i1 %cond, label %if, label %exit
+
+if:
+ store i8 poison, ptr %a, align 1
+ br label %exit
+
+exit:
+ %v = load i8, ptr %a, align 1
+ ret i8 %v
+}
+
+; FIXME: This is a miscompile.
+define i8 @single_store_maybe_poison(i1 %cond, i8 %x) {
+; CHECK-LABEL: define i8 @single_store_maybe_poison(
+; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]]) {
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[X]]
+;
+ %a = alloca i8, align 1
+ br i1 %cond, label %if, label %exit
+
+if:
+ store i8 %x, ptr %a, align 1
+ br label %exit
+
+exit:
+ %v = load i8, ptr %a, align 1
+ ret i8 %v
+}
+
+define i8 @single_store_cant_be_poison(i1 %cond, i8 noundef %x) {
+; CHECK-LABEL: define i8 @single_store_cant_be_poison(
+; CHECK-SAME: i1 [[COND:%.*]], i8 noundef [[X:%.*]]) {
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[X]]
+;
+ %a = alloca i8, align 1
+ br i1 %cond, label %if, label %exit
+
+if:
+ store i8 %x, ptr %a, align 1
+ br label %exit
+
+exit:
+ %v = load i8, ptr %a, align 1
+ ret i8 %v
+}
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
index 7dc0ba50..03fbb5e 100644
--- a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
@@ -37,14 +37,14 @@ declare <1 x i64> @llvm.vp.mul.v1i64(<1 x i64>, <1 x i64>, <1 x i1>, i32)
declare <4 x i64> @llvm.vp.mul.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
define <vscale x 1 x i64> @add_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @add_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = add i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @add_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = add i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @add_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -55,6 +55,15 @@ define <vscale x 1 x i64> @add_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @add_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.add.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -80,14 +89,14 @@ define <vscale x 1 x i64> @add_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
}
define <vscale x 1 x i64> @sub_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @sub_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = sub i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @sub_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = sub i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @sub_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -98,6 +107,15 @@ define <vscale x 1 x i64> @sub_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @sub_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.sub.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -123,14 +141,14 @@ define <vscale x 1 x i64> @sub_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
}
define <vscale x 1 x i64> @mul_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @mul_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = mul i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @mul_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = mul i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @mul_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -141,6 +159,15 @@ define <vscale x 1 x i64> @mul_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @mul_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -166,14 +193,14 @@ define <vscale x 1 x i64> @mul_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
}
define <vscale x 1 x i64> @sdiv_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @sdiv_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = sdiv i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @sdiv_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = sdiv i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @sdiv_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -184,6 +211,15 @@ define <vscale x 1 x i64> @sdiv_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @sdiv_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -230,14 +266,14 @@ define <vscale x 1 x i64> @sdiv_nxv1i64_unspeculatable(i64 %x, i64 %y, i32 zeroe
}
define <vscale x 1 x i64> @udiv_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @udiv_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = udiv i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @udiv_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = udiv i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @udiv_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -248,6 +284,15 @@ define <vscale x 1 x i64> @udiv_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @udiv_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -294,14 +339,14 @@ define <vscale x 1 x i64> @udiv_nxv1i64_unspeculatable(i64 %x, i64 %y, i32 zeroe
}
define <vscale x 1 x i64> @srem_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @srem_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = srem i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @srem_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = srem i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @srem_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -312,6 +357,15 @@ define <vscale x 1 x i64> @srem_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @srem_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -358,14 +412,14 @@ define <vscale x 1 x i64> @srem_nxv1i64_unspeculatable(i64 %x, i64 %y, i32 zeroe
}
define <vscale x 1 x i64> @urem_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @urem_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = urem i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @urem_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = urem i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @urem_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -376,6 +430,15 @@ define <vscale x 1 x i64> @urem_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @urem_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -598,14 +661,14 @@ define <vscale x 1 x i64> @urem_nxv1i64_anymask_knownvl(i64 %x, i64 %y, <vscale
}
define <vscale x 1 x i64> @ashr_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @ashr_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = ashr i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @ashr_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = ashr i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @ashr_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -616,6 +679,15 @@ define <vscale x 1 x i64> @ashr_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @ashr_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.ashr.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -641,14 +713,14 @@ define <vscale x 1 x i64> @ashr_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
}
define <vscale x 1 x i64> @lshr_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @lshr_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = lshr i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @lshr_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = lshr i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @lshr_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -659,6 +731,15 @@ define <vscale x 1 x i64> @lshr_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @lshr_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.lshr.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -684,14 +765,14 @@ define <vscale x 1 x i64> @lshr_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
}
define <vscale x 1 x i64> @shl_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @shl_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = shl i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @shl_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = shl i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @shl_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -702,6 +783,15 @@ define <vscale x 1 x i64> @shl_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @shl_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -727,14 +817,14 @@ define <vscale x 1 x i64> @shl_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
}
define <vscale x 1 x i64> @or_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @or_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = or i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @or_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = or i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @or_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -745,6 +835,15 @@ define <vscale x 1 x i64> @or_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y,
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @or_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.or.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -770,14 +869,14 @@ define <vscale x 1 x i64> @or_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <vs
}
define <vscale x 1 x i64> @and_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @and_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = and i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @and_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = and i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @and_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -788,6 +887,15 @@ define <vscale x 1 x i64> @and_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @and_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -813,14 +921,14 @@ define <vscale x 1 x i64> @and_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
}
define <vscale x 1 x i64> @xor_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @xor_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = xor i64 [[Y:%.*]], 42
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP3]]
+; VEC-COMBINE-64-LABEL: @xor_nxv1i64_allonesmask(
+; VEC-COMBINE-64-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-64-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP1:%.*]] = xor i64 [[Y:%.*]], 42
+; VEC-COMBINE-64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
+; VEC-COMBINE-64-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-64-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-64-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
; NO-VEC-COMBINE-LABEL: @xor_nxv1i64_allonesmask(
; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
@@ -831,6 +939,15 @@ define <vscale x 1 x i64> @xor_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
+; VEC-COMBINE-32-LABEL: @xor_nxv1i64_allonesmask(
+; VEC-COMBINE-32-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; VEC-COMBINE-32-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; VEC-COMBINE-32-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; VEC-COMBINE-32-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; VEC-COMBINE-32-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; VEC-COMBINE-32-NEXT: ret <vscale x 1 x i64> [[TMP4]]
+;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
@@ -1013,23 +1130,14 @@ define <vscale x 1 x i64> @umax_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
}
define <vscale x 1 x float> @fadd_nxv1f32_allonesmask(<vscale x 1 x float> %x, float %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @fadd_nxv1f32_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = fadd float [[Y:%.*]], 4.200000e+01
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @fadd_nxv1f32_allonesmask(
-; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
+; ALL-LABEL: @fadd_nxv1f32_allonesmask(
+; ALL-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1056,23 +1164,14 @@ define <vscale x 1 x float> @fadd_nxv1f32_anymask(<vscale x 1 x float> %x, float
}
define <vscale x 1 x float> @fsub_nxv1f32_allonesmask(<vscale x 1 x float> %x, float %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @fsub_nxv1f32_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = fsub float [[Y:%.*]], 4.200000e+01
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @fsub_nxv1f32_allonesmask(
-; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
+; ALL-LABEL: @fsub_nxv1f32_allonesmask(
+; ALL-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1099,23 +1198,14 @@ define <vscale x 1 x float> @fsub_nxv1f32_anymask(<vscale x 1 x float> %x, float
}
define <vscale x 1 x float> @fdiv_nxv1f32_allonesmask(<vscale x 1 x float> %x, float %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = fdiv float [[Y:%.*]], 4.200000e+01
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask(
-; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
+; ALL-LABEL: @fdiv_nxv1f32_allonesmask(
+; ALL-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1185,23 +1275,14 @@ define <vscale x 1 x float> @frem_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
}
define <vscale x 1 x float> @fdiv_nxv1f32_allonesmask_knownvl(<vscale x 1 x float> %x, float %y) {
-; VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask_knownvl(
-; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP1:%.*]] = fdiv float [[Y:%.*]], 4.200000e+01
-; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
-; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 4)
-; VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask_knownvl(
-; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 4)
-; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 4)
-; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
+; ALL-LABEL: @fdiv_nxv1f32_allonesmask_knownvl(
+; ALL-NEXT: [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 4)
+; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 4)
+; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
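
Note: the check-prefix split above (VEC-COMBINE-64 vs. VEC-COMBINE-32, with ALL covering the float cases) indicates the test is now checked under more than one RUN configuration, and the new VEC-COMBINE-32 lines show the i64 scalar operation staying in its vector-predicated form on that run. The folded form the VEC-COMBINE-64 lines rely on is that, under an all-ones mask, applying the operation lane-by-lane to two splats yields the same lanes as splatting the scalar result. A minimal standalone C++ sketch of that identity with toy values (illustration only, not the VectorCombine implementation):

  #include <cassert>
  #include <cstdint>
  #include <vector>

  int main() {
    const int64_t y = 7;
    const std::size_t evl = 8;                     // stand-in for the runtime EVL
    std::vector<int64_t> ys(evl, y), cs(evl, 42);  // splat(y), splat(42)
    std::vector<int64_t> lanewise(evl);
    for (std::size_t i = 0; i < evl; ++i)
      lanewise[i] = ys[i] * cs[i];                 // per-lane mul under an all-ones mask
    std::vector<int64_t> hoisted(evl, y * 42);     // scalar mul first, then splat
    assert(lanewise == hoisted);
    return 0;
  }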
diff --git a/llvm/test/tools/llvm-config/paths.test b/llvm/test/tools/llvm-config/paths.test
index 6fa43a1..419f155 100644
--- a/llvm/test/tools/llvm-config/paths.test
+++ b/llvm/test/tools/llvm-config/paths.test
@@ -1,21 +1,21 @@
# Check directory options for obvious issues.
RUN: llvm-config --bindir 2>&1 | FileCheck --check-prefix=CHECK-BINDIR %s
-CHECK-BINDIR: {{.*}}{{/|\\\\}}bin
+CHECK-BINDIR: {{.*}}{{/|\\}}bin
CHECK-BINDIR-NOT: error:
CHECK-BINDIR-NOT: warning
RUN: llvm-config --includedir 2>&1 | FileCheck --check-prefix=CHECK-INCLUDEDIR %s
-CHECK-INCLUDEDIR: {{.*}}{{/|\\\\}}include
+CHECK-INCLUDEDIR: {{.*}}{{/|\\}}include
CHECK-INCLUDEDIR-NOT: error:
CHECK-INCLUDEDIR-NOT: warning
RUN: llvm-config --libdir 2>&1 | FileCheck --check-prefix=CHECK-LIBDIR %s
-CHECK-LIBDIR: {{.*}}{{/|\\\\}}lib{{.*}}
+CHECK-LIBDIR: {{.*}}{{/|\\}}lib{{.*}}
CHECK-LIBDIR-NOT: error:
CHECK-LIBDIR-NOT: warning
RUN: llvm-config --cmakedir 2>&1 | FileCheck --check-prefix=CHECK-CMAKEDIR %s
-CHECK-CMAKEDIR: {{.*}}{{/|\\\\}}cmake{{/|\\\\}}llvm
+CHECK-CMAKEDIR: {{.*}}{{/|\\}}cmake{{/|\\}}llvm
CHECK-CMAKEDIR-NOT: error:
CHECK-CMAKEDIR-NOT: warning
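
Note: inside a FileCheck {{ }} block the text is a regular expression, so \\ matches a single literal backslash while \\\\ matches two in a row. The loosened patterns above therefore accept a plain Windows separator, e.g. the updated line

  CHECK-BINDIR: {{.*}}{{/|\\}}bin

matches both ".../bin" and "...\bin", consistent with the llvm-config.cpp revert below that returns to printing paths verbatim.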
diff --git a/llvm/tools/llvm-config/llvm-config.cpp b/llvm/tools/llvm-config/llvm-config.cpp
index db92d46..d5b76b1 100644
--- a/llvm/tools/llvm-config/llvm-config.cpp
+++ b/llvm/tools/llvm-config/llvm-config.cpp
@@ -24,7 +24,6 @@
#include "llvm/Config/config.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/Program.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/Triple.h"
@@ -327,7 +326,7 @@ int main(int argc, char **argv) {
// information.
std::string ActivePrefix, ActiveBinDir, ActiveIncludeDir, ActiveLibDir,
ActiveCMakeDir;
- std::vector<std::string> ActiveIncludeOptions;
+ std::string ActiveIncludeOption;
if (IsInDevelopmentTree) {
ActiveIncludeDir = std::string(LLVM_SRC_ROOT) + "/include";
ActivePrefix = CurrentExecPrefix;
@@ -353,8 +352,8 @@ int main(int argc, char **argv) {
}
// We need to include files from both the source and object trees.
- ActiveIncludeOptions.push_back(ActiveIncludeDir);
- ActiveIncludeOptions.push_back(ActiveObjRoot + "/include");
+ ActiveIncludeOption =
+ ("-I" + ActiveIncludeDir + " " + "-I" + ActiveObjRoot + "/include");
} else {
ActivePrefix = CurrentExecPrefix;
{
@@ -373,7 +372,7 @@ int main(int argc, char **argv) {
sys::fs::make_absolute(ActivePrefix, Path);
ActiveCMakeDir = std::string(Path);
}
- ActiveIncludeOptions.push_back(ActiveIncludeDir);
+ ActiveIncludeOption = "-I" + ActiveIncludeDir;
}
/// We only use `shared library` mode in cases where the static library form
@@ -402,8 +401,8 @@ int main(int argc, char **argv) {
std::replace(ActiveBinDir.begin(), ActiveBinDir.end(), '/', '\\');
std::replace(ActiveLibDir.begin(), ActiveLibDir.end(), '/', '\\');
std::replace(ActiveCMakeDir.begin(), ActiveCMakeDir.end(), '/', '\\');
- for (auto &Include : ActiveIncludeOptions)
- std::replace(Include.begin(), Include.end(), '/', '\\');
+ std::replace(ActiveIncludeOption.begin(), ActiveIncludeOption.end(), '/',
+ '\\');
}
SharedDir = ActiveBinDir;
StaticDir = ActiveLibDir;
@@ -505,20 +504,6 @@ int main(int argc, char **argv) {
};
raw_ostream &OS = outs();
-
- // Render include paths and associated flags
- auto RenderFlags = [&](StringRef Flags) {
- bool First = true;
- for (auto &Include : ActiveIncludeOptions) {
- if (!First)
- OS << ' ';
- OS << "-I";
- sys::printArg(OS, Include, /*Quote=*/true);
- First = false;
- }
- OS << ' ' << Flags << '\n';
- };
-
for (int i = 1; i != argc; ++i) {
StringRef Arg = argv[i];
@@ -527,30 +512,24 @@ int main(int argc, char **argv) {
if (Arg == "--version") {
OS << PACKAGE_VERSION << '\n';
} else if (Arg == "--prefix") {
- sys::printArg(OS, ActivePrefix, /*Quote=*/true);
- OS << '\n';
+ OS << ActivePrefix << '\n';
} else if (Arg == "--bindir") {
- sys::printArg(OS, ActiveBinDir, /*Quote=*/true);
- OS << '\n';
+ OS << ActiveBinDir << '\n';
} else if (Arg == "--includedir") {
- sys::printArg(OS, ActiveIncludeDir, /*Quote=*/true);
- OS << '\n';
+ OS << ActiveIncludeDir << '\n';
} else if (Arg == "--libdir") {
- sys::printArg(OS, ActiveLibDir, /*Quote=*/true);
- OS << '\n';
+ OS << ActiveLibDir << '\n';
} else if (Arg == "--cmakedir") {
- sys::printArg(OS, ActiveCMakeDir, /*Quote=*/true);
- OS << '\n';
+ OS << ActiveCMakeDir << '\n';
} else if (Arg == "--cppflags") {
- RenderFlags(LLVM_CPPFLAGS);
+ OS << ActiveIncludeOption << ' ' << LLVM_CPPFLAGS << '\n';
} else if (Arg == "--cflags") {
- RenderFlags(LLVM_CFLAGS);
+ OS << ActiveIncludeOption << ' ' << LLVM_CFLAGS << '\n';
} else if (Arg == "--cxxflags") {
- RenderFlags(LLVM_CXXFLAGS);
+ OS << ActiveIncludeOption << ' ' << LLVM_CXXFLAGS << '\n';
} else if (Arg == "--ldflags") {
- OS << ((HostTriple.isWindowsMSVCEnvironment()) ? "-LIBPATH:" : "-L");
- sys::printArg(OS, ActiveLibDir, /*Quote=*/true);
- OS << ' ' << LLVM_LDFLAGS << '\n';
+ OS << ((HostTriple.isWindowsMSVCEnvironment()) ? "-LIBPATH:" : "-L")
+ << ActiveLibDir << ' ' << LLVM_LDFLAGS << '\n';
} else if (Arg == "--system-libs") {
PrintSystemLibs = true;
} else if (Arg == "--libs") {
@@ -611,8 +590,7 @@ int main(int argc, char **argv) {
} else if (Arg == "--shared-mode") {
PrintSharedMode = true;
} else if (Arg == "--obj-root") {
- sys::printArg(OS, ActivePrefix, /*Quote=*/true);
- OS << '\n';
+ OS << ActivePrefix << '\n';
} else if (Arg == "--ignore-libllvm") {
LinkDyLib = false;
LinkMode = BuiltSharedLibs ? LinkModeShared : LinkModeAuto;
@@ -717,30 +695,26 @@ int main(int argc, char **argv) {
auto PrintForLib = [&](const StringRef &Lib) {
const bool Shared = LinkMode == LinkModeShared;
- std::string LibFileName;
if (PrintLibNames) {
- LibFileName = GetComponentLibraryFileName(Lib, Shared);
+ OS << GetComponentLibraryFileName(Lib, Shared);
} else if (PrintLibFiles) {
- LibFileName = GetComponentLibraryPath(Lib, Shared);
+ OS << GetComponentLibraryPath(Lib, Shared);
} else if (PrintLibs) {
// On Windows, output full path to library without parameters.
// Elsewhere, if this is a typical library name, include it using -l.
if (HostTriple.isWindowsMSVCEnvironment()) {
- LibFileName = GetComponentLibraryPath(Lib, Shared);
+ OS << GetComponentLibraryPath(Lib, Shared);
} else {
- OS << "-l";
StringRef LibName;
if (GetComponentLibraryNameSlice(Lib, LibName)) {
// Extract library name (remove prefix and suffix).
- LibFileName = LibName;
+ OS << "-l" << LibName;
} else {
// Lib is already a library name without prefix and suffix.
- LibFileName = Lib;
+ OS << "-l" << Lib;
}
}
}
- if (!LibFileName.empty())
- sys::printArg(OS, LibFileName, /*Quote=*/true);
};
if (LinkMode == LinkModeShared && LinkDyLib) {
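
Note: the hunks above revert the quoted-output path, replacing sys::printArg(OS, X, /*Quote=*/true) with plain streaming. A hedged sketch of the observable difference, using a made-up prefix containing a space purely for illustration (this is not llvm-config code, just the two printing styles side by side):

  #include "llvm/Support/Program.h"
  #include "llvm/Support/raw_ostream.h"
  #include <string>

  int main() {
    std::string Prefix = "C:/Program Files/LLVM";        // assumed example path with a space
    llvm::outs() << Prefix << '\n';                      // verbatim output, as after this patch
    llvm::sys::printArg(llvm::outs(), Prefix, /*Quote=*/true); // quoted form being removed
    llvm::outs() << '\n';
    return 0;
  }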
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 97d09d6..d1c3dcb 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -68,11 +68,6 @@ std::string FormatExtensionFlags(int64_t Flags) {
std::string FormatExtensionFlags(AArch64::ExtensionBitset Flags) {
std::vector<StringRef> Features;
-
- // AEK_NONE is not meant to be shown to the user so the target parser
- // does not recognise it. It is relevant here though.
- if (Flags.test(AArch64::AEK_NONE))
- Features.push_back("none");
AArch64::getExtensionFeatures(Flags, Features);
// The target parser also includes every extension you don't have.
@@ -2009,10 +2004,9 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
for (auto Ext : Extensions)
ExtVal.set(Ext);
- // NONE has no feature names.
- // We return True here because NONE is a valid choice.
- EXPECT_TRUE(AArch64::getExtensionFeatures({AArch64::AEK_NONE}, Features));
- EXPECT_TRUE(!Features.size());
+ // Test an empty set of features.
+ EXPECT_TRUE(AArch64::getExtensionFeatures({}, Features));
+ EXPECT_TRUE(Features.size() == 0);
AArch64::getExtensionFeatures(ExtVal, Features);
EXPECT_EQ(Extensions.size(), Features.size());
@@ -2092,8 +2086,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
EXPECT_TRUE(llvm::is_contained(Features, "+complxnum"));
// Assuming we listed every extension above, this should produce the same
- // result. (note that AEK_NONE doesn't have a name so it won't be in the
- // result despite its bit being set)
+ // result.
std::vector<StringRef> AllFeatures;
EXPECT_TRUE(AArch64::getExtensionFeatures(ExtVal, AllFeatures));
EXPECT_THAT(Features, ::testing::ContainerEq(AllFeatures));
diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
index 1435696..a4b2502 100644
--- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
@@ -78,12 +78,10 @@ static void EmitARMTargetDef(RecordKeeper &RK, raw_ostream &OS) {
// Emit the ArchExtKind enum
OS << "#ifdef EMIT_ARCHEXTKIND_ENUM\n"
- << "enum ArchExtKind : unsigned {\n"
- << " AEK_NONE = 1,\n";
+ << "enum ArchExtKind : unsigned {\n";
for (const Record *Rec : SortedExtensions) {
auto AEK = Rec->getValueAsString("ArchExtKindSpelling").upper();
- if (AEK != "AEK_NONE")
- OS << " " << AEK << ",\n";
+ OS << " " << AEK << ",\n";
}
OS << " AEK_NUM_EXTENSIONS\n"
<< "};\n"
@@ -108,7 +106,6 @@ static void EmitARMTargetDef(RecordKeeper &RK, raw_ostream &OS) {
OS << ", \"-" << Rec->getValueAsString("Name") << "\""; // negfeature
OS << "},\n";
};
- OS << " {\"none\", {}, AArch64::AEK_NONE, {}, {}, {}, {} },\n";
OS << "};\n"
<< "#undef EMIT_EXTENSIONS\n"
<< "#endif // EMIT_EXTENSIONS\n"
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
index bbf2b84..a4de4f0 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
@@ -58,6 +58,7 @@ static_library("bugprone") {
"NotNullTerminatedResultCheck.cpp",
"OptionalValueConversionCheck.cpp",
"ParentVirtualCallCheck.cpp",
+ "PointerArithmeticOnPolymorphicObjectCheck.cpp",
"PosixReturnCheck.cpp",
"RedundantBranchConditionCheck.cpp",
"ReservedIdentifierCheck.cpp",
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index fad234a..a344add 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -271,7 +271,7 @@ structured_op: !LinalgStructuredOpConfig
--- !LinalgOpConfig
metadata: !LinalgOpMetadata
name: negf
- cpp_class_name: NegfOp
+ cpp_class_name: NegFOp
doc: |-
Applies negf(x) elementwise.
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
index e4955fe..0eefe06 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
@@ -81,6 +81,10 @@ struct DoacrossClauseOps {
IntegerAttr doacrossNumLoopsAttr;
};
+struct FilterClauseOps {
+ Value filteredThreadIdVar;
+};
+
struct FinalClauseOps {
Value finalVar;
};
@@ -254,8 +258,7 @@ using DistributeClauseOps =
using LoopNestClauseOps = detail::Clauses<CollapseClauseOps, LoopRelatedOps>;
-// TODO `filter` clause.
-using MaskedClauseOps = detail::Clauses<>;
+using MaskedClauseOps = detail::Clauses<FilterClauseOps>;
using OrderedOpClauseOps = detail::Clauses<DoacrossClauseOps>;
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td
index 1fa6edb..99150bc 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td
@@ -1204,4 +1204,32 @@ class OpenMP_UseDevicePtrClauseSkip<
def OpenMP_UseDevicePtrClause : OpenMP_UseDevicePtrClauseSkip<>;
+//===----------------------------------------------------------------------===//
+// V5.2: [10.5.1] `filter` clause
+//===----------------------------------------------------------------------===//
+
+class OpenMP_FilterClauseSkip<
+ bit traits = false, bit arguments = false, bit assemblyFormat = false,
+ bit description = false, bit extraClassDeclaration = false
+ > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
+ description, extraClassDeclaration> {
+ let arguments = (ins
+ Optional<IntLikeType>:$filtered_thread_id
+ );
+
+ let assemblyFormat = [{
+ `filter` `(` $filtered_thread_id `:` type($filtered_thread_id) `)`
+ }];
+
+ let description = [{
+ If `filter` is specified, the masked construct restricts execution of the
+ region to the thread whose thread id matches the filter value. Other threads
+ executing the parallel region are not expected to execute the region
+ specified within the `masked` directive. If `filter` is not specified, the
+ master thread is expected to execute the region enclosed within the `masked`
+ directive.
+ }];
+}
+
+def OpenMP_FilterClause : OpenMP_FilterClauseSkip<>;
+
#endif // OPENMP_CLAUSES
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 99e14cd..1a1ca5e 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -1577,4 +1577,21 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove,
let hasRegionVerifier = 1;
}
+//===----------------------------------------------------------------------===//
+// [Spec 5.2] 10.5 masked Construct
+//===----------------------------------------------------------------------===//
+def MaskedOp : OpenMP_Op<"masked", clauses = [
+ OpenMP_FilterClause
+ ], singleRegion = 1> {
+ let summary = "masked construct";
+ let description = [{
+ The masked construct allows specifying a structured block that is executed by
+ a subset of threads of the current team.
+ }] # clausesDescription;
+
+ let builders = [
+ OpBuilder<(ins CArg<"const MaskedClauseOps &">:$clauses)>
+ ];
+}
+
#endif // OPENMP_OPS
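
For orientation, a minimal sketch of how the new `omp.masked` op is expected to print, mirroring the `ops.mlir` test added further down in this patch (SSA names are illustrative):

```mlir
func.func @masked_sketch(%tid : i32) {
  // With a filter clause, only the thread whose id matches %tid executes.
  omp.masked filter(%tid : i32) {
    omp.terminator
  }
  // Without a filter clause, the master thread executes the region.
  omp.masked {
    omp.terminator
  }
  return
}
```
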
diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index bf95fbe..f35ea962 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -303,8 +303,7 @@ def ForallOp : SCF_Op<"forall", [
DeclareOpInterfaceMethods<LoopLikeOpInterface,
["getInitsMutable", "getRegionIterArgs", "getLoopInductionVars",
"getLoopLowerBounds", "getLoopUpperBounds", "getLoopSteps",
- "replaceWithAdditionalYields", "promoteIfSingleIteration",
- "yieldTiledValuesAndReplace"]>,
+ "promoteIfSingleIteration", "yieldTiledValuesAndReplace"]>,
RecursiveMemoryEffects,
SingleBlockImplicitTerminator<"scf::InParallelOp">,
DeclareOpInterfaceMethods<RegionBranchOpInterface>,
diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
index 6a40304..de807c3 100644
--- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
@@ -181,16 +181,6 @@ Loops tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes);
void getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
scf::ForOp root);
-//===----------------------------------------------------------------------===//
-// Fusion related helpers
-//===----------------------------------------------------------------------===//
-
-/// Check structural compatibility between two loops such as iteration space
-/// and dominance.
-bool checkFusionStructuralLegality(LoopLikeOpInterface target,
- LoopLikeOpInterface source,
- Diagnostic &diag);
-
/// Given two scf.forall loops, `target` and `source`, fuses `target` into
/// `source`. Assumes that the given loops are siblings and are independent of
/// each other.
@@ -212,16 +202,6 @@ scf::ForallOp fuseIndependentSiblingForallLoops(scf::ForallOp target,
scf::ForOp fuseIndependentSiblingForLoops(scf::ForOp target, scf::ForOp source,
RewriterBase &rewriter);
-/// Given two scf.parallel loops, `target` and `source`, fuses `target` into
-/// `source`. Assumes that the given loops are siblings and are independent of
-/// each other.
-///
-/// This function does not perform any legality checks and simply fuses the
-/// loops. The caller is responsible for ensuring that the loops are legal to
-/// fuse.
-scf::ParallelOp fuseIndependentSiblingParallelLoops(scf::ParallelOp target,
- scf::ParallelOp source,
- RewriterBase &rewriter);
} // namespace mlir
#endif // MLIR_DIALECT_SCF_UTILS_UTILS_H_
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 097e5e6..44efb7f 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -3017,6 +3017,31 @@ def Vector_ScanOp :
let hasVerifier = 1;
}
+//===----------------------------------------------------------------------===//
+// VectorStepOp
+//===----------------------------------------------------------------------===//
+
+def Vector_StepOp : Vector_Op<"step", [Pure]> {
+ let summary = "A linear sequence of values from 0 to N";
+ let description = [{
+ A `step` operation produces an index vector, i.e. a 1-D vector of values of
+ index type that represents a linear sequence from 0 to N-1, where N is the
+ number of elements in the `result` vector.
+
+ Supports fixed-width and scalable vectors.
+
+ Examples:
+
+ ```mlir
+ %0 = vector.step : vector<4xindex> ; [0, 1, 2, 3]
+ %1 = vector.step : vector<[4]xindex> ; [0, 1, .., <vscale * 4 - 1>]
+ ```
+ }];
+ let hasFolder = 1;
+ let results = (outs VectorOfRankAndType<[1], [Index]>:$result);
+ let assemblyFormat = "attr-dict `:` type($result)";
+}
+
def Vector_YieldOp : Vector_Op<"yield", [
Pure, ReturnLike, Terminator]> {
let summary = "Terminates and yields values from vector regions.";
diff --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.h b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
index d08e097..9925fc6 100644
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.h
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
@@ -90,24 +90,4 @@ struct JamBlockGatherer {
/// Include the generated interface declarations.
#include "mlir/Interfaces/LoopLikeInterface.h.inc"
-namespace mlir {
-/// A function that rewrites `target`'s terminator as a teminator obtained by
-/// fusing `source` into `target`.
-using FuseTerminatorFn =
- function_ref<void(RewriterBase &rewriter, LoopLikeOpInterface source,
- LoopLikeOpInterface &target, IRMapping mapping)>;
-
-/// Returns a fused `LoopLikeOpInterface` created by fusing `source` to
-/// `target`. The `NewYieldValuesFn` callback is used to pass to the
-/// `replaceWithAdditionalYields` interface method to replace the loop with a
-/// new loop with (possibly) additional yields, while the `FuseTerminatorFn`
-/// callback is repsonsible for updating the fused loop terminator.
-LoopLikeOpInterface createFused(LoopLikeOpInterface target,
- LoopLikeOpInterface source,
- RewriterBase &rewriter,
- NewYieldValuesFn newYieldValuesFn,
- FuseTerminatorFn fuseTerminatorFn);
-
-} // namespace mlir
-
#endif // MLIR_INTERFACES_LOOPLIKEINTERFACE_H_
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 0eac552..6a8a9d8 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1860,6 +1860,19 @@ struct VectorFromElementsLowering
}
};
+/// Conversion pattern for vector.step.
+struct VectorStepOpLowering : public ConvertOpToLLVMPattern<vector::StepOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::StepOp stepOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Type llvmType = typeConverter->convertType(stepOp.getType());
+ rewriter.replaceOpWithNewOp<LLVM::StepVectorOp>(stepOp, llvmType);
+ return success();
+ }
+};
+
} // namespace
/// Populate the given list with patterns that convert from Vector to LLVM.
@@ -1885,8 +1898,8 @@ void mlir::populateVectorToLLVMConversionPatterns(
VectorSplatOpLowering, VectorSplatNdOpLowering,
VectorScalableInsertOpLowering, VectorScalableExtractOpLowering,
MaskedReductionOpConversion, VectorInterleaveOpLowering,
- VectorDeinterleaveOpLowering, VectorFromElementsLowering>(
- converter);
+ VectorDeinterleaveOpLowering, VectorFromElementsLowering,
+ VectorStepOpLowering>(converter);
// Transfer ops with rank > 1 are handled by VectorToSCF.
populateVectorTransferLoweringPatterns(patterns, /*maxTransferRank=*/1);
}
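
The effect of the new pattern, sketched on the scalable case exercised by the `vector-to-llvm.mlir` test added below (the cast back to `index` is inserted by the conversion infrastructure):

```mlir
// Before:
//   %0 = vector.step : vector<[4]xindex>
// After conversion (sketch):
%step = llvm.intr.experimental.stepvector : vector<[4]xi64>
%0 = builtin.unrealized_conversion_cast %step : vector<[4]xi64> to vector<[4]xindex>
```
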
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index abbd857..23f291b 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -2578,6 +2578,15 @@ LogicalResult PrivateClauseOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// Spec 5.2: Masked construct (10.5)
+//===----------------------------------------------------------------------===//
+
+void MaskedOp::build(OpBuilder &builder, OperationState &state,
+ const MaskedClauseOps &clauses) {
+ MaskedOp::build(builder, state, clauses.filteredThreadIdVar);
+}
+
#define GET_ATTRDEF_CLASSES
#include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc"
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index cb15e0e..907d7f7 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -618,44 +618,6 @@ void ForOp::getSuccessorRegions(RegionBranchPoint point,
SmallVector<Region *> ForallOp::getLoopRegions() { return {&getRegion()}; }
-FailureOr<LoopLikeOpInterface> ForallOp::replaceWithAdditionalYields(
- RewriterBase &rewriter, ValueRange newInitOperands,
- bool replaceInitOperandUsesInLoop,
- const NewYieldValuesFn &newYieldValuesFn) {
- // Create a new loop before the existing one, with the extra operands.
- OpBuilder::InsertionGuard g(rewriter);
- rewriter.setInsertionPoint(getOperation());
- SmallVector<Value> inits(getOutputs());
- llvm::append_range(inits, newInitOperands);
- scf::ForallOp newLoop = rewriter.create<scf::ForallOp>(
- getLoc(), getMixedLowerBound(), getMixedUpperBound(), getMixedStep(),
- inits, getMapping(),
- /*bodyBuilderFn =*/[](OpBuilder &, Location, ValueRange) {});
-
- // Move the loop body to the new op.
- rewriter.mergeBlocks(getBody(), newLoop.getBody(),
- newLoop.getBody()->getArguments().take_front(
- getBody()->getNumArguments()));
-
- if (replaceInitOperandUsesInLoop) {
- // Replace all uses of `newInitOperands` with the corresponding basic block
- // arguments.
- for (auto &&[newOperand, oldOperand] :
- llvm::zip(newInitOperands, newLoop.getBody()->getArguments().take_back(
- newInitOperands.size()))) {
- rewriter.replaceUsesWithIf(newOperand, oldOperand, [&](OpOperand &use) {
- Operation *user = use.getOwner();
- return newLoop->isProperAncestor(user);
- });
- }
- }
-
- // Replace the old loop.
- rewriter.replaceOp(getOperation(),
- newLoop->getResults().take_front(getNumResults()));
- return cast<LoopLikeOpInterface>(newLoop.getOperation());
-}
-
/// Promotes the loop body of a forallOp to its containing block if it can be
/// determined that the loop has a single iteration.
LogicalResult scf::ForallOp::promoteIfSingleIteration(RewriterBase &rewriter) {
diff --git a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp
index 41834fe..56ff270 100644
--- a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp
+++ b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp
@@ -261,10 +261,8 @@ loopScheduling(scf::ForOp forOp,
return 1;
};
- std::optional<int64_t> ubConstant =
- getConstantIntValue(forOp.getUpperBound());
- std::optional<int64_t> lbConstant =
- getConstantIntValue(forOp.getLowerBound());
+ std::optional<int64_t> ubConstant = getConstantIntValue(forOp.getUpperBound());
+ std::optional<int64_t> lbConstant = getConstantIntValue(forOp.getLowerBound());
DenseMap<Operation *, unsigned> opCycles;
std::map<unsigned, std::vector<Operation *>> wrappedSchedule;
for (Operation &op : forOp.getBody()->getOperations()) {
@@ -449,6 +447,113 @@ void transform::TakeAssumedBranchOp::getEffects(
// LoopFuseSiblingOp
//===----------------------------------------------------------------------===//
+/// Check if `target` and `source` are siblings, in the context that `target`
+/// is being fused into `source`.
+///
+/// This simply checks that both operations are in the same block and performs
+/// a few checks to ensure that the fused IR does not violate dominance.
+static DiagnosedSilenceableFailure isOpSibling(Operation *target,
+ Operation *source) {
+ // Check if both operations are the same.
+ if (target == source)
+ return emitSilenceableFailure(source)
+ << "target and source need to be different loops";
+
+ // Check if both operations are in the same block.
+ if (target->getBlock() != source->getBlock())
+ return emitSilenceableFailure(source)
+ << "target and source are not in the same block";
+
+ // Check if fusion will violate dominance.
+ DominanceInfo domInfo(source);
+ if (target->isBeforeInBlock(source)) {
+ // Since `target` is before `source`, all users of results of `target`
+ // need to be dominated by `source`.
+ for (Operation *user : target->getUsers()) {
+ if (!domInfo.properlyDominates(source, user, /*enclosingOpOk=*/false)) {
+ return emitSilenceableFailure(target)
+ << "user of results of target should be properly dominated by "
+ "source";
+ }
+ }
+ } else {
+ // Since `target` is after `source`, all values used by `target` need
+ // to dominate `source`.
+
+ // Check if operands of `target` are dominated by `source`.
+ for (Value operand : target->getOperands()) {
+ Operation *operandOp = operand.getDefiningOp();
+ // Operands without defining operations are block arguments. When `target`
+ // and `source` occur in the same block, these operands dominate `source`.
+ if (!operandOp)
+ continue;
+
+ // Operand's defining operation should properly dominate `source`.
+ if (!domInfo.properlyDominates(operandOp, source,
+ /*enclosingOpOk=*/false))
+ return emitSilenceableFailure(target)
+ << "operands of target should be properly dominated by source";
+ }
+
+ // Check if values used by `target` are dominated by `source`.
+ bool failed = false;
+ OpOperand *failedValue = nullptr;
+ visitUsedValuesDefinedAbove(target->getRegions(), [&](OpOperand *operand) {
+ Operation *operandOp = operand->get().getDefiningOp();
+ if (operandOp && !domInfo.properlyDominates(operandOp, source,
+ /*enclosingOpOk=*/false)) {
+ // `operand` is not an argument of an enclosing block and the defining
+ // op of `operand` is outside `target` but does not dominate `source`.
+ failed = true;
+ failedValue = operand;
+ }
+ });
+
+ if (failed)
+ return emitSilenceableFailure(failedValue->getOwner())
+ << "values used inside regions of target should be properly "
+ "dominated by source";
+ }
+
+ return DiagnosedSilenceableFailure::success();
+}
+
+/// Check if `target` scf.forall can be fused into `source` scf.forall.
+///
+/// This simply checks if both loops have the same bounds, steps and mapping.
+/// No attempt is made at checking that the side effects of `target` and
+/// `source` are independent of each other.
+static bool isForallWithIdenticalConfiguration(Operation *target,
+ Operation *source) {
+ auto targetOp = dyn_cast<scf::ForallOp>(target);
+ auto sourceOp = dyn_cast<scf::ForallOp>(source);
+ if (!targetOp || !sourceOp)
+ return false;
+
+ return targetOp.getMixedLowerBound() == sourceOp.getMixedLowerBound() &&
+ targetOp.getMixedUpperBound() == sourceOp.getMixedUpperBound() &&
+ targetOp.getMixedStep() == sourceOp.getMixedStep() &&
+ targetOp.getMapping() == sourceOp.getMapping();
+}
+
+/// Check if `target` scf.for can be fused into `source` scf.for.
+///
+/// This simply checks if both loops have the same bounds and steps. No attempt
+/// is made at checking that the side effects of `target` and `source` are
+/// independent of each other.
+static bool isForWithIdenticalConfiguration(Operation *target,
+ Operation *source) {
+ auto targetOp = dyn_cast<scf::ForOp>(target);
+ auto sourceOp = dyn_cast<scf::ForOp>(source);
+ if (!targetOp || !sourceOp)
+ return false;
+
+ return targetOp.getLowerBound() == sourceOp.getLowerBound() &&
+ targetOp.getUpperBound() == sourceOp.getUpperBound() &&
+ targetOp.getStep() == sourceOp.getStep();
+}
+
DiagnosedSilenceableFailure
transform::LoopFuseSiblingOp::apply(transform::TransformRewriter &rewriter,
transform::TransformResults &results,
@@ -464,32 +569,25 @@ transform::LoopFuseSiblingOp::apply(transform::TransformRewriter &rewriter,
<< "source handle (got " << llvm::range_size(sourceOps) << ")";
}
- auto target = dyn_cast<LoopLikeOpInterface>(*targetOps.begin());
- auto source = dyn_cast<LoopLikeOpInterface>(*sourceOps.begin());
- if (!target || !source)
- return emitSilenceableFailure(target->getLoc())
- << "target or source is not a loop op";
+ Operation *target = *targetOps.begin();
+ Operation *source = *sourceOps.begin();
- // Check if loops can be fused
- Diagnostic diag(target.getLoc(), DiagnosticSeverity::Error);
- if (!mlir::checkFusionStructuralLegality(target, source, diag))
- return DiagnosedSilenceableFailure::silenceableFailure(std::move(diag));
+ // Check if the target and source are siblings.
+ DiagnosedSilenceableFailure diag = isOpSibling(target, source);
+ if (!diag.succeeded())
+ return diag;
Operation *fusedLoop;
- // TODO: Support fusion for loop-like ops besides scf.for, scf.forall
- // and scf.parallel.
- if (isa<scf::ForOp>(target) && isa<scf::ForOp>(source)) {
+ /// TODO: Support fusion for loop-like ops besides scf.for and scf.forall.
+ if (isForWithIdenticalConfiguration(target, source)) {
fusedLoop = fuseIndependentSiblingForLoops(
cast<scf::ForOp>(target), cast<scf::ForOp>(source), rewriter);
- } else if (isa<scf::ForallOp>(target) && isa<scf::ForallOp>(source)) {
+ } else if (isForallWithIdenticalConfiguration(target, source)) {
fusedLoop = fuseIndependentSiblingForallLoops(
cast<scf::ForallOp>(target), cast<scf::ForallOp>(source), rewriter);
- } else if (isa<scf::ParallelOp>(target) && isa<scf::ParallelOp>(source)) {
- fusedLoop = fuseIndependentSiblingParallelLoops(
- cast<scf::ParallelOp>(target), cast<scf::ParallelOp>(source), rewriter);
} else
return emitSilenceableFailure(target->getLoc())
- << "unsupported loop type for fusion";
+ << "operations cannot be fused";
assert(fusedLoop && "failed to fuse operations");
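
Taken together, the new checks accept sibling loops of the same kind with identical bounds and steps (and mapping, for `scf.forall`). A hedged sketch of a pair of `scf.for` loops that `isForWithIdenticalConfiguration` would accept (the ops and names are illustrative):

```mlir
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
// Same lower bound, upper bound and step, so the loops are fusion candidates
// (subject to the dominance checks in isOpSibling above).
scf.for %i = %c0 to %c4 step %c1 {
  "test.op_a"(%i) : (index) -> ()
}
scf.for %i = %c0 to %c4 step %c1 {
  "test.op_b"(%i) : (index) -> ()
}
```
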
diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
index b775f98..5934d85 100644
--- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
@@ -16,7 +16,6 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
-#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/OpDefinition.h"
@@ -38,6 +37,24 @@ static bool hasNestedParallelOp(ParallelOp ploop) {
return walkResult.wasInterrupted();
}
+/// Verify equal iteration spaces.
+static bool equalIterationSpaces(ParallelOp firstPloop,
+ ParallelOp secondPloop) {
+ if (firstPloop.getNumLoops() != secondPloop.getNumLoops())
+ return false;
+
+ auto matchOperands = [&](const OperandRange &lhs,
+ const OperandRange &rhs) -> bool {
+ // TODO: Extend this to support aliases and equal constants.
+ return std::equal(lhs.begin(), lhs.end(), rhs.begin());
+ };
+ return matchOperands(firstPloop.getLowerBound(),
+ secondPloop.getLowerBound()) &&
+ matchOperands(firstPloop.getUpperBound(),
+ secondPloop.getUpperBound()) &&
+ matchOperands(firstPloop.getStep(), secondPloop.getStep());
+}
+
/// Checks if the parallel loops have mixed access to the same buffers. Returns
/// `true` if the first parallel loop writes to the same indices that the second
/// loop reads.
@@ -136,10 +153,9 @@ verifyDependencies(ParallelOp firstPloop, ParallelOp secondPloop,
static bool isFusionLegal(ParallelOp firstPloop, ParallelOp secondPloop,
const IRMapping &firstToSecondPloopIndices,
llvm::function_ref<bool(Value, Value)> mayAlias) {
- Diagnostic diag(firstPloop.getLoc(), DiagnosticSeverity::Remark);
return !hasNestedParallelOp(firstPloop) &&
!hasNestedParallelOp(secondPloop) &&
- checkFusionStructuralLegality(firstPloop, secondPloop, diag) &&
+ equalIterationSpaces(firstPloop, secondPloop) &&
succeeded(verifyDependencies(firstPloop, secondPloop,
firstToSecondPloopIndices, mayAlias));
}
@@ -158,9 +174,61 @@ static void fuseIfLegal(ParallelOp firstPloop, ParallelOp &secondPloop,
mayAlias))
return;
- IRRewriter rewriter(builder);
- secondPloop = mlir::fuseIndependentSiblingParallelLoops(
- firstPloop, secondPloop, rewriter);
+ DominanceInfo dom;
+ // We are fusing the first loop into the second, so make sure there are no
+ // users of the first loop's results between the two loops.
+ for (Operation *user : firstPloop->getUsers())
+ if (!dom.properlyDominates(secondPloop, user, /*enclosingOpOk*/ false))
+ return;
+
+ ValueRange inits1 = firstPloop.getInitVals();
+ ValueRange inits2 = secondPloop.getInitVals();
+
+ SmallVector<Value> newInitVars(inits1.begin(), inits1.end());
+ newInitVars.append(inits2.begin(), inits2.end());
+
+ IRRewriter b(builder);
+ b.setInsertionPoint(secondPloop);
+ auto newSecondPloop = b.create<ParallelOp>(
+ secondPloop.getLoc(), secondPloop.getLowerBound(),
+ secondPloop.getUpperBound(), secondPloop.getStep(), newInitVars);
+
+ Block *newBlock = newSecondPloop.getBody();
+ auto term1 = cast<ReduceOp>(block1->getTerminator());
+ auto term2 = cast<ReduceOp>(block2->getTerminator());
+
+ b.inlineBlockBefore(block2, newBlock, newBlock->begin(),
+ newBlock->getArguments());
+ b.inlineBlockBefore(block1, newBlock, newBlock->begin(),
+ newBlock->getArguments());
+
+ ValueRange results = newSecondPloop.getResults();
+ if (!results.empty()) {
+ b.setInsertionPointToEnd(newBlock);
+
+ ValueRange reduceArgs1 = term1.getOperands();
+ ValueRange reduceArgs2 = term2.getOperands();
+ SmallVector<Value> newReduceArgs(reduceArgs1.begin(), reduceArgs1.end());
+ newReduceArgs.append(reduceArgs2.begin(), reduceArgs2.end());
+
+ auto newReduceOp = b.create<scf::ReduceOp>(term2.getLoc(), newReduceArgs);
+
+ for (auto &&[i, reg] : llvm::enumerate(llvm::concat<Region>(
+ term1.getReductions(), term2.getReductions()))) {
+ Block &oldRedBlock = reg.front();
+ Block &newRedBlock = newReduceOp.getReductions()[i].front();
+ b.inlineBlockBefore(&oldRedBlock, &newRedBlock, newRedBlock.begin(),
+ newRedBlock.getArguments());
+ }
+
+ firstPloop.replaceAllUsesWith(results.take_front(inits1.size()));
+ secondPloop.replaceAllUsesWith(results.take_back(inits2.size()));
+ }
+ term1->erase();
+ term2->erase();
+ firstPloop.erase();
+ secondPloop.erase();
+ secondPloop = newSecondPloop;
}
void mlir::scf::naivelyFuseParallelOps(
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index abfc9a1..c0ee9d2 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -17,7 +17,6 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/Dominance.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/PatternMatch.h"
@@ -1263,131 +1262,54 @@ TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
return tileLoops;
}
-//===----------------------------------------------------------------------===//
-// Fusion related helpers
-//===----------------------------------------------------------------------===//
-
-/// Check if `target` and `source` are siblings, in the context that `target`
-/// is being fused into `source`.
-///
-/// This is a simple check that just checks if both operations are in the same
-/// block and some checks to ensure that the fused IR does not violate
-/// dominance.
-static bool isOpSibling(Operation *target, Operation *source,
- Diagnostic &diag) {
- // Check if both operations are same.
- if (target == source) {
- diag << "target and source need to be different loops";
- return false;
- }
-
- // Check if both operations are in the same block.
- if (target->getBlock() != source->getBlock()) {
- diag << "target and source are not in the same block";
- return false;
- }
-
- // Check if fusion will violate dominance.
- DominanceInfo domInfo(source);
- if (target->isBeforeInBlock(source)) {
- // Since `target` is before `source`, all users of results of `target`
- // need to be dominated by `source`.
- for (Operation *user : target->getUsers()) {
- if (!domInfo.properlyDominates(source, user, /*enclosingOpOk=*/false)) {
- diag << "user of results of target should "
- "be properly dominated by source";
- return false;
- }
- }
- } else {
- // Since `target` is after `source`, all values used by `target` need
- // to dominate `source`.
-
- // Check if operands of `target` are dominated by `source`.
- for (Value operand : target->getOperands()) {
- Operation *operandOp = operand.getDefiningOp();
- // Operands without defining operations are block arguments. When `target`
- // and `source` occur in the same block, these operands dominate `source`.
- if (!operandOp)
- continue;
-
- // Operand's defining operation should properly dominate `source`.
- if (!domInfo.properlyDominates(operandOp, source,
- /*enclosingOpOk=*/false)) {
- diag << "operands of target should be properly dominated by source";
- return false;
- }
- }
-
- // Check if values used by `target` are dominated by `source`.
- bool failed = false;
- OpOperand *failedValue = nullptr;
- visitUsedValuesDefinedAbove(target->getRegions(), [&](OpOperand *operand) {
- Operation *operandOp = operand->get().getDefiningOp();
- if (operandOp && !domInfo.properlyDominates(operandOp, source,
- /*enclosingOpOk=*/false)) {
- // `operand` is not an argument of an enclosing block and the defining
- // op of `operand` is outside `target` but does not dominate `source`.
- failed = true;
- failedValue = operand;
- }
- });
-
- if (failed) {
- diag << "values used inside regions of target should be properly "
- "dominated by source";
- diag.attachNote(failedValue->getOwner()->getLoc()) << "see operation";
- return false;
- }
- }
-
- return true;
-}
-
-bool mlir::checkFusionStructuralLegality(LoopLikeOpInterface target,
- LoopLikeOpInterface source,
- Diagnostic &diag) {
- if (target->getName() != source->getName()) {
- diag << "target and source must be same loop type";
- return false;
- }
-
- bool iterSpaceEq =
- target.getLoopLowerBounds() == source.getLoopLowerBounds() &&
- target.getLoopUpperBounds() == source.getLoopUpperBounds() &&
- target.getLoopSteps() == source.getLoopSteps();
- // TODO: Decouple checks on concrete loop types and move this function
- // somewhere for general utility for `LoopLikeOpInterface`
- if (auto forAllTarget = dyn_cast<scf::ForallOp>(*target))
- iterSpaceEq = iterSpaceEq && forAllTarget.getMapping() ==
- cast<scf::ForallOp>(*source).getMapping();
- if (!iterSpaceEq) {
- diag << "target and source iteration spaces must be equal";
- return false;
- }
- return isOpSibling(target, source, diag);
-}
-
scf::ForallOp mlir::fuseIndependentSiblingForallLoops(scf::ForallOp target,
scf::ForallOp source,
RewriterBase &rewriter) {
- scf::ForallOp fusedLoop = cast<scf::ForallOp>(createFused(
- target, source, rewriter,
- [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBBArgs) {
- // `ForallOp` does not have yields, rather an `InParallelOp` terminator.
- return ValueRange{};
- },
- [&](RewriterBase &b, LoopLikeOpInterface source,
- LoopLikeOpInterface &target, IRMapping mapping) {
- auto sourceForall = cast<scf::ForallOp>(source);
- auto targetForall = cast<scf::ForallOp>(target);
- scf::InParallelOp fusedTerm = targetForall.getTerminator();
- b.setInsertionPointToEnd(fusedTerm.getBody());
- for (Operation &op : sourceForall.getTerminator().getYieldingOps())
- b.clone(op, mapping);
- }));
- rewriter.replaceOp(source,
- fusedLoop.getResults().take_back(source.getNumResults()));
+ unsigned numTargetOuts = target.getNumResults();
+ unsigned numSourceOuts = source.getNumResults();
+
+ // Create fused shared_outs.
+ SmallVector<Value> fusedOuts;
+ llvm::append_range(fusedOuts, target.getOutputs());
+ llvm::append_range(fusedOuts, source.getOutputs());
+
+ // Create a new scf.forall op after the source loop.
+ rewriter.setInsertionPointAfter(source);
+ scf::ForallOp fusedLoop = rewriter.create<scf::ForallOp>(
+ source.getLoc(), source.getMixedLowerBound(), source.getMixedUpperBound(),
+ source.getMixedStep(), fusedOuts, source.getMapping());
+
+ // Map control operands.
+ IRMapping mapping;
+ mapping.map(target.getInductionVars(), fusedLoop.getInductionVars());
+ mapping.map(source.getInductionVars(), fusedLoop.getInductionVars());
+
+ // Map shared outs.
+ mapping.map(target.getRegionIterArgs(),
+ fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
+ mapping.map(source.getRegionIterArgs(),
+ fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
+
+ // Append everything except the terminator into the fused operation.
+ rewriter.setInsertionPointToStart(fusedLoop.getBody());
+ for (Operation &op : target.getBody()->without_terminator())
+ rewriter.clone(op, mapping);
+ for (Operation &op : source.getBody()->without_terminator())
+ rewriter.clone(op, mapping);
+
+ // Fuse the old terminator in_parallel ops into the new one.
+ scf::InParallelOp targetTerm = target.getTerminator();
+ scf::InParallelOp sourceTerm = source.getTerminator();
+ scf::InParallelOp fusedTerm = fusedLoop.getTerminator();
+ rewriter.setInsertionPointToStart(fusedTerm.getBody());
+ for (Operation &op : targetTerm.getYieldingOps())
+ rewriter.clone(op, mapping);
+ for (Operation &op : sourceTerm.getYieldingOps())
+ rewriter.clone(op, mapping);
+
+ // Replace old loops by substituting their uses by results of the fused loop.
+ rewriter.replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
+ rewriter.replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
return fusedLoop;
}
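
A hypothetical call site for the rewritten helper; it assumes the caller has already verified (as the transform op above does) that `target` and `source` are independent siblings with identical bounds, steps and mapping:

```cpp
// Sketch only; `target` and `source` are assumed to be scf::ForallOp values
// obtained elsewhere.
IRRewriter rewriter(target.getContext());
scf::ForallOp fused =
    mlir::fuseIndependentSiblingForallLoops(target, source, rewriter);
// `target` and `source` have been replaced; `fused` carries both result sets.
```
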
@@ -1395,74 +1317,49 @@ scf::ForallOp mlir::fuseIndependentSiblingForallLoops(scf::ForallOp target,
scf::ForOp mlir::fuseIndependentSiblingForLoops(scf::ForOp target,
scf::ForOp source,
RewriterBase &rewriter) {
- scf::ForOp fusedLoop = cast<scf::ForOp>(createFused(
- target, source, rewriter,
- [&](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBBArgs) {
- return source.getYieldedValues();
- },
- [&](RewriterBase &b, LoopLikeOpInterface source,
- LoopLikeOpInterface &target, IRMapping mapping) {
- auto targetFor = cast<scf::ForOp>(target);
- auto newTerm = b.clone(*targetFor.getBody()->getTerminator(), mapping);
- b.replaceOp(targetFor.getBody()->getTerminator(), newTerm);
- }));
- rewriter.replaceOp(source,
- fusedLoop.getResults().take_back(source.getNumResults()));
- return fusedLoop;
-}
-
-// TODO: Finish refactoring this a la the above, but likely requires additional
-// interface methods.
-scf::ParallelOp mlir::fuseIndependentSiblingParallelLoops(
- scf::ParallelOp target, scf::ParallelOp source, RewriterBase &rewriter) {
- OpBuilder::InsertionGuard guard(rewriter);
- Block *block1 = target.getBody();
- Block *block2 = source.getBody();
- auto term1 = cast<scf::ReduceOp>(block1->getTerminator());
- auto term2 = cast<scf::ReduceOp>(block2->getTerminator());
-
- ValueRange inits1 = target.getInitVals();
- ValueRange inits2 = source.getInitVals();
-
- SmallVector<Value> newInitVars(inits1.begin(), inits1.end());
- newInitVars.append(inits2.begin(), inits2.end());
-
- rewriter.setInsertionPoint(source);
- auto fusedLoop = rewriter.create<scf::ParallelOp>(
- rewriter.getFusedLoc(target.getLoc(), source.getLoc()),
- source.getLowerBound(), source.getUpperBound(), source.getStep(),
- newInitVars);
- Block *newBlock = fusedLoop.getBody();
- rewriter.inlineBlockBefore(block2, newBlock, newBlock->begin(),
- newBlock->getArguments());
- rewriter.inlineBlockBefore(block1, newBlock, newBlock->begin(),
- newBlock->getArguments());
-
- ValueRange results = fusedLoop.getResults();
- if (!results.empty()) {
- rewriter.setInsertionPointToEnd(newBlock);
-
- ValueRange reduceArgs1 = term1.getOperands();
- ValueRange reduceArgs2 = term2.getOperands();
- SmallVector<Value> newReduceArgs(reduceArgs1.begin(), reduceArgs1.end());
- newReduceArgs.append(reduceArgs2.begin(), reduceArgs2.end());
-
- auto newReduceOp = rewriter.create<scf::ReduceOp>(
- rewriter.getFusedLoc(term1.getLoc(), term2.getLoc()), newReduceArgs);
-
- for (auto &&[i, reg] : llvm::enumerate(llvm::concat<Region>(
- term1.getReductions(), term2.getReductions()))) {
- Block &oldRedBlock = reg.front();
- Block &newRedBlock = newReduceOp.getReductions()[i].front();
- rewriter.inlineBlockBefore(&oldRedBlock, &newRedBlock,
- newRedBlock.begin(),
- newRedBlock.getArguments());
- }
- }
- rewriter.replaceOp(target, results.take_front(inits1.size()));
- rewriter.replaceOp(source, results.take_back(inits2.size()));
- rewriter.eraseOp(term1);
- rewriter.eraseOp(term2);
+ unsigned numTargetOuts = target.getNumResults();
+ unsigned numSourceOuts = source.getNumResults();
+
+ // Create fused init_args, with target's init_args before source's init_args.
+ SmallVector<Value> fusedInitArgs;
+ llvm::append_range(fusedInitArgs, target.getInitArgs());
+ llvm::append_range(fusedInitArgs, source.getInitArgs());
+
+ // Create a new scf.for op after the source loop (the builder adds an empty
+ // scf.yield terminator only when init_args is empty).
+ rewriter.setInsertionPointAfter(source);
+ scf::ForOp fusedLoop = rewriter.create<scf::ForOp>(
+ source.getLoc(), source.getLowerBound(), source.getUpperBound(),
+ source.getStep(), fusedInitArgs);
+
+ // Map original induction variables and operands to those of the fused loop.
+ IRMapping mapping;
+ mapping.map(target.getInductionVar(), fusedLoop.getInductionVar());
+ mapping.map(target.getRegionIterArgs(),
+ fusedLoop.getRegionIterArgs().take_front(numTargetOuts));
+ mapping.map(source.getInductionVar(), fusedLoop.getInductionVar());
+ mapping.map(source.getRegionIterArgs(),
+ fusedLoop.getRegionIterArgs().take_back(numSourceOuts));
+
+ // Merge target's body into the new (fused) for loop and then source's body.
+ rewriter.setInsertionPointToStart(fusedLoop.getBody());
+ for (Operation &op : target.getBody()->without_terminator())
+ rewriter.clone(op, mapping);
+ for (Operation &op : source.getBody()->without_terminator())
+ rewriter.clone(op, mapping);
+
+ // Build fused yield results by appropriately mapping original yield operands.
+ SmallVector<Value> yieldResults;
+ for (Value operand : target.getBody()->getTerminator()->getOperands())
+ yieldResults.push_back(mapping.lookupOrDefault(operand));
+ for (Value operand : source.getBody()->getTerminator()->getOperands())
+ yieldResults.push_back(mapping.lookupOrDefault(operand));
+ if (!yieldResults.empty())
+ rewriter.create<scf::YieldOp>(source.getLoc(), yieldResults);
+
+ // Replace old loops by substituting their uses by results of the fused loop.
+ rewriter.replaceOp(target, fusedLoop.getResults().take_front(numTargetOuts));
+ rewriter.replaceOp(source, fusedLoop.getResults().take_back(numSourceOuts));
return fusedLoop;
}
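
A before/after sketch of what the rewritten `fuseIndependentSiblingForLoops` produces (values such as `%a`, `%b`, `%x`, `%y` and the bound constants are assumed to be defined earlier; names are illustrative):

```mlir
// Before: two independent sibling loops with identical bounds and step.
%sum = scf.for %i = %c0 to %c4 step %c1 iter_args(%acc = %a) -> (f32) {
  %n = arith.addf %acc, %x : f32
  scf.yield %n : f32
}
%prod = scf.for %i = %c0 to %c4 step %c1 iter_args(%acc = %b) -> (f32) {
  %n = arith.mulf %acc, %y : f32
  scf.yield %n : f32
}
// After: one loop carrying both iter_args, with target's body cloned before
// source's and a single combined scf.yield.
%fused:2 = scf.for %i = %c0 to %c4 step %c1
    iter_args(%acc0 = %a, %acc1 = %b) -> (f32, f32) {
  %n0 = arith.addf %acc0, %x : f32
  %n1 = arith.mulf %acc1, %y : f32
  scf.yield %n0, %n1 : f32, f32
}
```
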
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 149723f..53a6648 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -6313,6 +6313,20 @@ OpFoldResult SplatOp::fold(FoldAdaptor adaptor) {
}
//===----------------------------------------------------------------------===//
+// StepOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult StepOp::fold(FoldAdaptor adaptor) {
+ auto resultType = cast<VectorType>(getType());
+ if (resultType.isScalable())
+ return nullptr;
+ SmallVector<APInt> indices;
+ for (unsigned i = 0; i < resultType.getNumElements(); i++)
+ indices.push_back(APInt(/*width=*/64, i));
+ return DenseElementsAttr::get(resultType, indices);
+}
+
+//===----------------------------------------------------------------------===//
// WarpExecuteOnLane0Op
//===----------------------------------------------------------------------===//
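
The folder only fires for fixed-length vectors; a sketch of what canonicalization turns the op into (the scalable case is left to the LLVM lowering added earlier in this patch):

```mlir
// Before folding:
//   %0 = vector.step : vector<4xindex>
// After folding:
%0 = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
```
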
diff --git a/mlir/lib/Interfaces/LoopLikeInterface.cpp b/mlir/lib/Interfaces/LoopLikeInterface.cpp
index 5a119a7..1e0e87b 100644
--- a/mlir/lib/Interfaces/LoopLikeInterface.cpp
+++ b/mlir/lib/Interfaces/LoopLikeInterface.cpp
@@ -8,8 +8,6 @@
#include "mlir/Interfaces/LoopLikeInterface.h"
-#include "mlir/IR/IRMapping.h"
-#include "mlir/IR/PatternMatch.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "llvm/ADT/DenseSet.h"
@@ -115,60 +113,3 @@ LogicalResult detail::verifyLoopLikeOpInterface(Operation *op) {
return success();
}
-
-LoopLikeOpInterface mlir::createFused(LoopLikeOpInterface target,
- LoopLikeOpInterface source,
- RewriterBase &rewriter,
- NewYieldValuesFn newYieldValuesFn,
- FuseTerminatorFn fuseTerminatorFn) {
- auto targetIterArgs = target.getRegionIterArgs();
- std::optional<SmallVector<Value>> targetInductionVar =
- target.getLoopInductionVars();
- SmallVector<Value> targetYieldOperands(target.getYieldedValues());
- auto sourceIterArgs = source.getRegionIterArgs();
- std::optional<SmallVector<Value>> sourceInductionVar =
- *source.getLoopInductionVars();
- SmallVector<Value> sourceYieldOperands(source.getYieldedValues());
- auto sourceRegion = source.getLoopRegions().front();
-
- FailureOr<LoopLikeOpInterface> maybeFusedLoop =
- target.replaceWithAdditionalYields(rewriter, source.getInits(),
- /*replaceInitOperandUsesInLoop=*/false,
- newYieldValuesFn);
- if (failed(maybeFusedLoop))
- llvm_unreachable("failed to replace loop");
- LoopLikeOpInterface fusedLoop = *maybeFusedLoop;
- // Since the target op is rewritten at the original's location, we move it to
- // the soure op's location.
- rewriter.moveOpBefore(fusedLoop, source);
-
- // Map control operands.
- IRMapping mapping;
- std::optional<SmallVector<Value>> fusedInductionVar =
- fusedLoop.getLoopInductionVars();
- if (fusedInductionVar) {
- if (!targetInductionVar || !sourceInductionVar)
- llvm_unreachable(
- "expected target and source loops to have induction vars");
- mapping.map(*targetInductionVar, *fusedInductionVar);
- mapping.map(*sourceInductionVar, *fusedInductionVar);
- }
- mapping.map(targetIterArgs,
- fusedLoop.getRegionIterArgs().take_front(targetIterArgs.size()));
- mapping.map(targetYieldOperands,
- fusedLoop.getYieldedValues().take_front(targetIterArgs.size()));
- mapping.map(sourceIterArgs,
- fusedLoop.getRegionIterArgs().take_back(sourceIterArgs.size()));
- mapping.map(sourceYieldOperands,
- fusedLoop.getYieldedValues().take_back(sourceIterArgs.size()));
- // Append everything except the terminator into the fused operation.
- rewriter.setInsertionPoint(
- fusedLoop.getLoopRegions().front()->front().getTerminator());
- for (Operation &op : sourceRegion->front().without_terminator())
- rewriter.clone(op, mapping);
-
- // TODO: Replace with corresponding interface method if added
- fuseTerminatorFn(rewriter, source, fusedLoop, mapping);
-
- return fusedLoop;
-}
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index 43410aa..cbb2d1c 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -96,7 +96,7 @@ def floor(
O[None] = UnaryFn.floor(I[None])
-@linalg_structured_op
+@linalg_structured_op(op_class_name="NegFOp")
def negf(
I=TensorDef(T1),
O=TensorDef(T1, output=True),
@@ -330,7 +330,7 @@ def min(
O[None] = BinaryFn.min_signed(lhs[None], rhs[None])
-@linalg_structured_op
+@linalg_structured_op(op_class_name="PowFOp")
def powf(
lhs=TensorDef(T1),
rhs=TensorDef(T1),
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 09b7970..5f2d280 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -2621,3 +2621,14 @@ func.func @vector_from_elements_0d(%a: f32) -> vector<f32> {
%0 = vector.from_elements %a : vector<f32>
return %0 : vector<f32>
}
+
+// -----
+
+// CHECK-LABEL: @vector_step_scalable
+// CHECK: %[[STEPVECTOR:.*]] = llvm.intr.experimental.stepvector : vector<[4]xi64>
+// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[STEPVECTOR]] : vector<[4]xi64> to vector<[4]xindex>
+// CHECK: return %[[CAST]] : vector<[4]xindex>
+func.func @vector_step_scalable() -> vector<[4]xindex> {
+ %0 = vector.step : vector<[4]xindex>
+ return %0 : vector<[4]xindex>
+}
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index 2915963..6a04b9e 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -2358,3 +2358,21 @@ func.func @byref_in_private(%arg0: index) {
return
}
+
+// -----
+func.func @masked_arg_type_mismatch(%arg0: f32) {
+ // expected-error @below {{'omp.masked' op operand #0 must be integer or index, but got 'f32'}}
+ "omp.masked"(%arg0) ({
+ omp.terminator
+ }) : (f32) -> ()
+ return
+}
+
+// -----
+func.func @masked_arg_count_mismatch(%arg0: i32, %arg1: i32) {
+ // expected-error @below {{'omp.masked' op operand group starting at #0 requires 0 or 1 element, but found 2}}
+ "omp.masked"(%arg0, %arg1) ({
+ omp.terminator
+ }) : (i32, i32) -> ()
+ return
+}
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index eb28384..d6b655d 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -16,6 +16,20 @@ func.func @omp_master() -> () {
return
}
+// CHECK-LABEL: omp_masked
+func.func @omp_masked(%filtered_thread_id : i32) -> () {
+ // CHECK: omp.masked filter(%{{.*}} : i32)
+ "omp.masked" (%filtered_thread_id) ({
+ omp.terminator
+ }) : (i32) -> ()
+
+ // CHECK: omp.masked
+ "omp.masked" () ({
+ omp.terminator
+ }) : () -> ()
+ return
+}
+
func.func @omp_taskwait() -> () {
// CHECK: omp.taskwait
omp.taskwait
diff --git a/mlir/test/Dialect/SCF/transform-loop-fuse-sibling.mlir b/mlir/test/Dialect/SCF/transform-loop-fuse-sibling.mlir
index f8246b7..54dd2bd 100644
--- a/mlir/test/Dialect/SCF/transform-loop-fuse-sibling.mlir
+++ b/mlir/test/Dialect/SCF/transform-loop-fuse-sibling.mlir
@@ -47,169 +47,6 @@ module attributes {transform.with_named_sequence} {
// -----
-// CHECK-LABEL: func @fuse_two_parallel
-// CHECK-SAME: ([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}) {
-func.func @fuse_two_parallel(%A: memref<2x2xf32>, %B: memref<2x2xf32>) {
-// CHECK-DAG: [[C2:%.*]] = arith.constant 2 : index
-// CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
-// CHECK-DAG: [[C1:%.*]] = arith.constant 1 : index
-// CHECK-DAG: [[C1FP:%.*]] = arith.constant 1.
- %c2 = arith.constant 2 : index
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c1fp = arith.constant 1.0 : f32
-// CHECK: [[SUM:%.*]] = memref.alloc()
- %sum = memref.alloc() : memref<2x2xf32>
-// CHECK: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]])
-// CHECK-SAME: to ([[C2]], [[C2]]) step ([[C1]], [[C1]]) {
-// CHECK: [[B_ELEM:%.*]] = memref.load [[B]]{{\[}}[[I]], [[J]]]
-// CHECK: [[SUM_ELEM:%.*]] = arith.addf [[B_ELEM]], [[C1FP]]
-// CHECK: memref.store [[SUM_ELEM]], [[SUM]]{{\[}}[[I]], [[J]]]
-// CHECK-NOT: scf.parallel
-// CHECK: [[SUM_ELEM_:%.*]] = memref.load [[SUM]]{{\[}}[[I]], [[J]]]
-// CHECK: [[A_ELEM:%.*]] = memref.load [[A]]{{\[}}[[I]], [[J]]]
-// CHECK: [[PRODUCT_ELEM:%.*]] = arith.mulf [[SUM_ELEM_]], [[A_ELEM]]
-// CHECK: memref.store [[PRODUCT_ELEM]], [[B]]{{\[}}[[I]], [[J]]]
-// CHECK: scf.reduce
-// CHECK: }
- scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
- %B_elem = memref.load %B[%i, %j] : memref<2x2xf32>
- %sum_elem = arith.addf %B_elem, %c1fp : f32
- memref.store %sum_elem, %sum[%i, %j] : memref<2x2xf32>
- scf.reduce
- }
- scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
- %sum_elem = memref.load %sum[%i, %j] : memref<2x2xf32>
- %A_elem = memref.load %A[%i, %j] : memref<2x2xf32>
- %product_elem = arith.mulf %sum_elem, %A_elem : f32
- memref.store %product_elem, %B[%i, %j] : memref<2x2xf32>
- scf.reduce
- }
-// CHECK: memref.dealloc [[SUM]]
- memref.dealloc %sum : memref<2x2xf32>
- return
-}
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["scf.parallel"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- %parallel:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %fused = transform.loop.fuse_sibling %parallel#0 into %parallel#1 : (!transform.any_op,!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-// CHECK-LABEL: func @fuse_two_parallel_reverse
-// CHECK-SAME: ([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}) {
-func.func @fuse_two_parallel_reverse(%A: memref<2x2xf32>, %B: memref<2x2xf32>) {
-// CHECK-DAG: [[C2:%.*]] = arith.constant 2 : index
-// CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
-// CHECK-DAG: [[C1:%.*]] = arith.constant 1 : index
-// CHECK-DAG: [[C1FP:%.*]] = arith.constant 1.
- %c2 = arith.constant 2 : index
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c1fp = arith.constant 1.0 : f32
-// CHECK: [[SUM:%.*]] = memref.alloc()
- %sum = memref.alloc() : memref<2x2xf32>
-// CHECK: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]])
-// CHECK-SAME: to ([[C2]], [[C2]]) step ([[C1]], [[C1]]) {
-// CHECK: [[SUM_ELEM_:%.*]] = memref.load [[SUM]]{{\[}}[[I]], [[J]]]
-// CHECK: [[A_ELEM:%.*]] = memref.load [[A]]{{\[}}[[I]], [[J]]]
-// CHECK: [[PRODUCT_ELEM:%.*]] = arith.mulf [[SUM_ELEM_]], [[A_ELEM]]
-// CHECK: memref.store [[PRODUCT_ELEM]], [[B]]{{\[}}[[I]], [[J]]]
-// CHECK-NOT: scf.parallel
-// CHECK: [[B_ELEM:%.*]] = memref.load [[B]]{{\[}}[[I]], [[J]]]
-// CHECK: [[SUM_ELEM:%.*]] = arith.addf [[B_ELEM]], [[C1FP]]
-// CHECK: memref.store [[SUM_ELEM]], [[SUM]]{{\[}}[[I]], [[J]]]
-// CHECK: scf.reduce
-// CHECK: }
- scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
- %B_elem = memref.load %B[%i, %j] : memref<2x2xf32>
- %sum_elem = arith.addf %B_elem, %c1fp : f32
- memref.store %sum_elem, %sum[%i, %j] : memref<2x2xf32>
- scf.reduce
- }
- scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
- %sum_elem = memref.load %sum[%i, %j] : memref<2x2xf32>
- %A_elem = memref.load %A[%i, %j] : memref<2x2xf32>
- %product_elem = arith.mulf %sum_elem, %A_elem : f32
- memref.store %product_elem, %B[%i, %j] : memref<2x2xf32>
- scf.reduce
- }
-// CHECK: memref.dealloc [[SUM]]
- memref.dealloc %sum : memref<2x2xf32>
- return
-}
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["scf.parallel"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- %parallel:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %fused = transform.loop.fuse_sibling %parallel#1 into %parallel#0 : (!transform.any_op,!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-// CHECK-LABEL: func @fuse_reductions_two
-// CHECK-SAME: (%[[A:.*]]: memref<2x2xf32>, %[[B:.*]]: memref<2x2xf32>) -> (f32, f32)
-func.func @fuse_reductions_two(%A: memref<2x2xf32>, %B: memref<2x2xf32>) -> (f32, f32) {
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[INIT1:.*]] = arith.constant 1.000000e+00 : f32
-// CHECK-DAG: %[[INIT2:.*]] = arith.constant 2.000000e+00 : f32
-// CHECK: %[[RES:.*]]:2 = scf.parallel (%[[I:.*]], %[[J:.*]]) = (%[[C0]], %[[C0]])
-// CHECK-SAME: to (%[[C2]], %[[C2]]) step (%[[C1]], %[[C1]])
-// CHECK-SAME: init (%[[INIT1]], %[[INIT2]]) -> (f32, f32)
-// CHECK: %[[VAL_A:.*]] = memref.load %[[A]][%[[I]], %[[J]]]
-// CHECK: %[[VAL_B:.*]] = memref.load %[[B]][%[[I]], %[[J]]]
-// CHECK: scf.reduce(%[[VAL_A]], %[[VAL_B]] : f32, f32) {
-// CHECK: ^bb0(%[[LHS:.*]]: f32, %[[RHS:.*]]: f32):
-// CHECK: %[[R:.*]] = arith.addf %[[LHS]], %[[RHS]] : f32
-// CHECK: scf.reduce.return %[[R]] : f32
-// CHECK: }
-// CHECK: ^bb0(%[[LHS:.*]]: f32, %[[RHS:.*]]: f32):
-// CHECK: %[[R:.*]] = arith.mulf %[[LHS]], %[[RHS]] : f32
-// CHECK: scf.reduce.return %[[R]] : f32
-// CHECK: }
-// CHECK: return %[[RES]]#0, %[[RES]]#1 : f32, f32
- %c2 = arith.constant 2 : index
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %init1 = arith.constant 1.0 : f32
- %init2 = arith.constant 2.0 : f32
- %res1 = scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) init(%init1) -> f32 {
- %A_elem = memref.load %A[%i, %j] : memref<2x2xf32>
- scf.reduce(%A_elem : f32) {
- ^bb0(%lhs: f32, %rhs: f32):
- %1 = arith.addf %lhs, %rhs : f32
- scf.reduce.return %1 : f32
- }
- }
- %res2 = scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) init(%init2) -> f32 {
- %B_elem = memref.load %B[%i, %j] : memref<2x2xf32>
- scf.reduce(%B_elem : f32) {
- ^bb0(%lhs: f32, %rhs: f32):
- %1 = arith.mulf %lhs, %rhs : f32
- scf.reduce.return %1 : f32
- }
- }
- return %res1, %res2 : f32, f32
-}
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["scf.parallel"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- %parallel:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %fused = transform.loop.fuse_sibling %parallel#0 into %parallel#1 : (!transform.any_op,!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
// CHECK: func.func @fuse_2nd_for_into_1st([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @fuse_2nd_for_into_1st(%A: tensor<128xf32>, %B: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) {
// CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
@@ -371,62 +208,6 @@ module attributes {transform.with_named_sequence} {
}
}
-
-// -----
-
-// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 32)
-#map = affine_map<(d0) -> (d0 * 32)>
-#map1 = affine_map<(d0, d1) -> (d0, d1)>
-module {
- // CHECK: func.func @loop_sibling_fusion(%[[ARG0:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %[[ARG2:.*]]: {{.*}}, %[[ARG3:.*]]: {{.*}}
- func.func @loop_sibling_fusion(%arg0: tensor<128xf32>, %arg1: tensor<128x128xf16>, %arg2: tensor<128x64xf32>, %arg3: tensor<128x128xf32>) -> (tensor<128xf32>, tensor<128x128xf16>) {
- // CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<128x128xf16>
- // CHECK-NEXT: %[[RESULTS:.*]]:2 = scf.forall (%[[I:.*]]) in (4) shared_outs(%[[S1:.*]] = %[[ARG0]], %[[S2:.*]] = %[[ARG1]]) -> (tensor<128xf32>, tensor<128x128xf16>) {
- // CHECK-NEXT: %[[IDX:.*]] = affine.apply #[[$MAP]](%[[I]])
- // CHECK-NEXT: %[[SLICE0:.*]] = tensor.extract_slice %[[ARG3]][%[[IDX]], 0] [32, 1] [1, 1] : tensor<128x128xf32> to tensor<32xf32>
- // CHECK-NEXT: %[[SLICE1:.*]] = tensor.extract_slice %[[ARG3]][%[[IDX]], 0] [32, 128] [1, 1] : tensor<128x128xf32> to tensor<32x128xf32>
- // CHECK-NEXT: %[[SLICE2:.*]] = tensor.extract_slice %[[EMPTY]][%[[IDX]], 0] [32, 128] [1, 1] : tensor<128x128xf16> to tensor<32x128xf16>
- // CHECK-NEXT: %[[GENERIC:.*]] = linalg.generic {{.*}} ins(%[[SLICE1]] : {{.*}}) outs(%[[SLICE2]] : {{.*}})
- // CHECK: scf.forall.in_parallel {
- // CHECK-NEXT: tensor.parallel_insert_slice %[[SLICE0]] into %[[S1]][%[[IDX]]] [32] [1] : tensor<32xf32> into tensor<128xf32>
- // CHECK-NEXT: tensor.parallel_insert_slice %[[GENERIC]] into %[[S2]][%[[IDX]], 0] [32, 128] [1, 1] : tensor<32x128xf16> into tensor<128x128xf16>
- // CHECK-NEXT: }
- // CHECK-NEXT: } {mapping = [#gpu.warp<linear_dim_0>]}
- // CHECK-NEXT: return %[[RESULTS]]#0, %[[RESULTS]]#1
- %0 = scf.forall (%arg4) in (4) shared_outs(%arg5 = %arg0) -> (tensor<128xf32>) {
- %3 = affine.apply #map(%arg4)
- %extracted_slice = tensor.extract_slice %arg3[%3, 0] [32, 1] [1, 1] : tensor<128x128xf32> to tensor<32xf32>
- scf.forall.in_parallel {
- tensor.parallel_insert_slice %extracted_slice into %arg5[%3] [32] [1] : tensor<32xf32> into tensor<128xf32>
- }
- } {mapping = [#gpu.warp<linear_dim_0>]}
- %1 = tensor.empty() : tensor<128x128xf16>
- %2 = scf.forall (%arg4) in (4) shared_outs(%arg5 = %arg1) -> (tensor<128x128xf16>) {
- %3 = affine.apply #map(%arg4)
- %extracted_slice = tensor.extract_slice %arg3[%3, 0] [32, 128] [1, 1] : tensor<128x128xf32> to tensor<32x128xf32>
- %extracted_slice_0 = tensor.extract_slice %1[%3, 0] [32, 128] [1, 1] : tensor<128x128xf16> to tensor<32x128xf16>
- %4 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<32x128xf32>) outs(%extracted_slice_0 : tensor<32x128xf16>) {
- ^bb0(%in: f32, %out: f16):
- %5 = arith.truncf %in : f32 to f16
- linalg.yield %5 : f16
- } -> tensor<32x128xf16>
- scf.forall.in_parallel {
- tensor.parallel_insert_slice %4 into %arg5[%3, 0] [32, 128] [1, 1] : tensor<32x128xf16> into tensor<128x128xf16>
- }
- } {mapping = [#gpu.warp<linear_dim_0>]}
- return %0, %2 : tensor<128xf32>, tensor<128x128xf16>
- }
-}
-
-module attributes { transform.with_named_sequence } {
- transform.named_sequence @__transform_main(%root: !transform.any_op) {
- %loops = transform.structured.match ops{["scf.forall"]} in %root : (!transform.any_op) -> !transform.any_op
- %loop1, %loop2 = transform.split_handle %loops : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %loop3 = transform.loop.fuse_sibling %loop1 into %loop2 : (!transform.any_op, !transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
// -----
func.func @source_for_uses_result_of_target_for_err(%A: tensor<128xf32>, %B: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) {
@@ -501,9 +282,8 @@ func.func @target_for_region_uses_result_of_source_for_err(%A: tensor<128xf32>,
%6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
scf.yield %6 : tensor<128xf32>
}
- // expected-error @below {{values used inside regions of target should be properly dominated by source}}
%dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %B) -> (tensor<128xf32>) {
- // expected-note @below {{see operation}}
+ // expected-error @below {{values used inside regions of target should be properly dominated by source}}
%dup2 = vector.transfer_read %1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
%dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
%dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
@@ -548,74 +328,6 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-
-// -----
-
-func.func @non_matching_iteration_spaces_err(%A: memref<2x2xf32>, %B: memref<2x2xf32>) {
- %c2 = arith.constant 2 : index
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c1fp = arith.constant 1.0 : f32
- %sum = memref.alloc() : memref<2x2xf32>
- // expected-error @below {{target and source iteration spaces must be equal}}
- scf.parallel (%i) = (%c0) to (%c2) step (%c1) {
- %B_elem = memref.load %B[%i, %c0] : memref<2x2xf32>
- %sum_elem = arith.addf %B_elem, %c1fp : f32
- memref.store %sum_elem, %sum[%i, %c0] : memref<2x2xf32>
- scf.reduce
- }
- scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
- %sum_elem = memref.load %sum[%i, %j] : memref<2x2xf32>
- %A_elem = memref.load %A[%i, %j] : memref<2x2xf32>
- %product_elem = arith.mulf %sum_elem, %A_elem : f32
- memref.store %product_elem, %B[%i, %j] : memref<2x2xf32>
- scf.reduce
- }
- memref.dealloc %sum : memref<2x2xf32>
- return
-}
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["scf.parallel"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- %parallel:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %fused = transform.loop.fuse_sibling %parallel#0 into %parallel#1 : (!transform.any_op,!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
-// -----
-
-func.func @non_matching_loop_types_err(%A: memref<2xf32>, %B: memref<2xf32>) {
- %c2 = arith.constant 2 : index
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c1fp = arith.constant 1.0 : f32
- %sum = memref.alloc() : memref<2xf32>
- // expected-error @below {{target and source must be same loop type}}
- scf.for %i = %c0 to %c2 step %c1 {
- %B_elem = memref.load %B[%i] : memref<2xf32>
- %sum_elem = arith.addf %B_elem, %c1fp : f32
- memref.store %sum_elem, %sum[%i] : memref<2xf32>
- }
- scf.parallel (%i) = (%c0) to (%c2) step (%c1) {
- %sum_elem = memref.load %sum[%i] : memref<2xf32>
- %A_elem = memref.load %A[%i] : memref<2xf32>
- %product_elem = arith.mulf %sum_elem, %A_elem : f32
- memref.store %product_elem, %B[%i] : memref<2xf32>
- scf.reduce
- }
- memref.dealloc %sum : memref<2xf32>
- return
-}
-module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["scf.parallel"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- %fused = transform.loop.fuse_sibling %0 into %1 : (!transform.any_op,!transform.any_op) -> !transform.any_op
- transform.yield
- }
-}
-
// -----
// CHECK: func.func @foreach_loop_pair_fuse([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index fc5651f..1a674d7 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -2719,3 +2719,13 @@ func.func @from_elements_to_splat(%a: f32, %b: f32) -> (vector<2x3xf32>, vector<
// CHECK: return %[[splat]], %[[from_el]], %[[splat2]]
return %0, %1, %2 : vector<2x3xf32>, vector<2x3xf32>, vector<f32>
}
+
+// -----
+
+// CHECK-LABEL: @fold_vector_step_to_constant
+// CHECK: %[[CONSTANT:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK: return %[[CONSTANT]] : vector<4xindex>
+func.func @fold_vector_step_to_constant() -> vector<4xindex> {
+ %0 = vector.step : vector<4xindex>
+ return %0 : vector<4xindex>
+}
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index d0eaed8f..db169a6 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -1871,3 +1871,19 @@ func.func @invalid_from_elements(%a: f32, %b: i32) {
vector.from_elements %a, %b : vector<2xf32>
return
}
+
+// -----
+
+func.func @invalid_step_0d() {
+ // expected-error @+1 {{'vector.step' op result #0 must be vector of index values of ranks 1, but got 'vector<f32>'}}
+ vector.step : vector<f32>
+ return
+}
+
+// -----
+
+func.func @invalid_step_2d() {
+ // expected-error @+1 {{'vector.step' op result #0 must be vector of index values of ranks 1, but got 'vector<2x4xf32>'}}
+ vector.step : vector<2x4xf32>
+ return
+}
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index 4da0958..531e2db 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -1171,4 +1171,13 @@ func.func @from_elements(%a: f32, %b: f32) -> (vector<f32>, vector<1xf32>, vecto
// CHECK: vector.from_elements %[[b]], %[[b]], %[[a]], %[[a]] : vector<2x2xf32>
%3 = vector.from_elements %b, %b, %a, %a : vector<2x2xf32>
return %0, %1, %2, %3 : vector<f32>, vector<1xf32>, vector<1x2xf32>, vector<2x2xf32>
-} \ No newline at end of file
+}
+
+// CHECK-LABEL: @step
+func.func @step() {
+ // CHECK: vector.step : vector<2xindex>
+ %0 = vector.step : vector<2xindex>
+ // CHECK: vector.step : vector<[4]xindex>
+ %1 = vector.step : vector<[4]xindex>
+ return
+}
diff --git a/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py b/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py
index 0dceba9..ee76b6d 100644
--- a/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py
+++ b/mlir/test/python/dialects/linalg/opdsl/test_core_named_ops.py
@@ -2,3 +2,7 @@
# Just verify that at least one known op is generated.
# CHECK: name: matmul
+
+# Verify some special cases: negf->NegFOp, powf->PowFOp.
+# CHECK: cpp_class_name: NegFOp
+# CHECK: cpp_class_name: PowFOp